import requests
from bs4 import BeautifulSoup
import os
# Scrape a single WeChat article: print and save its text, then download
# every image referenced inside the article body.
url = 'https://mp.weixin.qq.com/s/ynThL_y4Xi8U8LNtZrZ41Q'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# Seconds to wait for a server before giving up. requests has no default
# timeout, so without this a stalled connection blocks the script forever.
REQUEST_TIMEOUT = 10

# Title used when the page has no <h1> or the <h1> is empty.
DEFAULT_TITLE = '未命名标题'

# Characters that are invalid (or act as path separators) in file names on
# common filesystems; each one is replaced with an underscore.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: '_' for ch in '\\/:*?"<>|\r\n\t'}
)


def _safe_filename(name):
    """Return *name* made usable as a single file or directory name.

    Unsafe characters are replaced with ``_`` and leading/trailing spaces
    and dots are dropped. Falls back to ``DEFAULT_TITLE`` when nothing
    usable remains.
    """
    cleaned = name.translate(_UNSAFE_FILENAME_CHARS).strip(' .')
    return cleaned or DEFAULT_TITLE


response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1')
    # An <h1> with no text would otherwise yield an empty title.
    title = (title_tag.get_text(strip=True) if title_tag else '') or DEFAULT_TITLE
    print(f'标题: {title}\n')

    # The raw title is shown to the user; the sanitized one is used on disk.
    safe_title = _safe_filename(title)

    content_div = soup.find('div', id='js_content')
    if content_div:
        # NOTE(review): find_all matches nested <p>/<section> elements as
        # well as their ancestors, so text inside nested blocks may appear
        # more than once in the output -- confirm whether that is intended.
        paragraphs = content_div.find_all(['p', 'section'])
        article_text = ''.join(
            block.get_text(strip=True) + '\n' for block in paragraphs
        )
        print('正文内容:\n')
        print(article_text)
        with open(f'{safe_title}.txt', 'w', encoding='utf-8') as f:
            f.write(article_text)

    img_tags = content_div.find_all('img') if content_div else []
    # The image address is read from the 'data-src' attribute; tags without
    # it (or with an empty value) are skipped.
    candidate_urls = (img.get('data-src') for img in img_tags)
    img_urls = [img_url for img_url in candidate_urls if img_url]

    if img_urls:
        img_folder = safe_title + '_images'
        os.makedirs(img_folder, exist_ok=True)
        saved_count = 0
        # File numbers follow the image's position in the article, so a
        # failed download leaves a gap instead of renumbering later images.
        for idx, img_url in enumerate(img_urls, start=1):
            try:
                img_response = requests.get(
                    img_url, headers=headers, timeout=REQUEST_TIMEOUT
                )
                # Do not save an HTTP error page as if it were an image.
                img_response.raise_for_status()
            except requests.RequestException as exc:
                # One bad image should not abort the remaining downloads.
                print(f'图片下载失败 ({img_url}): {exc}')
                continue
            # NOTE(review): every image is saved with a .jpg extension
            # regardless of its actual format.
            img_path = os.path.join(img_folder, f'image_{idx}.jpg')
            with open(img_path, 'wb') as img_file:
                img_file.write(img_response.content)
            saved_count += 1
        print(f'共下载了 {saved_count} 张图片,保存在文件夹: {img_folder}')
else:
    print(f'请求失败,状态码: {response.status_code}')