# 编辑代码  (stray "Edit code" UI label left over from a copy-paste; commented out so the script can run)

import requests
from bs4 import BeautifulSoup
import os

# URL of the WeChat official-account article to scrape
# (it must have the form https://mp.weixin.qq.com/s/...).
url = 'https://mp.weixin.qq.com/s/ynThL_y4Xi8U8LNtZrZ41Q'

# Request headers that make the script look like a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# Seconds to wait for a server response; without a timeout requests.get()
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 15

# Title used when the page has no usable <h1>.
DEFAULT_TITLE = '未命名标题'

# Characters that must not appear in a file name (the Windows reserved set,
# which also covers the POSIX path separator).
FORBIDDEN_FILENAME_CHARS = '\\/:*?"<>|'

# Tags treated as text blocks inside the article body.
BLOCK_TAGS = ['p', 'section']


def sanitize_filename(name, fallback=DEFAULT_TITLE):
    """Return *name* made safe for use as a file or directory name.

    Reserved characters and ASCII control characters are replaced by ``_``;
    leading/trailing spaces and dots are removed. If nothing usable remains,
    *fallback* is returned instead.
    """
    cleaned = ''.join(
        '_' if ch in FORBIDDEN_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in name
    ).strip(' .')
    return cleaned or fallback


def extract_text(content_div):
    """Return the article text, one non-empty block per line.

    *content_div* is the parsed ``div#js_content`` element. Every ``<p>`` and
    ``<section>`` contributes only the text that belongs to it directly, i.e.
    text whose nearest block ancestor is that element. Calling ``get_text()``
    on each match instead would repeat the text of nested blocks once per
    enclosing wrapper.
    """
    lines = []
    for block in content_div.find_all(BLOCK_TAGS):
        own_text = ''.join(
            piece.strip()
            for piece in block.strings
            if piece.find_parent(BLOCK_TAGS) is block
        )
        if own_text:
            lines.append(own_text)
    return ''.join(line + '\n' for line in lines)


def collect_image_urls(content_div):
    """Return the non-empty ``data-src`` values of all ``<img>`` tags.

    The ``data-src`` attribute (rather than ``src``) is read, as in the
    original script.
    """
    candidates = (img.get('data-src') for img in content_div.find_all('img'))
    return [src for src in candidates if src]


def download_images(img_urls, img_folder):
    """Download every URL in *img_urls* into *img_folder*.

    Files are named ``image_<n>.jpg`` where ``n`` is the 1-based position of
    the URL in *img_urls* (the ``.jpg`` suffix is used regardless of the
    actual image format). A failed download is reported and skipped so one
    bad image does not abort the rest.

    Returns the number of images actually written to disk.
    """
    os.makedirs(img_folder, exist_ok=True)
    saved = 0
    for idx, img_url in enumerate(img_urls, start=1):
        try:
            img_response = requests.get(
                img_url, headers=headers, timeout=REQUEST_TIMEOUT
            )
            # Do not save an HTML error page as if it were an image.
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print(f'图片下载失败 ({img_url}): {exc}')
            continue
        img_path = os.path.join(img_folder, f'image_{idx}.jpg')
        with open(img_path, 'wb') as img_file:
            img_file.write(img_response.content)
        saved += 1
    return saved


def main():
    """Fetch the article at ``url``, save its text and download its images.

    The text goes to ``<title>.txt`` and the images to ``<title>_images/``,
    both relative to the current working directory, where ``<title>`` is the
    sanitized article title.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'请求失败: {exc}')
        return

    if response.status_code != 200:
        print(f'请求失败,状态码: {response.status_code}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Article title: the first <h1> on the page.
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else DEFAULT_TITLE
    print(f'标题: {title}\n')
    # The raw title may contain characters that are illegal in paths.
    file_stem = sanitize_filename(title)

    # Article body container.
    content_div = soup.find('div', id='js_content')
    if content_div is None:
        # Tell the user why nothing was saved instead of exiting silently.
        print('未找到正文内容 (id="js_content")')
        return

    article_text = extract_text(content_div)
    print('正文内容:\n')
    print(article_text)

    # Save the body text to a local txt file.
    with open(f'{file_stem}.txt', 'w', encoding='utf-8') as f:
        f.write(article_text)

    # Download the article's images, if any.
    img_urls = collect_image_urls(content_div)
    if img_urls:
        img_folder = file_stem + '_images'
        saved = download_images(img_urls, img_folder)
        print(f'共下载了 {saved} 张图片,保存在文件夹: {img_folder}')


if __name__ == '__main__':
    main()