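# Crawl the latest articles of WeChat official accounts via Sogou WeChat search
# (https://weixin.sogou.com), with simple file-based caching and random delays
# between requests. Dependencies: requests and beautifulsoup4.
# Note: Sogou applies anti-crawling measures, so frequent requests may be redirected
# to a verification page; the delays and caches below only reduce that risk.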

import requests
from bs4 import BeautifulSoup
import time
import random
import json
import re
from datetime import datetime
import os
from urllib.parse import quote

class WechatArticleCrawler:
    def __init__(self):
        """初始化爬虫,设置请求头和基础URL"""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
        }
        self.sogou_base_url = 'https://weixin.sogou.com/weixin'
        self.cache_dir = 'wechat_cache'
        os.makedirs(self.cache_dir, exist_ok=True)
    
    def search_official_account(self, account_name):
        """
        通过搜狗微信搜索公众号
        :param account_name: 公众号名称
        :return: 公众号唯一ID和名称
        """
        search_params = {
            'type': 1,
            'query': account_name,
            's_from': 'input',
            '_sug_': 'n',
            '_sug_type_': ''
        }
        
        # Check for a cached search result (valid for 24 hours)
        cache_file = os.path.join(self.cache_dir, f"search_{quote(account_name)}.json")
        if os.path.exists(cache_file) and time.time() - os.path.getmtime(cache_file) < 86400:
            print(f"Using cached account search result: {account_name}")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        
        try:
            response = requests.get(self.sogou_base_url, params=search_params, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Locate official-account entries in the results page
            account_items = soup.select('.news-box .news-list li')
            if not account_items:
                print(f"Official account not found: {account_name}")
                return None
            
            # Take the first search result
            first_item = account_items[0]
            profile_url = first_item.select_one('.txt-box h3 a')['href']
            # Sogou typically returns a relative redirect link; prepend the domain if needed
            if not profile_url.startswith('http'):
                profile_url = 'https://weixin.sogou.com' + profile_url
            account_info = {
                'account_name': first_item.select_one('.txt-box h3 a').text.strip(),
                'account_id': first_item.select_one('.account').text.strip(),
                'profile_url': profile_url
            }
            
            # Cache the result
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(account_info, f, ensure_ascii=False, indent=2)
                
            # Random delay to avoid sending requests too quickly
            time.sleep(random.uniform(1, 3))
            return account_info
            
        except Exception as e:
            print(f"搜索公众号时出错: {e}")
            return None
    
    def get_articles(self, account_name, max_articles=10):
        """
        获取公众号的文章列表
        :param account_name: 公众号名称
        :param max_articles: 最大获取文章数
        :return: 文章信息列表
        """
        # Look up the official account first
        account_info = self.search_official_account(account_name)
        if not account_info:
            return []
            
        print(f"找到公众号: {account_info['account_name']} ({account_info['account_id']})")
        
        # Check for a cached article list (valid for 1 hour)
        cache_file = os.path.join(self.cache_dir, f"articles_{quote(account_info['account_id'])}.json")
        if os.path.exists(cache_file) and time.time() - os.path.getmtime(cache_file) < 3600:
            print(f"Using cached article list: {account_name}")
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached_articles = json.load(f)
                return cached_articles[:max_articles]
        
        try:
            # Fetch the account's article-list page
            response = requests.get(account_info['profile_url'], headers=self.headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract article info from each list item
            articles = []
            article_items = soup.select('.weui_media_box.weui_media_appmsg')
            
            for item in article_items[:max_articles]:
                try:
                    title = item.select_one('.weui_media_title').text.strip()
                    link = item.select_one('.weui_media_title')['href']
                    # Article links are usually protocol-relative ('//mp.weixin.qq.com/...')
                    if not link.startswith('http'):
                        link = 'https:' + link
                    
                    # Extract the publish timestamp attribute
                    timestamp = item.select_one('.weui_media_extra_info')['datetime']
                    # Convert the timestamp to a date string
                    publish_date = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d')
                    
                    # Extract the article digest
                    digest = item.select_one('.weui_media_desc').text.strip()
                    
                    articles.append({
                        'title': title,
                        'publish_date': publish_date,
                        'link': link,
                        'digest': digest
                    })
                except Exception as e:
                    print(f"解析文章时出错: {e}")
                    continue
            
            # Cache the article list
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)
                
            # Random delay
            time.sleep(random.uniform(2, 5))
            
            return articles
            
        except Exception as e:
            print(f"获取文章列表时出错: {e}")
            return []
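
    # Optional helper (not part of the original script): Sogou redirects suspected bots
    # to an anti-spider verification page. The 'antispider' URL marker below is an
    # assumption based on commonly observed behaviour and may need adjusting; it can be
    # checked after each requests.get() to back off early instead of parsing a bad page.
    def _looks_blocked(self, response):
        """Return True if the response appears to be Sogou's anti-crawler verification page."""
        return 'antispider' in response.url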

def main():
    """主函数,运行爬虫并输出结果"""
    crawler = WechatArticleCrawler()
    
    # List of official accounts to crawl
    official_accounts = ['广州公交公安']
    
    all_articles = {}
    
    for account in official_accounts:
        print(f"\n正在抓取公众号: {account}")
        articles = crawler.get_articles(account, max_articles=5)
        all_articles[account] = articles
        
        # Print the results
        print(f"Latest articles from {account}:")
        for i, article in enumerate(articles, 1):
            print(f"{i}. {article['publish_date']} - {article['title']}")
            print(f"   链接: {article['link']}")
            print(f"   摘要: {article['digest'][:50]}...")
            print()
    
    # Save all results to a single file
    result_file = os.path.join(crawler.cache_dir, f"all_articles_{datetime.now().strftime('%Y%m%d')}.json")
    with open(result_file, 'w', encoding='utf-8') as f:
        json.dump(all_articles, f, ensure_ascii=False, indent=2)
    
    print(f"\n所有文章已保存到: {result_file}")

if __name__ == "__main__":
    main()