import requests
from bs4 import BeautifulSoup
import time
import random
import json
import re
from datetime import datetime
import os
from urllib.parse import quote
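
# The crawler works in two steps: (1) look up an official account through Sogou's WeChat
# search (weixin.sogou.com), then (2) scrape the recent-article list from the account's
# profile page. The CSS selectors and URL patterns used below reflect a historical page
# layout; Sogou and WeChat change their markup and anti-bot measures regularly, so the
# selectors may need updating, and heavy use can run into CAPTCHA pages that this script
# does not handle.
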
class WechatArticleCrawler:
    def __init__(self):
        """Initialize the crawler: request headers, Sogou base URL, and a local cache directory."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
        }
        self.sogou_base_url = 'https://weixin.sogou.com/weixin'
        self.cache_dir = 'wechat_cache'
        os.makedirs(self.cache_dir, exist_ok=True)

    def search_official_account(self, account_name):
        """
        Search for an official account via Sogou WeChat search.
        :param account_name: name of the official account
        :return: dict with the account's name, unique ID, and profile URL, or None if not found
        """
        search_params = {
            'type': 1,  # type 1 searches official accounts; type 2 searches articles
            'query': account_name,
            's_from': 'input',
            '_sug_': 'n',
            '_sug_type_': ''
        }
        # Reuse cached search results for up to 24 hours to keep requests to Sogou to a minimum.
        cache_file = os.path.join(self.cache_dir, f"search_{quote(account_name)}.json")
        if os.path.exists(cache_file) and time.time() - os.path.getmtime(cache_file) < 86400:
            print(f"Using cached account search result: {account_name}")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        try:
            response = requests.get(self.sogou_base_url, params=search_params, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            account_items = soup.select('.news-box .news-list li')
            if not account_items:
                print(f"Official account not found: {account_name}")
                return None
            # Take the first (best-matching) search result.
            first_item = account_items[0]
            profile_url = first_item.select_one('.txt-box h3 a')['href']
            # Sogou usually returns site-relative links; resolve them against the Sogou domain.
            if profile_url.startswith('/'):
                profile_url = 'https://weixin.sogou.com' + profile_url
            account_info = {
                'account_name': first_item.select_one('.txt-box h3 a').text.strip(),
                'account_id': first_item.select_one('.account').text.strip(),
                'profile_url': profile_url
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(account_info, f, ensure_ascii=False, indent=2)
            # Random delay to reduce the chance of triggering Sogou's anti-bot checks.
            time.sleep(random.uniform(1, 3))
            return account_info
        except Exception as e:
            print(f"Error while searching for the account: {e}")
            return None
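
    # Usage sketch for the search step on its own (account name taken from main() below):
    #   crawler = WechatArticleCrawler()
    #   info = crawler.search_official_account('广州公交公安')
    #   if info:
    #       print(info['account_id'], info['profile_url'])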

    def get_articles(self, account_name, max_articles=10):
        """
        Fetch the article list for an official account.
        :param account_name: name of the official account
        :param max_articles: maximum number of articles to return
        :return: list of article info dicts
        """
        account_info = self.search_official_account(account_name)
        if not account_info:
            return []
        print(f"Found account: {account_info['account_name']} ({account_info['account_id']})")
        # Article lists change more often than account info, so cache them for only 1 hour.
        cache_file = os.path.join(self.cache_dir, f"articles_{quote(account_info['account_id'])}.json")
        if os.path.exists(cache_file) and time.time() - os.path.getmtime(cache_file) < 3600:
            print(f"Using cached article list: {account_name}")
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached_articles = json.load(f)
            return cached_articles[:max_articles]
        try:
            response = requests.get(account_info['profile_url'], headers=self.headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            articles = []
            article_items = soup.select('.weui_media_box.weui_media_appmsg')
            for item in article_items[:max_articles]:
                try:
                    title = item.select_one('.weui_media_title').text.strip()
                    link = item.select_one('.weui_media_title')['href']
                    # Article links are often protocol-relative ("//mp.weixin.qq.com/...").
                    if not link.startswith('http'):
                        link = 'https:' + link
                    # Assumes the extra-info element exposes a Unix timestamp in its 'datetime'
                    # attribute; items that do not match this markup are skipped by the except below.
                    timestamp = item.select_one('.weui_media_extra_info')['datetime']
                    publish_date = datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d')
                    digest = item.select_one('.weui_media_desc').text.strip()
                    articles.append({
                        'title': title,
                        'publish_date': publish_date,
                        'link': link,
                        'digest': digest
                    })
                except Exception as e:
                    print(f"Error while parsing an article item: {e}")
                    continue
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)
            time.sleep(random.uniform(2, 5))
            return articles
        except Exception as e:
            print(f"Error while fetching the article list: {e}")
            return []
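
    # Each returned/cached article entry has the shape built above:
    #   {"title": "...", "publish_date": "YYYY-MM-DD", "link": "https://...", "digest": "..."}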


def main():
    """Run the crawler and print/save the results."""
    crawler = WechatArticleCrawler()
    official_accounts = ['广州公交公安']
    all_articles = {}
    for account in official_accounts:
        print(f"\nCrawling official account: {account}")
        articles = crawler.get_articles(account, max_articles=5)
        all_articles[account] = articles
        print(f"Latest articles from {account}:")
        for i, article in enumerate(articles, 1):
            print(f"{i}. {article['publish_date']} - {article['title']}")
            print(f"   Link: {article['link']}")
            print(f"   Digest: {article['digest'][:50]}...")
            print()
    result_file = os.path.join(crawler.cache_dir, f"all_articles_{datetime.now().strftime('%Y%m%d')}.json")
    with open(result_file, 'w', encoding='utf-8') as f:
        json.dump(all_articles, f, ensure_ascii=False, indent=2)
    print(f"\nAll articles saved to: {result_file}")

if __name__ == "__main__":
    main()