diff --git a/BoLiBei.py b/BoLiBei.py
index bae38cf..5994be3 100644
--- a/BoLiBei.py
+++ b/BoLiBei.py
@@ -1,71 +1,152 @@
-#!usr/bin/env python
-# -*-coding:utf-8 -*-
-__author__='WYY'
-__date__='2017.03.24'
-
-# Mini project: crawl the SCU-info glass-cup incident and extract the 100 hottest replies
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 import requests
 import json
 import re
 import time
+from requests.exceptions import RequestException, ConnectionError, Timeout
+
+__author__ = 'WYY'
+__date__ = '2024.10.30'  # date of this refactor
+
+
+class SCUInfoSpider:
+    """Spider that collects hot comments on the SCU-info glass-cup incident."""
+    # Keyword regex (matches 玻璃 "glass", 杯 "cup", 摔 "smash", 观光 "sightseeing")
+    KEYWORD_PATTERN = re.compile(u'\u73bb\u7483|\u676f|\u6454|\u89c2\u5149', re.S)
+    # Request headers (mimic a browser to avoid basic anti-scraping checks)
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
+        "Accept": "application/json, text/javascript, */*; q=0.01"
+    }
+    # Base API address
+    BASE_URL = "http://www.scuinfo.com/api"
 
-class Spider():
-    # Initialise and record the collection time
     def __init__(self):
-        self.time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        print u'\n',u'Starting collection',u'\nLocal time:',self.time
-
-    # Fetch data
-    def getData(self,url):
-        html=requests.get(url).text
-        requests.adapters.DEFAULT_RETRIES=5
-        result=json.loads(html)
-        data=result['data']
-        return data
-
-    # Get the id of the newest post
-    def getNew(self):
-        data=self.getData(url='http://www.scuinfo.com/api/posts?pageSize=15')
-        New=data[0]['id']
-        return New
-
-    # Extract the useful fields into a dict; collect the dicts in a list
-    def getDetail(self):
-        New=self.getNew()
-        container=[]
-        i=1
-        for id in range(131599,New+1):
-            content={}
-            self.url='http://www.scuinfo.com/api/post?id='+str(id)
-            data=self.getData(url=self.url)
-            if not isinstance(data,list):
-                body=data.values()[7]
-                likeCount=data.values()[6]
-                comment=data.values()[0]
-                # Keywords: 玻璃 (glass), 杯 (cup), 摔 (smash), 观光 (sightseeing)
-                pattern=re.compile(u'\u73bb\u7483|\u676f|\u6454|\u89c2\u5149',re.S)
-                items=re.search(pattern,body)
-                if items:
-                    content['body']=body
-                    content['like']=likeCount
-                    content['comment']=comment
-                    print u'\n', i, u'\n', u'Post:', body, u'\n', u'Likes:', likeCount, u'', u'Comments:', comment
-                    time.sleep(0.01)
-                    i += 1
-                    container.append(content)
-            else:
-                print 'None'
-        print u'\n\n', u'As of', self.time, u'there are', i-1, u'posts on info about the glass-cup incident'
+        self.start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        self.session = requests.Session()  # reuse one session for connection pooling
+        self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries=3))  # connection-level retries
+        print(f"\nStarting collection\nLocal time: {self.start_time}")
+
+    def fetch_data(self, url, max_retries=2):
+        """
+        Request a URL and return the parsed payload, with retries.
+        :param url: request URL
+        :param max_retries: maximum number of retries
+        :return: parsed JSON payload or None
+        """
+        for retry in range(max_retries + 1):
+            try:
+                response = self.session.get(
+                    url,
+                    headers=self.HEADERS,
+                    timeout=10  # request timeout in seconds
+                )
+                response.raise_for_status()  # raise on HTTP error status codes
+                return json.loads(response.text).get('data')  # the API wraps payloads in a 'data' field
+            except (ConnectionError, Timeout):
+                if retry < max_retries:
+                    print(f"Network error, retry {retry + 1}...")
+                    time.sleep(1)
+                    continue
+                print(f"Request to {url} failed: connection timed out")
+            except RequestException as e:
+                print(f"Request to {url} failed: {e}")
+            except json.JSONDecodeError:
+                print(f"Failed to parse the JSON returned by {url}")
+        return None
+
+    def get_latest_post_id(self):
+        """Return the id of the newest post."""
+        url = f"{self.BASE_URL}/posts?pageSize=15"
+        data = self.fetch_data(url)
+        if isinstance(data, list) and data:
+            return data[0].get('id')
+        raise ValueError("Could not fetch the latest post id; check that the API is reachable")
+
+    def collect_relevant_posts(self, start_id=131599):
+        """
+        Collect posts whose body matches the keywords.
+        :param start_id: first post id to scan
+        :return: list of matching posts
+        """
+        try:
+            latest_id = self.get_latest_post_id()
+        except ValueError as e:
+            print(f"Initialisation failed: {e}")
+            return []
+
+        container = []
+        total_scanned = 0  # number of posts scanned so far
+        print(f"Scanning posts (id range: {start_id}-{latest_id})...")
+
+        for post_id in range(start_id, latest_id + 1):
+            total_scanned += 1
+            url = f"{self.BASE_URL}/post?id={post_id}"
+            data = self.fetch_data(url)
+
+            # Skip deleted/invalid posts (the API returns a list for missing ids)
+            if not data or isinstance(data, list):
+                if total_scanned % 50 == 0:  # print progress every 50 posts
+                    print(f"Scanned {total_scanned} posts, current id: {post_id}")
+                continue
+
+            # Extract fields by name; get() avoids KeyError (the old code indexed dict.values())
+            body = data.get('body', '')
+            like_count = data.get('likeCount', 0)
+            comment_count = data.get('commentCount', 0)  # likely the field the old code read positionally
+
+            # Match keywords
+            if self.KEYWORD_PATTERN.search(body):
+                container.append({
+                    'body': body,
+                    'like': like_count,
+                    'comment': comment_count
+                })
+                print(f"\nMatch #{len(container)}"
+                      f"\nBody: {body[:50]}..."  # first 50 chars to keep output short
+                      f"\nLikes: {like_count} | Comments: {comment_count}")
+
+            # Throttle requests to spare the server
+            time.sleep(0.1)  # the old 0.01s interval was too aggressive
+
+        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        print(f"\nScan finished ({start_id}-{latest_id})"
+              f"\nPosts scanned: {total_scanned}"
+              f"\nPosts matching keywords: {len(container)}"
+              f"\nFinished at: {end_time}")
         return container
 
-    # Count the comments
-    # Sort posts by likes in descending order and take the top 100 hot replies
-    def getSort(self):
-        container=self.getDetail()
-        print u'\n',u'The 100 most popular posts are printed below:'
-        container.sort(key=lambda k:k.get('comment',0))
-        container.sort(key=lambda k:k.get('like',0),reverse=True)
-        for index,r in enumerate(container):
-            print u'\n\nRank:',index+1, u'\nPost:',r['body'],u'\nLikes:' ,r['like'],u'Comments:',r['comment']
-spider=Spider()
-spider.getSort()
+    def get_top_comments(self, top_n=100):
+        """
+        Return the top-N posts ranked by likes.
+        :param top_n: number of posts to return
+        :return: sorted list of posts
+        """
+        posts = self.collect_relevant_posts()
+        if not posts:
+            print("No matching posts found")
+            return []
+
+        # Likes descending first, comment count ascending as the tie-breaker
+        sorted_posts = sorted(
+            posts,
+            key=lambda x: (-x['like'], x['comment'])
+        )
+        # Keep the top N
+        top_posts = sorted_posts[:top_n]
+
+        print(f"\nTop {len(top_posts)} posts by popularity:")
+        for idx, post in enumerate(top_posts, 1):
+            print(f"\nRank: {idx}"
+                  f"\nBody: {post['body']}"
+                  f"\nLikes: {post['like']} | Comments: {post['comment']}")
+        return top_posts
+
+
+if __name__ == "__main__":
+    try:
+        spider = SCUInfoSpider()
+        spider.get_top_comments(top_n=100)
+    except Exception as e:
+        print(f"Program error: {e}")
\ No newline at end of file
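
Review note: the ranking logic changed shape in this refactor. The old getSort ran two stable
sorts (comment count ascending, then likes descending), which makes likes the primary key and
comment count the tie-breaker; get_top_comments expresses the same ordering as a single
composite key. The snippet below is a minimal standalone check that the two orderings agree;
the sample posts are made-up data, not real API responses:

    # ranking_check.py: sanity-check that the composite sort key reproduces
    # the old two-pass stable sort. `sample` is hypothetical data.
    sample = [
        {'body': 'a', 'like': 5, 'comment': 2},
        {'body': 'b', 'like': 5, 'comment': 1},
        {'body': 'c', 'like': 9, 'comment': 0},
        {'body': 'd', 'like': 0, 'comment': 7},
    ]

    # Old behaviour: the last stable sort (likes, descending) wins, and the
    # earlier sort (comment count, ascending) survives as the tie-breaker.
    old = list(sample)
    old.sort(key=lambda k: k.get('comment', 0))
    old.sort(key=lambda k: k.get('like', 0), reverse=True)

    # New behaviour: one composite key, likes descending first.
    new = sorted(sample, key=lambda x: (-x['like'], x['comment']))

    assert old == new
    print([p['body'] for p in new])  # ['c', 'b', 'a', 'd']

Both paths rank by likes first and only fall back to comment count on ties, which matches
the method's stated purpose of returning the most-liked posts.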