#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl posts from the scuinfo.com API and rank those that match keywords.

The script walks post ids from a starting id up to the newest id reported by
the API, keeps the posts whose text matches ``KEYWORD_PATTERN``, and prints
them ordered by like count.

The code runs on both Python 2 and Python 3: all console output goes through
``_say`` so that no ``print`` statement syntax is needed.
"""
import json
import re
import time

__author__ = "Charles"
__date__ = "2018.03.16"

LATEST_POSTS_URL = "http://www.scuinfo.com/api/posts?pageSize=15"
POST_URL_PREFIX = "http://www.scuinfo.com/api/post?id="

# First post id that is inspected when the caller does not supply one.
DEFAULT_START_ID = 131599
# Number of ranked posts printed/returned by default.
DEFAULT_TOP_N = 100
# Connection retries configured on the HTTP session.
MAX_RETRIES = 5

# The escapes spell the Chinese words for "glass", "cup", "smash" and
# "sightseeing".  Compiled once here instead of once per inspected post.
KEYWORD_PATTERN = re.compile(u"\u73bb\u7483|\u676f|\u6454|\u89c2\u5149", re.S)

# NOTE(review): the original code read these three fields positionally via
# ``data.values()[7]``, ``[6]`` and ``[0]``, which depends on Python 2's
# arbitrary dict ordering and fails outright on Python 3.  The key names below
# are inferred from the original local variable names (body, likeCount,
# comment) -- confirm them against a live API response.
BODY_FIELD = "body"
LIKE_FIELD = "likeCount"
COMMENT_FIELD = "commentCount"


def _say(*parts):
    """Print *parts* separated by single spaces on Python 2 and Python 3."""
    print(u" ".join(u"%s" % (part,) for part in parts))


class Spider(object):
    """Collect keyword-matching posts and print them ranked by popularity."""

    def __init__(self):
        """Record the local start time of the crawl and announce it."""
        self.time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        self._session = None
        _say(u"\n", u"开始采集数据", u"\n本地时间:", self.time)

    def _get_session(self):
        """Return the shared HTTP session, creating it on first use.

        Retries are configured through ``HTTPAdapter(max_retries=...)``.  The
        original assignment to ``requests.adapters.DEFAULT_RETRIES`` had no
        effect: it ran after the request had been sent, and the adapter binds
        that default when its ``__init__`` is defined.
        """
        session = getattr(self, "_session", None)
        if session is None:
            # Imported here so the filtering/ranking logic stays importable
            # on machines where the third-party package is not installed.
            import requests
            from requests.adapters import HTTPAdapter

            session = requests.Session()
            adapter = HTTPAdapter(max_retries=MAX_RETRIES)
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            self._session = session
        return session

    def getData(self, url):
        """Fetch *url* and return the ``data`` member of its JSON payload.

        Raises:
            ValueError: if the response body is not valid JSON.
            KeyError: if the payload has no ``data`` member.
        """
        html = self._get_session().get(url).text
        result = json.loads(html)
        return result["data"]

    def getNew(self):
        """Return the id of the first entry in the latest-posts listing.

        Raises:
            IndexError: if the listing is empty.
        """
        data = self.getData(url=LATEST_POSTS_URL)
        return data[0]["id"]

    def _extract(self, data):
        """Return the ``body``/``like``/``comment`` record for a matching post.

        Returns ``None`` when the post has no text or the text does not match
        ``KEYWORD_PATTERN``.  Missing or null counters are stored as 0 so the
        records stay sortable.
        """
        body = data.get(BODY_FIELD)
        if not body or not KEYWORD_PATTERN.search(body):
            return None
        return {
            "body": body,
            "like": data.get(LIKE_FIELD) or 0,
            "comment": data.get(COMMENT_FIELD) or 0,
        }

    def getDetail(self, start_id=DEFAULT_START_ID):
        """Collect every keyword-matching post from *start_id* to the newest.

        Args:
            start_id: first post id to inspect (inclusive).

        Returns:
            A list of dicts with the keys ``body``, ``like`` and ``comment``,
            in ascending post-id order.
        """
        newest = self.getNew()
        container = []
        for post_id in range(start_id, newest + 1):
            # Kept as an attribute because the original exposed it that way.
            self.url = POST_URL_PREFIX + str(post_id)
            data = self.getData(url=self.url)
            if not isinstance(data, dict):
                # NOTE(review): the original tested ``not isinstance(data,
                # list)``; presumably the API answers with a list when the
                # post does not exist -- confirm.
                _say(None)
                continue
            content = self._extract(data)
            if content is None:
                continue
            container.append(content)
            _say(u"\n", len(container), u"\n", u"发言:", content["body"],
                 u"\n", u"点赞:", content["like"], u"",
                 u"评论:", content["comment"])
            # The original paused only after a match; kept as-is.
            time.sleep(0.01)
        _say(u"\n\n", u"至", self.time,
             u"为止,info上关于玻璃杯事件,共有评论", len(container), u"条")
        return container

    def getSort(self, start_id=DEFAULT_START_ID, limit=DEFAULT_TOP_N):
        """Print and return the most-liked matching posts.

        Posts are ordered by like count, highest first; posts with equal like
        counts are ordered by ascending comment count (the result of the
        original pair of stable sorts).

        Args:
            start_id: first post id to inspect (inclusive).
            limit: maximum number of posts to print and return; ``None``
                keeps every matching post.

        Returns:
            The ranked list of post dicts, truncated to *limit* entries.
        """
        container = self.getDetail(start_id)
        _say(u"\n", u"将人气更高的前%s条打印如下:" % (limit,))
        container.sort(key=lambda post: post.get("comment", 0))
        container.sort(key=lambda post: post.get("like", 0), reverse=True)
        # The original announced a top-100 list but printed every match.
        ranked = container[:limit]
        for index, post in enumerate(ranked):
            _say(u"\n\n序号:", index + 1, u"\n发言:", post["body"],
                 u"\n点赞: ", post["like"], u"评论", post["comment"])
        return ranked


def main():
    """Run a full crawl and print the ranked result."""
    spider = Spider()
    spider.getSort()


if __name__ == "__main__":
    main()