Python crawler (final part)
Main crawler script (a multi-threaded crawler that scrapes one month of posts, April 2016, from the Tianya "Zatan" board, along with each poster's and replier's follow relationships):
#coding: utf-8
'''
Created on 2016-04-21
@author: Administrator

Multi-threaded crawler for the Tianya "Zatan" (free) board: grabs one month
(April 2016) of topics, replies, and the poster/replier follow relationships.
'''
import uuid
import requests, re
import json
import time
import MySQLdb
from sqlUtil2 import saveTopic, saveUser, saveRelation, saveComment
from multiprocessing.dummy import Pool as ThreadPool

# s is the global set of user ids that have already been crawled;
# it is initialised in the __main__ block below.


def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=1)  # 1-second timeout
    html.encoding = 'utf-8'
    return html


def getAttentionHtml(userId, pageNo):
    # One page (28 entries) of the users this user is following.
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'following.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second timeout
    html.encoding = 'utf-8'
    return html


def getFansHtml(userId, pageNo):
    # One page (28 entries) of this user's followers.
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'follower.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second timeout
    html.encoding = 'utf-8'
    return html


def getContnetByReg(reg, text):
    return re.findall(reg, text, re.S)


def getReplyData(url):
    # Replier link, replier name, reply time and reply content of every reply on the page.
    reg = r'class="atl-item".+?class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?class="bbs-content">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList


def getTopicData(url):
    # Title, author link, author name, post time, view count, reply count and content of the topic.
    reg = r'class="s_title".+?<span.+?>(.+?)</span>.+?div class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?class="atl-main".+?class="bbs-content clearfix">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList


def getAuthorInfo(authorUrl):
    # Follow count and fan count from the author's profile page.
    reg = r'class="relate-link".+?href="(.+?)">(.+?)</a>.+?href="(.+?)">(.+?)</a>'
    dataList = getContnetByReg(reg, getHtml(authorUrl).text)
    return dataList


def getAttentionList(userId, num):
    jsonstr = getAttentionHtml(userId, num).json()
    print getAttentionHtml(userId, num).text
    return jsonstr["data"]["user"]


def getFansList(userId, num):
    jsonstr = getFansHtml(userId, num).json()
    print getFansHtml(userId, num).text
    return jsonstr["data"]["user"]


def printFans(userId, num, username, conn):
    print '================ Fans ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    # The data volume can be huge, so trim it down, e.g. http://www.tianya.cn/43178991/fans
    if x >= 200:
        x = x / 10
    for i in range(1, x + 1):
        print '------ page', i, '------'
        fansList = getFansList(userId, i)
        for res in fansList:
            try:
                # Save the follow relationship (fan -> author); str() so MySQLdb can bind the UUID.
                relationParams = (str(uuid.uuid4()), res["name"], username)
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the fan as a user.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
                print res["id"], res["name"], res["followCount"], res["fansCount"]
            except Exception, e:
                print 'failed!..', 'exception is: ', e


def printAttention(userId, num, username, conn):
    print '================ Following ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    print x
    for i in range(1, x + 1):
        print '------ page', i, '------'
        attentList = getAttentionList(userId, i)
        for res in attentList:
            try:
                # Save the follow relationship (author -> followed user).
                relationParams = (str(uuid.uuid4()), username, res["name"])
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the followed user.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            print res["id"], res["name"], res["followCount"], res["fansCount"]


def getTopicAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    # Save the topic, its author, the first page of replies and their authors.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    for topic in topicDataList:
        # Post time of the topic.
        postTime = topic[3].strip().split(':')[1]
        print '******', s
        print 'topiclink: ', topiclink
        print 'topicId: ', topiclink.split('-')[-2]
        print 'title: ', topic[0].strip()
        print 'authorLink: ', topic[1].strip()
        print 'authorName: ', topic[2].strip()
        print 'postTime: ', postTime
        print 'scanNum: ', topic[4].strip().split(':')[1]
        print 'replyNum: ', topic[5].strip().split(':')[1]
        print 'content: ', topic[6].strip()
        userId = topic[1].strip().split('/')[-1]
        infoList = getAuthorInfo(topic[1].strip())  # author info (fans, following, ...)
        for info in infoList:
            print '\tattentionNums: ', int(info[1].strip())
            print '\tfansNum: ', int(info[3].strip())
            try:
                # Save the author.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                userparams = (userId, info[3].strip(), info[1].strip(), topic[2].strip(), topic[1].strip(), grabTime)
                saveUser(userparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if userId not in s:
                s.add(userId)
                if int(info[1].strip()) != 0:
                    # Save the people the author follows and those relationships.
                    printAttention(userId, int(info[1].strip()), topic[2].strip(), conn)
                if int(info[3].strip()) != 0:
                    # Save the author's fans and those relationships.
                    printFans(userId, int(info[3].strip()), topic[2].strip(), conn)
        try:
            # Save the topic itself.
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            params = (topiclink.split('-')[-2], topiclink, topic[0].strip(), topic[6].strip(), topic[4].strip().split(':')[1], topic[5].strip().split(':')[1], topic[3].strip().split(':')[1], userId, grabTime)
            saveTopic(params, conn)
        except Exception, e:
            print 'saveTopic-failed!..', 'exception is: ', e
    for data in replyDataList:
        print 'replyerLink: ', data[0].strip()
        print 'replyerName: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':')[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier info (fans, following, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                relplyerparams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(relplyerparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the reply as a comment.
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            commentParams = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':')[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(commentParams, conn)
        except Exception, e:
            print 'failed!..', 'exception is: ', e
    conn.close()


def getReplyAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    # Save replies (and their authors) for pages after the first one of a topic.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    print '............ replies from page 2 onward ............'
    for data in replyDataList:
        print 'topiclink: ', topiclink
        print 'replyerLink: ', data[0].strip()
        print 'replyername: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':')[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier info (fans, following, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                relplyerparams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(relplyerparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the reply as a comment.
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            comment2Params = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':')[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(comment2Params, conn)
        except Exception, e:
            print 'saveComment()-failed!..', 'exception is: ', e
    conn.close()


def spider(url):
    originalUrl = 'http://bbs.tianya.cn'
    authorUrl = 'http://www.tianya.cn'
    reg = r'</tbody>(.+?)</table>'                         # topic table on the board list page
    regLink = r'div class="links".+?</a>.+?href="(.+?)"'   # "next page" link of the board list
    html = getHtml(url)
    nextLink = getContnetByReg(regLink, html.text)
    print 'nextLink: ', originalUrl + nextLink[0]
    n = 1
    while nextLink[0]:
        print '............... list page', n, '..................'
        contentList = getContnetByReg(reg, html.text)
        for content in contentList:
            resreg = r'class="td-title faceblue">.+?href="(.+?)".+?(.+?)</'  # topic link and title
            resultList = getContnetByReg(resreg, content)
            for result in resultList:
                try:
                    # Post time of the topic, used to keep only April 2016.
                    pageHtml = getHtml(originalUrl + result[0].strip())
                    postTimeReg = r'class="s_title".+?div class="atl-info".+?</a>.+?<span>(.+?)</span>'
                    postTimeList = getContnetByReg(postTimeReg, pageHtml.text)
                    postTime = postTimeList[0].strip().split(':')[1]
                    print 'postTime: ', postTime
                    if postTime.startswith('2016-03'):
                        print 'end..'
                        return
                    if not postTime.startswith('2016-04'):
                        print 'continue...'
                        continue
                    print 'start..'
                    # Topic data and first-page reply data.
                    replyDataList = getReplyData(originalUrl + result[0].strip())
                    topicDataList = getTopicData(originalUrl + result[0].strip())
                    print '================================================='
                    # Does the topic have a pager?
                    isPageReg = r'class="atl-head".+?<div>(.+?)</div>'
                    isPage = getContnetByReg(isPageReg, pageHtml.text)
                    print 'isPage[0]: ', isPage[0].strip()
                    if isPage[0].strip() == '':
                        # No pager: everything is on a single page.
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                    else:
                        # There is a pager: walk every page until the "next" link becomes a <span>.
                        regPage = r'class="atl-pages">.+?</strong>.+?<(.+?)>'  # is the current page the last one?
                        flag = getContnetByReg(regPage, pageHtml.text)
                        print 'flag: ', flag[0]
                        print '---------------------------------- page 1 ----------------------------------------------'
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                        i = 1
                        while flag[0] != 'span':
                            i += 1
                            print '---------------------------------------- page', i, '--------------------------------------------------'
                            nextPageReg = r'class="atl-pages">.+?</strong>.+?href="(.+?)">'  # link to the next page
                            nextPageLink = getContnetByReg(nextPageReg, pageHtml.text)
                            print 'nextPageLink: ', originalUrl + nextPageLink[0].strip()
                            replynextPageList = getReplyData(originalUrl + nextPageLink[0].strip())
                            nextPageHtml = getHtml(originalUrl + nextPageLink[0].strip())
                            getReplyAllInfo(topicDataList, replynextPageList, authorUrl, originalUrl + result[0].strip())
                            flag = getContnetByReg(regPage, nextPageHtml.text)
                            pageHtml = nextPageHtml  # advance, so the next iteration extracts the right "next page" link
                except Exception, e:
                    print 'failed!..', 'exception is: ', e
        n += 1
        if n == 2:
            url = 'http://bbs.tianya.cn' + nextLink[0]
            html = getHtml(url)
            nextLink = getContnetByReg(regLink, html.text)
        else:
            # From the second list page on, the "next page" link is the second anchor in the links div.
            regLink2 = r'div class="links".+?</a>.+?</a>.+?href="(.+?)"'
            nextLink = getContnetByReg(regLink2, html.text)
            url = 'http://bbs.tianya.cn' + nextLink[0]
            try:
                html = getHtml(url)
                nextLink = getContnetByReg(regLink, html.text)
            except Exception, e:
                print 'Error! Could not fetch the page!'


if __name__ == '__main__':
    # url = 'http://bbs.tianya.cn/list.jsp?item=develop&order=1'
    url = 'http://bbs.tianya.cn/list.jsp?item=free&order=1'
    page = []
    s = set()   # global set of user ids that have already been crawled
    # newpage = 'http://tieba.baidu.com/p/3522395718?pn=1'
    page.append(url)
    pool = ThreadPool(8)
    try:
        pool.map(spider, page)
    except Exception, e:
        print e
    finally:
        pool.close()
        pool.join()
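A note on the entry point: the pool is created with 8 worker threads, but page only ever holds a single board URL, so pool.map hands spider one task and the crawl effectively runs on one thread. If several boards were to be crawled at once, the same pool could be fed one list URL per board. A minimal sketch, not part of the original script: 'free' and the commented-out 'develop' come from the code above, while 'funinfo' is only a hypothetical example of another board name.

if __name__ == '__main__':
    boards = ['free', 'develop', 'funinfo']   # 'funinfo' is a hypothetical placeholder
    page = ['http://bbs.tianya.cn/list.jsp?item=%s&order=1' % b for b in boards]
    s = set()                                  # shared de-duplication set used by spider()
    pool = ThreadPool(8)                       # up to 8 boards crawled in parallel
    try:
        pool.map(spider, page)                 # spider(url) is called once per board list URL
    except Exception, e:
        print e
    finally:
        pool.close()
        pool.join()

With more than one worker, the shared set s would be read and updated from several threads at once, so guarding s.add() with a threading.Lock would be prudent; the original script sidesteps the issue by crawling a single board.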
Database helper functions, sqlUtil2.py:
#coding: utf-8
'''
Created on 2016-04-27
@author: Administrator

Database helpers: each function inserts one row and commits.
'''


def saveTopic(params, conn):
    cur = conn.cursor()
    sql = "insert into topic(topicId,website,title,content,scanNums,replyNums,postTime,userId,grabTime)\
           values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Topic!'
    print '...................................................'
    cur.close()


def saveUser(params, conn):
    cur = conn.cursor()
    sql = "insert into user(userId,fansCount,followCount,name,writerUrl,grabTime)\
           values(%s,%s,%s,%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== User!'
    print '...................................................'
    cur.close()


def saveRelation(params, conn):
    cur = conn.cursor()
    sql = "insert into relation(id,userFrom,userTo)\
           values(%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Relation!'
    print '...................................................'
    cur.close()


def saveComment(params, conn):
    cur = conn.cursor()
    sql = "insert into comment(commentId,content,postTime,topicId,userId,grabTime)\
           values(%s,%s,%s,%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Comment!'
    print '...................................................'
    cur.close()
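The insert statements above imply four tables (topic, user, relation, comment) in the networkpublicopinionmap3 database, but the original post does not include the schema. The sketch below is a hypothetical helper that creates tables whose names and columns are taken from the insert statements; the column types and lengths are assumptions and should be adjusted to the real schema.

# createTables.py -- hypothetical setup helper; only table and column names
# come from the insert statements above, all types/lengths are assumptions.
import MySQLdb

DDL = [
    """create table if not exists topic(
        topicId varchar(64) primary key, website varchar(255), title varchar(255),
        content text, scanNums int, replyNums int, postTime varchar(32),
        userId varchar(64), grabTime datetime)""",
    """create table if not exists user(
        userId varchar(64) primary key, fansCount int, followCount int,
        name varchar(64), writerUrl varchar(255), grabTime datetime)""",
    """create table if not exists relation(
        id varchar(64) primary key, userFrom varchar(64), userTo varchar(64))""",
    """create table if not exists comment(
        commentId varchar(64) primary key, content text, postTime varchar(32),
        topicId varchar(64), userId varchar(64), grabTime datetime)""",
]

if __name__ == '__main__':
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                           db='networkpublicopinionmap3', port=3306, charset='utf8')
    cur = conn.cursor()
    for ddl in DDL:
        cur.execute(ddl)   # create each table if it does not already exist
    conn.commit()
    cur.close()
    conn.close()

Running this once before the crawler starts is enough; "create table if not exists" makes it safe to re-run.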