Python crawler (final part)
Main crawler script (a multi-threaded crawler that scrapes one month of posts, April 2016, from the Tianya "Zatan" board, along with each poster's and replier's follow relationships):
#coding: utf-8
'''
Created on 2016-04-21
@author: Administrator

Multi-threaded crawler for the Tianya "Zatan" (free) board: grabs one month
(April 2016) of topics, replies, and the poster/replier follow relationships.
'''
import uuid
import requests, re
import json
import time
import MySQLdb
from sqlUtil2 import saveTopic, saveUser, saveRelation, saveComment
from multiprocessing.dummy import Pool as ThreadPool

# s is the global set of user ids that have already been crawled;
# it is initialised in the __main__ block below.


def getHtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=1)  # 1-second timeout
    html.encoding = 'utf-8'
    return html


def getAttentionHtml(userId, pageNo):
    # One page (28 entries) of the users this user is following.
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'following.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second timeout
    html.encoding = 'utf-8'
    return html


def getFansHtml(userId, pageNo):
    # One page (28 entries) of this user's followers.
    url = 'http://www.tianya.cn/api/tw'
    data = {
        'method': 'follower.ice.select',
        'params.userId': userId,
        'params.pageSize': '28',
        'params.pageNo': pageNo
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
    html = requests.get(url, headers=headers, params=data, timeout=1)  # 1-second timeout
    html.encoding = 'utf-8'
    return html


def getContnetByReg(reg, text):
    return re.findall(reg, text, re.S)


def getReplyData(url):
    # Replier link, replier name, reply time and reply content of every reply on the page.
    reg = r'class="atl-item".+?class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?class="bbs-content">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList


def getTopicData(url):
    # Title, author link, author name, post time, view count, reply count and content of the topic.
    reg = r'class="s_title".+?<span.+?>(.+?)</span>.+?div class="atl-info".+?href="(.+?)".+?>(.+?)</a>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?<span>(.+?)</span>.+?class="atl-main".+?class="bbs-content clearfix">(.+?)</div>'
    dataList = getContnetByReg(reg, getHtml(url).text)
    return dataList


def getAuthorInfo(authorUrl):
    # Follow count and fan count from the author's profile page.
    reg = r'class="relate-link".+?href="(.+?)">(.+?)</a>.+?href="(.+?)">(.+?)</a>'
    dataList = getContnetByReg(reg, getHtml(authorUrl).text)
    return dataList


def getAttentionList(userId, num):
    jsonstr = getAttentionHtml(userId, num).json()
    print getAttentionHtml(userId, num).text
    return jsonstr["data"]["user"]


def getFansList(userId, num):
    jsonstr = getFansHtml(userId, num).json()
    print getFansHtml(userId, num).text
    return jsonstr["data"]["user"]


def printFans(userId, num, username, conn):
    print '================ Fans ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    # The data volume can be huge, so trim it down, e.g. http://www.tianya.cn/43178991/fans
    if x >= 200:
        x = x / 10
    for i in range(1, x + 1):
        print '------ page', i, '------'
        fansList = getFansList(userId, i)
        for res in fansList:
            try:
                # Save the follow relationship (fan -> author); str() so MySQLdb can bind the UUID.
                relationParams = (str(uuid.uuid4()), res["name"], username)
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the fan as a user.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
                print res["id"], res["name"], res["followCount"], res["fansCount"]
            except Exception, e:
                print 'failed!..', 'exception is: ', e


def printAttention(userId, num, username, conn):
    print '================ Following ====================='
    if num % 28 == 0:
        x = num / 28
    else:
        x = num / 28 + 1
    print x
    for i in range(1, x + 1):
        print '------ page', i, '------'
        attentList = getAttentionList(userId, i)
        for res in attentList:
            try:
                # Save the follow relationship (author -> followed user).
                relationParams = (str(uuid.uuid4()), username, res["name"])
                saveRelation(relationParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            try:
                # Save the followed user.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                authorUrl = 'http://www.tianya.cn/' + str(res["id"])
                userParams = (res["id"], res["fansCount"], res["followCount"], res["name"], authorUrl, grabTime)
                saveUser(userParams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            print res["id"], res["name"], res["followCount"], res["fansCount"]


def getTopicAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    # Save the topic, its author, the first page of replies and their authors.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    for topic in topicDataList:
        # Post time of the topic.
        postTime = topic[3].strip().split(':')[1]
        print '******', s
        print 'topiclink: ', topiclink
        print 'topicId: ', topiclink.split('-')[-2]
        print 'title: ', topic[0].strip()
        print 'authorLink: ', topic[1].strip()
        print 'authorName: ', topic[2].strip()
        print 'postTime: ', postTime
        print 'scanNum: ', topic[4].strip().split(':')[1]
        print 'replyNum: ', topic[5].strip().split(':')[1]
        print 'content: ', topic[6].strip()
        userId = topic[1].strip().split('/')[-1]
        infoList = getAuthorInfo(topic[1].strip())  # author info (fans, following, ...)
        for info in infoList:
            print '\tattentionNums: ', int(info[1].strip())
            print '\tfansNum: ', int(info[3].strip())
            try:
                # Save the author.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                userparams = (userId, info[3].strip(), info[1].strip(), topic[2].strip(), topic[1].strip(), grabTime)
                saveUser(userparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if userId not in s:
                s.add(userId)
                if int(info[1].strip()) != 0:
                    # Save the people the author follows and those relationships.
                    printAttention(userId, int(info[1].strip()), topic[2].strip(), conn)
                if int(info[3].strip()) != 0:
                    # Save the author's fans and those relationships.
                    printFans(userId, int(info[3].strip()), topic[2].strip(), conn)
        try:
            # Save the topic itself.
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            params = (topiclink.split('-')[-2], topiclink, topic[0].strip(), topic[6].strip(), topic[4].strip().split(':')[1], topic[5].strip().split(':')[1], topic[3].strip().split(':')[1], userId, grabTime)
            saveTopic(params, conn)
        except Exception, e:
            print 'saveTopic-failed!..', 'exception is: ', e
    for data in replyDataList:
        print 'replyerLink: ', data[0].strip()
        print 'replyerName: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':')[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier info (fans, following, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                relplyerparams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(relplyerparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the reply as a comment.
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            commentParams = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':')[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(commentParams, conn)
        except Exception, e:
            print 'failed!..', 'exception is: ', e
    conn.close()


def getReplyAllInfo(topicDataList, replyDataList, authorUrl, topiclink):
    # Save replies (and their authors) for pages after the first one of a topic.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='networkpublicopinionmap3', port=3306, charset='utf8')
    print '............ replies from page 2 onward ............'
    for data in replyDataList:
        print 'topiclink: ', topiclink
        print 'replyerLink: ', data[0].strip()
        print 'replyername: ', data[1].strip()
        print 'dateTime: ', data[2].strip().split(':')[1]
        print 'content: ', data[3].strip()
        replyerId = data[0].strip().split('/')[-1]
        infoList = getAuthorInfo(data[0].strip())  # replier info (fans, following, ...)
        for info in infoList:
            print '\tattentionNums: ', info[1].strip()
            print '\tfansNum: ', info[3].strip()
            try:
                # Save the replier.
                ISOTIMEFORMAT = '%Y-%m-%d %X'
                grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
                relplyerparams = (replyerId, info[3].strip(), info[1].strip(), data[1].strip(), data[0].strip(), grabTime)
                saveUser(relplyerparams, conn)
            except Exception, e:
                print 'failed!..', 'exception is: ', e
            if replyerId not in s:
                s.add(replyerId)
                if int(info[1].strip()) != 0:
                    printAttention(replyerId, int(info[1].strip()), data[1].strip(), conn)
                if int(info[3].strip()) != 0:
                    printFans(replyerId, int(info[3].strip()), data[1].strip(), conn)
        try:
            # Save the reply as a comment.
            ISOTIMEFORMAT = '%Y-%m-%d %X'
            grabTime = time.strftime(ISOTIMEFORMAT, time.localtime())
            comment2Params = (str(uuid.uuid4()), data[3].strip(), data[2].strip().split(':')[1], topiclink.split('-')[-2], replyerId, grabTime)
            saveComment(comment2Params, conn)
        except Exception, e:
            print 'saveComment()-failed!..', 'exception is: ', e
    conn.close()


def spider(url):
    originalUrl = 'http://bbs.tianya.cn'
    authorUrl = 'http://www.tianya.cn'
    reg = r'</tbody>(.+?)</table>'                         # topic table on the board list page
    regLink = r'div class="links".+?</a>.+?href="(.+?)"'   # "next page" link of the board list
    html = getHtml(url)
    nextLink = getContnetByReg(regLink, html.text)
    print 'nextLink: ', originalUrl + nextLink[0]
    n = 1
    while nextLink[0]:
        print '............... list page', n, '..................'
        contentList = getContnetByReg(reg, html.text)
        for content in contentList:
            resreg = r'class="td-title faceblue">.+?href="(.+?)".+?(.+?)</'  # topic link and title
            resultList = getContnetByReg(resreg, content)
            for result in resultList:
                try:
                    # Post time of the topic, used to keep only April 2016.
                    pageHtml = getHtml(originalUrl + result[0].strip())
                    postTimeReg = r'class="s_title".+?div class="atl-info".+?</a>.+?<span>(.+?)</span>'
                    postTimeList = getContnetByReg(postTimeReg, pageHtml.text)
                    postTime = postTimeList[0].strip().split(':')[1]
                    print 'postTime: ', postTime
                    if postTime.startswith('2016-03'):
                        print 'end..'
                        return
                    if not postTime.startswith('2016-04'):
                        print 'continue...'
                        continue
                    print 'start..'
                    # Topic data and first-page reply data.
                    replyDataList = getReplyData(originalUrl + result[0].strip())
                    topicDataList = getTopicData(originalUrl + result[0].strip())
                    print '================================================='
                    # Does the topic have a pager?
                    isPageReg = r'class="atl-head".+?<div>(.+?)</div>'
                    isPage = getContnetByReg(isPageReg, pageHtml.text)
                    print 'isPage[0]: ', isPage[0].strip()
                    if isPage[0].strip() == '':
                        # No pager: everything is on a single page.
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                    else:
                        # There is a pager: walk every page until the "next" link becomes a <span>.
                        regPage = r'class="atl-pages">.+?</strong>.+?<(.+?)>'  # is the current page the last one?
                        flag = getContnetByReg(regPage, pageHtml.text)
                        print 'flag: ', flag[0]
                        print '---------------------------------- page 1 ----------------------------------------------'
                        getTopicAllInfo(topicDataList, replyDataList, authorUrl, originalUrl + result[0].strip())
                        i = 1
                        while flag[0] != 'span':
                            i += 1
                            print '---------------------------------------- page', i, '--------------------------------------------------'
                            nextPageReg = r'class="atl-pages">.+?</strong>.+?href="(.+?)">'  # link to the next page
                            nextPageLink = getContnetByReg(nextPageReg, pageHtml.text)
                            print 'nextPageLink: ', originalUrl + nextPageLink[0].strip()
                            replynextPageList = getReplyData(originalUrl + nextPageLink[0].strip())
                            nextPageHtml = getHtml(originalUrl + nextPageLink[0].strip())
                            getReplyAllInfo(topicDataList, replynextPageList, authorUrl, originalUrl + result[0].strip())
                            flag = getContnetByReg(regPage, nextPageHtml.text)
                            pageHtml = nextPageHtml  # advance, so the next iteration extracts the right "next page" link
                except Exception, e:
                    print 'failed!..', 'exception is: ', e
        n += 1
        if n == 2:
            url = 'http://bbs.tianya.cn' + nextLink[0]
            html = getHtml(url)
            nextLink = getContnetByReg(regLink, html.text)
        else:
            # From the second list page on, the "next page" link is the second anchor in the links div.
            regLink2 = r'div class="links".+?</a>.+?</a>.+?href="(.+?)"'
            nextLink = getContnetByReg(regLink2, html.text)
            url = 'http://bbs.tianya.cn' + nextLink[0]
            try:
                html = getHtml(url)
                nextLink = getContnetByReg(regLink, html.text)
            except Exception, e:
                print 'Error! Could not fetch the page!'


if __name__ == '__main__':
    # url = 'http://bbs.tianya.cn/list.jsp?item=develop&order=1'
    url = 'http://bbs.tianya.cn/list.jsp?item=free&order=1'
    page = []
    s = set()   # global set of user ids that have already been crawled
    # newpage = 'http://tieba.baidu.com/p/3522395718?pn=1'
    page.append(url)
    pool = ThreadPool(8)
    try:
        pool.map(spider, page)
    except Exception, e:
        print e
    finally:
        pool.close()
        pool.join()
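A note on the entry point: the pool is created with 8 worker threads, but page only ever holds a single board URL, so pool.map hands spider one task and the crawl effectively runs on one thread. If several boards were to be crawled at once, the same pool could be fed one list URL per board. A minimal sketch, not part of the original script: 'free' and the commented-out 'develop' come from the code above, while 'funinfo' is only a hypothetical example of another board name.

if __name__ == '__main__':
    boards = ['free', 'develop', 'funinfo']   # 'funinfo' is a hypothetical placeholder
    page = ['http://bbs.tianya.cn/list.jsp?item=%s&order=1' % b for b in boards]
    s = set()                                  # shared de-duplication set used by spider()
    pool = ThreadPool(8)                       # up to 8 boards crawled in parallel
    try:
        pool.map(spider, page)                 # spider(url) is called once per board list URL
    except Exception, e:
        print e
    finally:
        pool.close()
        pool.join()

With more than one worker, the shared set s would be read and updated from several threads at once, so guarding s.add() with a threading.Lock would be prudent; the original script sidesteps the issue by crawling a single board.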
Database helper functions, sqlUtil2.py:
#coding: utf-8
'''
Created on 2016-04-27
@author: Administrator

Database helpers: each function inserts one row and commits.
'''


def saveTopic(params, conn):
    cur = conn.cursor()
    sql = "insert into topic(topicId,website,title,content,scanNums,replyNums,postTime,userId,grabTime)\
           values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Topic!'
    print '...................................................'
    cur.close()


def saveUser(params, conn):
    cur = conn.cursor()
    sql = "insert into user(userId,fansCount,followCount,name,writerUrl,grabTime)\
           values(%s,%s,%s,%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== User!'
    print '...................................................'
    cur.close()


def saveRelation(params, conn):
    cur = conn.cursor()
    sql = "insert into relation(id,userFrom,userTo)\
           values(%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Relation!'
    print '...................................................'
    cur.close()


def saveComment(params, conn):
    cur = conn.cursor()
    sql = "insert into comment(commentId,content,postTime,topicId,userId,grabTime)\
           values(%s,%s,%s,%s,%s,%s)"
    cur.execute(sql, params)
    conn.commit()
    print 'insert OK ===================================================== Comment!'
    print '...................................................'
    cur.close()
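The insert statements above imply four tables (topic, user, relation, comment) in the networkpublicopinionmap3 database, but the original post does not include the schema. The sketch below is a hypothetical helper that creates tables whose names and columns are taken from the insert statements; the column types and lengths are assumptions and should be adjusted to the real schema.

# createTables.py -- hypothetical setup helper; only table and column names
# come from the insert statements above, all types/lengths are assumptions.
import MySQLdb

DDL = [
    """create table if not exists topic(
        topicId varchar(64) primary key, website varchar(255), title varchar(255),
        content text, scanNums int, replyNums int, postTime varchar(32),
        userId varchar(64), grabTime datetime)""",
    """create table if not exists user(
        userId varchar(64) primary key, fansCount int, followCount int,
        name varchar(64), writerUrl varchar(255), grabTime datetime)""",
    """create table if not exists relation(
        id varchar(64) primary key, userFrom varchar(64), userTo varchar(64))""",
    """create table if not exists comment(
        commentId varchar(64) primary key, content text, postTime varchar(32),
        topicId varchar(64), userId varchar(64), grabTime datetime)""",
]

if __name__ == '__main__':
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                           db='networkpublicopinionmap3', port=3306, charset='utf8')
    cur = conn.cursor()
    for ddl in DDL:
        cur.execute(ddl)   # create each table if it does not already exist
    conn.commit()
    cur.close()
    conn.close()

Running this once before the crawler starts is enough; "create table if not exists" makes it safe to re-run.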