人民日报语料库抓取python实现（二）--多线程

来源：互联网　发布：网络借贷平台排行榜　编辑：程序博客网　时间：2024/05/22 16:03

由于有大量的IO操作，多线程可以提高爬取的效率。出于用不同队列存储不同url、并对爬虫进行分工的初衷，这里实现了两个队列shareMonthQueue和shareReportQueue。其中shareMonthQueue存储所有月份的初始url及其包含的其他页面（一个月份有很多page，例：1946年5月包含30个page）；shareReportQueue存储所有新闻的url。两个队列各有其专用的爬虫monthSpider（代码中的类名写作monthSplider）和reportSpider。师兄说：从操作系统的角度来看，两个队列是多此一举，增加代码复杂度，并不提高效率。我想了想，师兄说得对。

上代码：

#coding:utf-8
#author:zhangyang
#date:2015-5-21
# Crawler for the People's Daily archive served at rmrbw.info (Python 2:
# it relies on the urllib2 and Queue modules).
#   * The main page is parsed for links to month pages (the original notes
#     say the site covers 1946 through 2003).
#   * Each month page lists the reports of that month, spread over several
#     numbered pages.
#   * Each report page holds the text of one report.
# Several threads are used to speed up the crawl.
import urllib2
import bs4
import os
import re
from time import clock
import threading
import Queue

# For how bs4 parses pages see:
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
starturl = "http://rmrbw.info/"
shareMonthQueue = Queue.Queue()   # shared queue of month-page urls
shareReportQueue = Queue.Queue()  # shared queue of report urls
_WORK_MONTH_THREAD_NUM = 3        # number of spiders consuming month urls
_WORK_REPORT_THREAD_NUM_ = 10     # number of spiders consuming report urls
totalNum = 0                      # global counter of saved reports
mutex = threading.Lock()          # guards totalNum
tlist = []                        # every spider thread created by main()
t1 = clock()
t2 = clock()
t3 = clock()
t4 = clock()


def _fetch_soup(url):
    """Download *url* and return it parsed by BeautifulSoup (lxml parser).

    The response is always closed, even when reading fails.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    return bs4.BeautifulSoup(html, 'lxml')


def _ensure_dir(path):
    """Create *path* together with missing parents; do nothing if it exists."""
    try:
        os.makedirs(path)
    except OSError:
        # Either it already existed or another thread created it first;
        # re-raise only when the directory really is missing.
        if not os.path.isdir(path):
            raise


def _leading_int(text):
    """Return the integer formed by the first run of digits at the start of *text*.

    Raises:
        ValueError: if *text* does not start (after whitespace) with a digit.
    """
    found = re.match(r'\s*(\d+)', text)
    if found is None:
        raise ValueError("no page number in %r" % (text,))
    return int(found.group(1))


class monthSplider(threading.Thread):
    """Worker that consumes month-page urls from shareMonthQueue.

    For every page it creates the month's data directory, enqueues the next
    page of the same month and pushes every report url found on the page to
    shareReportQueue.

    Args:
        name: thread name, also used in log output.
        dicPath: directory prefix (with trailing separator) under which one
            sub-directory per month is created.
    """

    def __init__(self, name, dicPath=os.getcwd() + os.path.sep + "data" + os.path.sep):
        threading.Thread.__init__(self)
        self.name = name
        self.dicPath = dicPath
        self.TIMEOUT = 10  # seconds to wait on an empty queue before exiting

    def run(self):
        """Process month pages until the queue stays empty for TIMEOUT seconds."""
        while True:
            # A blocking get with a timeout replaces the empty()/get() pair:
            # with several consumers the queue could be drained between the
            # two calls, leaving this thread blocked forever.
            try:
                monthurl = shareMonthQueue.get(timeout=self.TIMEOUT)
            except Queue.Empty:
                break
            try:
                soup = _fetch_soup(monthurl)
            except Exception as e:
                print("loading url error at line 43")
                print(e)
                continue
            try:
                self._process_page(monthurl, soup)
            except Exception as e:
                # An unexpected page layout must not kill the worker.
                print("parse error in " + self.name + ": " + repr(e))

    def _process_page(self, monthurl, soup):
        """Create the month directory and enqueue follow-up urls for one page."""
        title = soup.find('a', 'fl')  # tag that carries the year/month label
        month = title.contents[0]
        _ensure_dir(self.dicPath + month.encode('gbk'))

        pages = soup.find('div', 'pages').contents[-1]
        # The page count must be an int: on Python 2 an int always compares
        # as smaller than a string, so the check below would never be false.
        totalpage = _leading_int(pages.split(' ')[3].split('/')[1])
        templist = monthurl.split('=')
        curpage = int(templist[-1].strip())  # current page number
        # While pages remain, enqueue the next page of this month.
        if curpage < totalpage:
            templist[-1] = str(curpage + 1)
            shareMonthQueue.put('='.join(templist))

        # Hand every report linked from this page to the report spiders.
        for item in soup.find_all(id=re.compile("a_ajax_")):
            shareReportQueue.put(starturl + item['href'])


class reportSpider(threading.Thread):
    """Worker that consumes report urls from shareReportQueue.

    Each report is saved as ``<dicPath><month>/<title>.txt`` encoded as gbk.

    Args:
        name: thread name, also used in log output.
        dicPath: directory prefix (with trailing separator) that contains
            the per-month sub-directories.
    """

    def __init__(self, name, dicPath=os.getcwd() + os.path.sep + "data" + os.path.sep):
        threading.Thread.__init__(self)
        self.name = name
        self.dicPath = dicPath
        self.TIMEOUT = 10  # seconds to wait on an empty queue before exiting

    def run(self):
        """Save reports until the queue stays empty for TIMEOUT seconds."""
        global totalNum, t4
        while True:
            try:
                url = shareReportQueue.get(timeout=self.TIMEOUT)
            except Queue.Empty:
                break
            try:
                soup = _fetch_soup(url)
            except Exception as e:
                print("loading url error at line 93")
                print(e)
                continue
            try:
                month, title, cont = self._extract(soup)
            except Exception as e:
                # Missing tags or text that gbk cannot encode: skip the page.
                print("parse error in " + self.name + ": " + repr(e))
                continue

            filename = self.dicPath + month + os.path.sep + title + '.txt'
            try:
                # "with" closes the file; the original leaked the handle.
                with open(filename, 'w') as f:
                    f.write(cont)
            except Exception as e:
                print(str(e) + self.name)
                continue

            # Increment and read the counter under the same lock so every
            # multiple of 100 is reported exactly once.
            with mutex:
                totalNum += 1
                processed = totalNum
            if processed % 100 == 0:
                t4 = clock()
                print("已处理了" + str(processed) + "条数据,用时" + str(t4 - t3) + 's')

    def _extract(self, soup):
        """Return (month, title, content) of a report page as gbk byte strings."""
        # Year/month the report belongs to.
        month = soup.find('a', href=re.compile('thread.php')).get_text().strip()
        # Report title: the first space-separated token of the heading.
        title = soup.find('h1', 'fl').get_text().strip().split(' ')[0]
        # Report body.
        cont = soup.find('div', 'tpc_content').get_text().strip()
        return month.encode('gbk'), title.encode('gbk'), cont.encode('gbk')


def main():
    """Seed the month queue from the main page, then run all spiders to completion."""
    global t1, t2, t3
    t1 = clock()
    mainsoup = _fetch_soup(starturl)
    # Only the first 10 matching links are taken (limit=10).
    alist = mainsoup.find_all('a', class_='fnamecolor', limit=10)
    for item in alist:
        monthurl = item['href'] + '&page=1'
        shareMonthQueue.put(starturl + monthurl)
    t2 = clock()
    print("主页面爬取完成，用时" + str(t2 - t1) + 's')

    # One loop builds both kinds of spider; month spiders are only created
    # for the first _WORK_MONTH_THREAD_NUM iterations.
    for i in range(_WORK_REPORT_THREAD_NUM_):
        if i < _WORK_MONTH_THREAD_NUM:
            tlist.append(monthSplider('ms' + str(i)))
        tlist.append(reportSpider('rs' + str(i)))
    t3 = clock()
    print("爬虫准备就绪,用时" + str(t3 - t2) + 's')

    for t in tlist:
        t.start()
    for t in tlist:
        t.join()


if __name__ == "__main__":
    main()

0 0