Python 单线程与多线程爬虫

来源:互联网 发布:windows home basic 编辑:程序博客网 时间:2024/05/18 02:33

帮别人写爬虫,一开始用的是单线程,速度太慢,于是改成了多线程。

1.单线程

# Single-threaded scraper.
#
# For each listing offset 0, 20, ..., 180 it downloads the Baidu job search
# result page, extracts the detail-page links whose text ends with the searched
# job title, downloads every detail page and writes each complete six-field
# record as one row of an .xls workbook.  The elapsed time in whole seconds is
# printed at the end.
import datetime
import re
import string
import urllib
import urllib.parse  # used below; do not rely on urllib.request importing it
import urllib.request

import requests
import xlwt


def set_style(name, height, bold=False):
    """Return an xlwt ``XFStyle`` whose font uses the given settings.

    Args:
        name: Font family name, e.g. 'Times New Roman'.
        height: Value assigned to the font's ``height`` attribute.
        bold: Whether the font is bold.

    Returns:
        The configured ``xlwt.XFStyle`` (font colour index is fixed to 4).
    """
    style = xlwt.XFStyle()  # fresh style object
    font = xlwt.Font()  # font attached to the style below
    font.name = name
    font.bold = bold
    font.color_index = 4
    font.height = height
    style.font = font
    return style


# Listing URL; the result offset (pn) is appended per page.
LIST_URL = 'http://zhaopin.baidu.com/quanzhi?tid=4139&ie=utf8&oe=utf8&query=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88&city_sug=%E9%95%BF%E6%B2%99&&detailmode=close&rn=20&pn='

# The listing page is searched as raw bytes, so the pattern is UTF-8 encoded.
HREF_RE = re.compile(r"(href='.{0,200}数据挖掘工程师')".encode("utf-8"), re.S)

# Six capture groups, in the same order as the header row below.
DETAIL_RE = re.compile(
    '<span class="title line-clamp1">(.*?)</span>.*?class="salary">(.*?)</span>.*?class="xueli"></span>(.*?)</li>.*?class="minge"></span>(.*?)</li>.*?class="duty duty-box">(.*?)</p>.*?class="line-clamp2">(.*?)</p>',
    re.S,
)

# Workbook with one sheet; row 0 holds the column titles.
wdk = xlwt.Workbook()
sheet1 = wdk.add_sheet('sheet1', cell_overwrite_ok=True)
row = ['招聘职位', '月薪', '学历要求', '工作经验', '应聘要求', '工作单位']
r = 0
for index, title in enumerate(row):
    sheet1.write(r, index, title)
r += 1  # data rows start right below the header

starttime = datetime.datetime.now()
for pn in range(0, 200, 20):
    # Read the listing page and release the connection immediately.
    with urllib.request.urlopen(LIST_URL + str(pn)) as wp:
        content = wp.read()

    for raw_href in HREF_RE.findall(content):
        # Turn "href='/path...'" into an absolute URL: swap the attribute
        # prefix for the site root and drop the trailing quote.
        link = raw_href.decode('utf-8').replace("href='", "http://zhaopin.baidu.com")[:-1]
        detail = requests.get(urllib.parse.unquote(link))
        content1 = detail.content.decode('utf-8')

        for item in DETAIL_RE.findall(content1):
            # Skip records in which any of the six fields is empty.
            if not all(item):
                continue
            for k, value in enumerate(item):
                sheet1.write(r, k, value)
            r += 1

wdk.save('/home/y.xls')
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

我这里单线程版本的运行时间大约是 80 秒。

2.多线程

# Multi-threaded scraper.
#
# One worker thread per listing offset (0, 20, ..., 180).  Each worker
# downloads its listing page, follows the matching detail links and appends
# every complete six-field record to the shared list ``inte``.  When run as a
# script, the main thread waits for all workers, writes the collected records
# below the header row and saves the workbook.
import datetime
import re
import string
import threading
import urllib
import urllib.parse  # used below; do not rely on urllib.request importing it
import urllib.request
from time import ctime, sleep

import requests
import xlwt


def set_style(name, height, bold=False):
    """Return an xlwt ``XFStyle`` whose font uses the given settings.

    Args:
        name: Font family name, e.g. 'Times New Roman'.
        height: Value assigned to the font's ``height`` attribute.
        bold: Whether the font is bold.

    Returns:
        The configured ``xlwt.XFStyle`` (font colour index is fixed to 4).
    """
    style = xlwt.XFStyle()  # fresh style object
    font = xlwt.Font()  # font attached to the style below
    font.name = name
    font.bold = bold
    font.color_index = 4
    font.height = height
    style.font = font
    return style


# Listing URL; the result offset (pn) is appended per page.
LIST_URL = 'http://zhaopin.baidu.com/quanzhi?tid=4139&ie=utf8&oe=utf8&query=%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98%E5%B7%A5%E7%A8%8B%E5%B8%88&city_sug=%E9%95%BF%E6%B2%99&&detailmode=close&rn=20&pn='

# The listing page is searched as raw bytes, so the pattern is UTF-8 encoded.
HREF_RE = re.compile(r"(href='.{0,200}数据挖掘工程师')".encode("utf-8"), re.S)

# Six capture groups, in the same order as the header row below.
DETAIL_RE = re.compile(
    '<span class="title line-clamp1">(.*?)</span>.*?class="salary">(.*?)</span>.*?class="xueli"></span>(.*?)</li>.*?class="minge"></span>(.*?)</li>.*?class="duty duty-box">(.*?)</p>.*?class="line-clamp2">(.*?)</p>',
    re.S,
)

# Per-request timeout in seconds.  Without it a stalled connection would keep
# a worker alive forever and the final join() would never return.
REQUEST_TIMEOUT = 30

# Workbook with one sheet; row 0 holds the column titles.
wdk = xlwt.Workbook()
sheet1 = wdk.add_sheet('sheet1', cell_overwrite_ok=True)
row = ['招聘职位', '月薪', '学历要求', '工作经验', '应聘要求', '工作单位']
r = 0
for index, title in enumerate(row):
    sheet1.write(r, index, title)
r += 1

# Records collected by the workers; each entry is a 6-tuple of strings.
inte = []


def f(pn):
    """Scrape one listing page and collect its complete job records.

    Downloads the listing page at offset ``pn``, requests every detail page
    linked from it and appends each record whose six fields are all non-empty
    to the module-level list ``inte``.  Finally prints the whole seconds
    elapsed since the module-level ``starttime``.

    Args:
        pn: Result offset appended to the listing URL.
    """
    # Read the listing page and release the connection immediately.
    with urllib.request.urlopen(LIST_URL + str(pn), timeout=REQUEST_TIMEOUT) as wp:
        content = wp.read()

    for raw_href in HREF_RE.findall(content):
        # Turn "href='/path...'" into an absolute URL: swap the attribute
        # prefix for the site root and drop the trailing quote.
        link = raw_href.decode('utf-8').replace("href='", "http://zhaopin.baidu.com")[:-1]
        detail = requests.get(urllib.parse.unquote(link), timeout=REQUEST_TIMEOUT)
        content1 = detail.content.decode('utf-8')

        for item in DETAIL_RE.findall(content1):
            # Skip records in which any of the six fields is empty.
            if all(item):
                inte.append(item)

    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)


# Daemon workers, so an interrupted main thread is not kept alive by them.
threads = [
    threading.Thread(target=f, args=(pn,), daemon=True)
    for pn in range(0, 200, 20)
]

# Timing starts just before the workers are launched.
starttime = datetime.datetime.now()

if __name__ == '__main__':
    for t in threads:
        t.start()
    # Wait until every worker has actually finished instead of sleeping for a
    # fixed time: nothing is lost when the site is slow, and no time is wasted
    # when it is fast.
    for t in threads:
        t.join()

    # Data rows start at 1, directly below the header row.
    for ind, item in enumerate(inte, 1):
        for k, value in enumerate(item):
            sheet1.write(ind, k, value)
    wdk.save('/home/t.xls')
多线程版本大约 15 秒就能跑完。