Python 学习之使用 multiprocessing.dummy 实现多线程爬虫

来源:互联网 发布:比特彗星端口设置 编辑:程序博客网 时间:2024/05/16 19:07
#-*- coding:utf8 -*-
# Multi-threaded crawler example: download several course-listing pages in
# parallel with a multiprocessing.dummy thread pool and pull each course's
# title and description out of the HTML with regular expressions.
import re  # regular-expression module
import sys
import time
from multiprocessing.dummy import Pool as ThreadPool

# Python 2 only: make utf-8 the default codec so implicit str/unicode
# conversions of Chinese text do not fail.  Python 3 has neither the builtin
# reload() nor sys.setdefaultencoding(), so the workaround is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would block pool.join() forever.
REQUEST_TIMEOUT = 10


def changepage(url, total_page):
    """Return the URLs of all pages from the one in *url* up to *total_page*.

    Args:
        url: Listing URL containing a ``pageNum=<digits>`` parameter; that
            number is the first page of the result.
        total_page: Number of the last page to include (inclusive).

    Returns:
        A list with one URL per page, in ascending page order.  The list is
        empty when the page number in *url* is greater than *total_page*.

    Raises:
        AttributeError: If *url* contains no ``pageNum=<digits>`` parameter
            (``re.search`` returns ``None``).
    """
    # re.search only finds the first match; group(1) is the captured digits.
    now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
    # The fourth positional argument of re.sub is ``count``, not ``flags``;
    # passing re.S there would cap the number of replacements at 16.  The
    # pattern contains no ``.``, so no flag is needed at all.
    return [
        re.sub(r'pageNum=\d+', 'pageNum=%s' % page, url)
        for page in range(now_page, total_page + 1)
    ]


def geteveryclass(url):
    """Download one listing page and extract the info of every course on it.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts as produced by ``getinfo``, one per course block
        found on the page.  The list is also printed, as before.
    """
    # Imported here rather than at module level so that the pure helpers
    # (changepage, getinfo) stay importable without the HTTP dependency.
    import requests

    html = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Each course block runs from the ``deg="0" >`` marker to the closing
    # </li>; re.S lets ``.`` span the newlines inside a block.
    classinfo = [
        getinfo(each)
        for each in re.findall(r'(deg="0" >.*?</li>)', html.text, re.S)
    ]
    print(classinfo)
    return classinfo


def getinfo(eachclass):
    """Extract the fields we need from the HTML of a single course block.

    Args:
        eachclass: HTML text of one course block.

    Returns:
        A dict with two keys: ``'title'`` (value of the ``title`` attribute
        that follows ``class="lessonimg"``) and ``'content'`` (text of the
        paragraph whose style ends in ``display: none;``).

    Raises:
        AttributeError: If either pattern is not found in *eachclass*.
    """
    info = {}
    # Stop at the attribute's closing quote.  Matching up to the tag's ``>``
    # would also capture that quote and any attributes that follow it.
    info['title'] = re.search(
        r'class="lessonimg" title="(.*?)"', eachclass, re.S).group(1)
    info['content'] = re.search(
        r'display: none;">(.*?)</p>', eachclass, re.S).group(1)
    return info


def main():
    """Crawl pages 1-5 of the course list with 4 threads and print the time."""
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    url_list = changepage(url, 5)
    pool = ThreadPool(4)  # four worker threads
    time1 = time.time()
    try:
        # Run geteveryclass once per URL, spread over the worker threads.
        pool.map(geteveryclass, url_list)
    finally:
        pool.close()
        pool.join()  # wait for every worker to finish before continuing
    time2 = time.time()
    print(u'多线程耗时:' + str(time2 - time1))


if __name__ == "__main__":
    main()

0 0
原创粉丝点击