Crawler Basics, Part 4 (Multi-threaded Crawlers)


1. Single-threaded page downloading

# coding=utf-8
import csv
import datetime
from datetime import timedelta

from day3 import link_crawler  # slightly modified: tweaked the same_domain check and the html encoding
from mongoDB import MongoCache  # implemented earlier


def getUrl(number):
    # read the first `number` sites from the Alexa top-1m CSV and build URLs
    urls = []
    path = 'D:/top-1m.csv/top-1m.csv'
    for _, website in csv.reader(open(path)):
        urls.append('http://' + website)
        if len(urls) != number:
            print _, website
        else:
            return urls


def main():
    starttime = datetime.datetime.now()
    cache = MongoCache(expires=timedelta())
    cache.clear()
    link_crawler('http://example.webscraping.com', scrape_callback=getUrl(10),
                 cache=cache, delay=0, num_retries=1, max_depth=1,
                 user_agent='GoodCrawler')
    endtime = datetime.datetime.now()
    print (endtime - starttime).seconds


if __name__ == '__main__':
    main()
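
For context, top-1m.csv is a plain rank,domain listing, so getUrl only has to prefix each domain with http://. A hypothetical run (the domains below are illustrative, not the actual file contents):

# top-1m.csv (rank,domain):
#   1,google.com
#   2,facebook.com
#   3,youtube.com
#
# getUrl(3) would then return:
#   ['http://google.com', 'http://facebook.com', 'http://youtube.com']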

Over 80 seconds.
https://bitbucket.org/wswp/code/src/9e6b82b47087c2ada0e9fdf4f5e037e151975f0f?at=default
That is the book's source code repository. Everything is in fragments, which is quite annoying.
Running it raised bson.errors.InvalidStringData: strings in documents must be valid UTF-8.
The error happens when inserting into the database; s.decode("unicode_escape") fixes it directly. http://blog.csdn.net/woshicsdn7547/article/details/41678093
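
A minimal sketch of where that fix can sit, assuming the cached value is a dict that carries the raw page under an 'html' key (the key name is an assumption; adapt it to your MongoCache):

# coding=utf-8
# Sketch only: sanitise the downloaded page before MongoCache writes it,
# so bson no longer rejects it. The 'html' key is an assumed field name.
def make_mongo_safe(result):
    html = result.get('html')
    if isinstance(html, str):
        # bson requires valid UTF-8; re-decode the raw bytes first
        result['html'] = html.decode('unicode_escape')
    return result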

2. Multi-threading

# coding=utf-8
import datetime
import threading
import time

from day3 import Downloader
from day3 import normalize
from mongoDB import MongoCache
from chapter4 import getUrl

SLEEP_TIME = 1


def threaded_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                     headers=None, user_agent='wswp', proxy=None, num_retries=1,
                     cache=None, scrape_callback=None, max_threads=10):
    crawl_queue = [seed_url]
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxy, num_retries=num_retries)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)  # calls Downloader.__call__ to fetch the page
                if scrape_callback:
                    try:
                        # here scrape_callback is the list returned by getUrl, not a callable
                        links = scrape_callback or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        # drop finished threads
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # create a worker thread
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon thread: the program can exit without waiting for it
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        # time.sleep(SLEEP_TIME)


def main():
    starttime = datetime.datetime.now()
    cache = MongoCache(expires=datetime.timedelta())
    cache.clear()
    threaded_crawler('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip',
                     scrape_callback=getUrl(10), cache=cache)
    endtime = datetime.datetime.now()
    print (endtime - starttime).seconds


if __name__ == '__main__':
    main()
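
One detail worth noting: in this script scrape_callback is actually the list returned by getUrl(10), so links = scrape_callback or [] simply enqueues those ten Alexa URLs once. In the book's interface the callback is a callable invoked per downloaded page; a minimal sketch of that form (alexa_links is a made-up name for illustration):

# Illustrative callable form of scrape_callback (not the code used above).
def alexa_links(url, html):
    # parse `html` and return the links that should be crawled next;
    # returning an empty list keeps the crawl restricted to the seed URLs
    return []

# threaded_crawler(seed_url, scrape_callback=alexa_links, cache=cache)
# With a callable, the line inside process_queue would instead read:
#     links = scrape_callback(url, html) or []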

23 seconds.

3. Multiprocessing

Previously the crawl queue lived locally in memory, so multiple processes could not share it. The first step was to simulate a queue with MongoDB: 0 means a URL has just been added, 1 means it is being processed, and 2 means it is complete. The full source is at the link above; the key code is pasted below for my own future review.

import multiprocessing


def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()  # start one process per CPU core
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for all processes to complete
    for p in processes:
        p.join()
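
For later reference, here is a condensed sketch of the MongoDB-backed queue described above, using the 0/1/2 states. The collection and field names (crawl_queue, status, timestamp) follow the book's version from memory, so treat them as assumptions and check the repository linked earlier for the full implementation.

from datetime import datetime
from pymongo import MongoClient, errors


class MongoQueue:
    # states for each URL in the queue
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def push(self, url):
        # add a new URL; ignore it if it is already queued
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an outstanding URL and mark it as being processed
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}})
        if record:
            return record['_id']
        raise KeyError()

    def complete(self, url):
        # mark a URL as finished
        self.db.crawl_queue.update({'_id': url},
                                   {'$set': {'status': self.COMPLETE}})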

4. Summary

Using threads and processes improves crawling efficiency. I will look for more examples this evening.
17/9/10
