
来源:互联网 发布:jsp和php 编辑:程序博客网 时间:2024/06/05 11:00

  • 1一百万个网站
    • 1用普通方法解析Alexa列表
    • 2复用爬虫代码解析Alexa列表
  • 2串行爬虫
  • 3并发并行爬虫
    • 0并发并行工作原理
    • 1多线程爬虫
    • 2多进程爬虫
  • 4性能对比



亚马逊子公司Alexa提供了最受欢迎的100万个网站列表(http://www.alexa.com/topsites ),我们也可以通过http://s3.amazonaws.com/alexa-static/top-1m.csv.zip 直接下载这一列表的压缩文件,这样就不用去提取Alexa网站的数据了。

| 排名 | 域名 |
| --- | --- |
| 1 | google.com |
| 2 | youtube.com |
| 3 | facebook.com |
| 4 | baidu.com |
| 5 | yahoo.com |
| 6 | wikipedia.com |
| 7 | google.co.in |
| 8 | amazon.com |
| 9 | qq.com |
| 10 | google.co.jp |
| 11 | live.com |
| 12 | taobao.com |


- 下载.zip文件;
- 从.zip文件中提取出CSV文件;
- 解析CSV文件;
- 遍历CSV文件中的每一行,从中提取出域名数据。

# -*- coding: utf-8 -*-
import csv
from zipfile import ZipFile
from StringIO import StringIO
from downloader import Downloader


def alexa():
    """Fetch the Alexa top-sites archive and return its sites as URLs.

    The zip archive is downloaded through ``Downloader``, its first member
    is parsed as CSV, and the second column of every row is prefixed with
    ``http://``.

    Returns:
        A list of URL strings, one per CSV row, in file order.
    """
    fetch = Downloader()
    # presumably the Downloader call returns the raw response body -- confirm
    archive_bytes = fetch('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    with ZipFile(StringIO(archive_bytes)) as archive:
        # the CSV is taken to be the first member of the archive
        member_name = archive.namelist()[0]
        rows = csv.reader(archive.open(member_name))
        # each row must unpack into exactly two fields; the first is ignored
        return ['http://' + domain for _rank, domain in rows]


if __name__ == '__main__':
    print(len(alexa()))




# -*- coding: utf-8 -*-
import csv
from zipfile import ZipFile
from StringIO import StringIO
from mongo_cache import MongoCache


class AlexaCallback:
    """Scrape callback that expands the Alexa top-sites archive into URLs.

    Instances are callable with ``(url, html)``. Only the download of
    ``seed_url`` is acted upon; it is unpacked and its CSV rows are turned
    into ``http://`` URLs.
    """

    def __init__(self, max_urls=1000):
        """
        max_urls: stop collecting once this many URLs have been gathered
        """
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        """Return the URLs listed in the archive, skipping ones in the cache.

        url: the URL that was just downloaded
        html: the downloaded content; for the seed URL this is the zip data

        Returns a list of at most ``max_urls`` URLs when ``url`` is the seed
        URL, and None for any other URL.
        """
        if url != self.seed_url:
            return None

        urls = []
        # `cache` has to exist for the membership test below; without this
        # assignment the first CSV row raises NameError.
        # presumably `in cache` is true for URLs already stored -- confirm
        cache = MongoCache()
        with ZipFile(StringIO(html)) as zf:
            # the CSV is taken to be the first member of the archive
            csv_filename = zf.namelist()[0]
            for _, website in csv.reader(zf.open(csv_filename)):
                candidate = 'http://' + website
                if candidate not in cache:
                    urls.append(candidate)
                    if len(urls) == self.max_urls:
                        break
        return urls



# -*- coding: utf-8 -*-
from link_crawler import link_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main():
    """Run the serial link crawler seeded with the Alexa archive URL.

    The same AlexaCallback instance supplies the seed URL and acts as the
    scrape callback; downloads go through a MongoCache.
    """
    alexa_callback = AlexaCallback()
    mongo_cache = MongoCache()
    # the cache is deliberately not cleared here, so earlier entries persist
    link_crawler(
        alexa_callback.seed_url,
        scrape_callback=alexa_callback,
        cache=mongo_cache,
        timeout=10,
        ignore_robots=True,
    )


if __name__ == '__main__':
    main()

time python ...







import time
import threading
import urlparse
from downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='Wu_Being', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl from seed_url using up to max_threads worker threads.

    seed_url: first URL to download; also the base for resolving links
    delay, cache, user_agent, proxies, num_retries, timeout: passed
        unchanged to Downloader
    scrape_callback: optional callable ``(url, html)`` returning an iterable
        of links to enqueue (or a falsy value for none)
    max_threads: upper bound on concurrently running worker threads

    Returns None once the queue is empty and every worker has finished.
    """
    # URLs still to be crawled; workers pop from the end of this list
    crawl_queue = [seed_url]
    # URLs that have already been queued, to avoid crawling them twice
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: download queued URLs until the queue is empty."""
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)

    # keep going while workers are running or URLs are waiting
    threads = []
    while threads or crawl_queue:
        # Drop finished workers by rebuilding the list. Removing items from
        # `threads` while iterating over it skips the entry that follows
        # each removal, leaving dead threads occupying slots.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # daemon threads let the main thread exit on ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Return link as an absolute URL with any #fragment removed.

    The fragment is stripped so that URLs differing only by fragment are
    treated as duplicates; the result is resolved against seed_url.
    """
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)


# -*- coding: utf-8 -*-
import sys
from threaded_crawler import threaded_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    """Run the threaded crawler seeded with the Alexa archive URL.

    max_threads: maximum number of worker threads, forwarded to
        threaded_crawler
    """
    alexa_callback = AlexaCallback()
    mongo_cache = MongoCache()
    # the cache is deliberately not cleared here, so earlier entries persist
    threaded_crawler(
        alexa_callback.seed_url,
        scrape_callback=alexa_callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )


if __name__ == '__main__':
    # the thread count is the first command-line argument
    max_threads = int(sys.argv[1])
    main(max_threads)

$time python 3threaded_test.py 5



# -*- coding: utf-8 -*-
import sys
from process_crawler import process_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    """Run the multi-process crawler seeded with the Alexa archive URL.

    max_threads: maximum number of worker threads, forwarded through
        process_crawler as a keyword argument
    """
    alexa_callback = AlexaCallback()
    mongo_cache = MongoCache()
    # start from an empty cache on every run
    mongo_cache.clear()
    process_crawler(
        alexa_callback.seed_url,
        scrape_callback=alexa_callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )


if __name__ == '__main__':
    # the thread count is the first command-line argument
    max_threads = int(sys.argv[1])
    main(max_threads)

- 内建的队列换成基于MongoDB的新队列MongoQueue
- 由于队列内部实现中已经处理了重复URL的问题,因此不再需要seen变量;
- 在URL处理结束后调用complete()方法,用于记录该URL已经被成功解析。

import time
import urlparse
import threading
import multiprocessing
from mongo_cache import MongoCache
from mongo_queue import MongoQueue
from downloader import Downloader

SLEEP_TIME = 1


def process_crawler(args, **kwargs):
    """Run one threaded_crawler per CPU core, each in its own process.

    args: the single positional argument given to threaded_crawler
        (its seed URL)
    kwargs: keyword arguments forwarded unchanged to threaded_crawler

    Blocks until every child process has exited.
    """
    num_cpus = multiprocessing.cpu_count()
    print('Starting {} processes...'.format(num_cpus))
    processes = []
    for _ in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wu_being', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl from seed_url with worker threads sharing a MongoQueue.

    seed_url: first URL to download; also the base for resolving links
    delay, cache, user_agent, proxies, num_retries, timeout: passed
        unchanged to Downloader
    scrape_callback: optional callable ``(url, html)`` returning an iterable
        of links to enqueue (or a falsy value for none)
    max_threads: upper bound on concurrently running worker threads

    Returns None once the queue reports no unfinished URLs and every worker
    thread has stopped.
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    # NOTE(review): every caller clears the queue before seeding it; when
    # several processes run this function a late starter may wipe entries
    # others already pushed -- confirm whether that is acceptable.
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: process URLs until pop() reports none available."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                # if this call raises, complete() below is never reached
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
                # record that this URL has been handled
                crawl_queue.complete(url)

    # keep going while workers are running or the queue has unfinished URLs
    threads = []
    while threads or crawl_queue:
        # Drop finished workers by rebuilding the list. Removing items from
        # `threads` while iterating over it skips the entry that follows
        # each removal, leaving dead threads occupying slots.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # daemon threads let the main thread exit on ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Return link as an absolute URL with any #fragment removed.

    The fragment is stripped so that URLs differing only by fragment are
    treated as duplicates; the result is resolved against seed_url.
    """
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)

- PROCESSING:队列中取出准备下载时;
- COMPLETE:完成下载时。


from datetime import datetime, timedeltafrom pymongo import MongoClient, errorsclass MongoQueue:    """    >>> timeout = 1    >>> url = 'http://example.webscraping.com'    >>> q = MongoQueue(timeout=timeout)    >>> q.clear() # ensure empty queue    >>> q.push(url) # add test URL    >>> q.peek() == q.pop() == url # pop back this URL    True    >>> q.repair() # immediate repair will do nothin    >>> q.pop() # another pop should be empty    >>> q.peek()     >>> import time; time.sleep(timeout) # wait for timeout    >>> q.repair() # now repair will release URL    Released: test    >>> q.pop() == url # pop URL again    True    >>> bool(q) # queue is still active while outstanding    True    >>> q.complete(url) # complete this URL    >>> bool(q) # queue is not complete    False    """    # possible states of a download    OUTSTANDING, PROCESSING, COMPLETE = range(3)    def __init__(self, client=None, timeout=300):        """        host: the host to connect to MongoDB        port: the port to connect to MongoDB        timeout: the number of seconds to allow for a timeout        """        self.client = MongoClient() if client is None else client        self.db = self.client.cache        self.timeout = timeout    def __nonzero__(self):        """Returns True if there are more jobs to process        """        record = self.db.crawl_queue.find_one(            {'status': {'$ne': self.COMPLETE}}         )        return True if record else False    def push(self, url):        """Add new URL to queue if does not exist        """        try:            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})        except errors.DuplicateKeyError as e:            pass # this is already in the queue    def pop(self):        """Get an outstanding URL from the queue and set its status to processing.        If the queue is empty a KeyError exception is raised.        
"""        record = self.db.crawl_queue.find_and_modify(            query={'status': self.OUTSTANDING},             update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}        )        if record:            return record['_id']        else:            self.repair()            raise KeyError()    def peek(self):        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})        if record:            return record['_id']    def complete(self, url):        self.db.crawl_queue.update({'_id': url}, {'$set': {'status': self.COMPLETE}})    def repair(self):        """Release stalled jobs        """        record = self.db.crawl_queue.find_and_modify(            query={                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},                'status': {'$ne': self.COMPLETE}            },            update={'$set': {'status': self.OUTSTANDING}}        )        if record:            print 'Released:', record['_id']    def clear(self):        self.db.crawl_queue.drop()


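| 脚本 | 线程数 | 进程数 | 时间 | 与串行时间比 |
| --- | --- | --- | --- | --- |
| 串行 | 1 | 1 | | |
| 多线程 | 5 | 1 | | |
| 多线程 | 10 | 1 | | |
| 多线程 | 20 | 1 | | |
| 多进程 | 5 | 2 | | |
| 多进程 | 10 | 2 | | |
| 多进程 | 20 | 2 | | |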


Wu_Being 博客声明:本人博客欢迎转载,请标明博客原文和原链接!谢谢!

Wu_Being 吴兵博客接受赞助费二维码


0 0