Multiprocess Crawler


1. mongoqueue.py

from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    # possible states of a URL in the queue
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __nonzero__(self):
        """True while any URL is still waiting or being processed (Python 2 truth test)"""
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """Add a new URL to the queue if it does not already exist"""
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # this URL is already queued

    def pop(self):
        """Atomically claim an outstanding URL and mark it as being processed"""
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def complete(self, url):
        """Mark this URL as successfully crawled"""
        self.db.crawl_queue.update({'_id': url},
                                   {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """Release jobs stuck in PROCESSING for longer than the timeout"""
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() -
                              timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print 'Released', record['_id']
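This queue is what lets separate crawler processes share work: each URL is stored once (the URL itself is the document _id, so duplicate pushes are rejected by MongoDB), and its status moves OUTSTANDING → PROCESSING → COMPLETE. A minimal sketch of driving the queue by hand, assuming a MongoDB server on the default localhost port (the seed URL is only a placeholder):

from mongoqueue import MongoQueue

queue = MongoQueue(timeout=300)
queue.push('http://example.webscraping.com')  # stored as OUTSTANDING
queue.push('http://example.webscraping.com')  # duplicate _id -> silently ignored

url = queue.pop()      # atomically claims the record and marks it PROCESSING
# ... download and scrape `url` here ...
queue.complete(url)    # mark COMPLETE so no other process re-crawls it

print bool(queue)      # False once every record is COMPLETE (__nonzero__)
# records stuck in PROCESSING longer than `timeout` seconds are put back
# to OUTSTANDING by repair(), which pop() calls when nothing is claimable
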
2. process_crawler.py

# -*- coding: utf-8 -*-
import time
import threading
import urlparse
import robotparser
import csv
import re
import lxml.html

from downloader import Downloader
from mongoqueue import MongoQueue

SLEEP_TIME = 1
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


def process_crawler(seed_url, cache=None, delay=DEFAULT_DELAY, user_agent='wswp',
                    proxies=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                    sleep_time=SLEEP_TIME, max_threads=10, scrape_callback=None):
    # multi-threaded version kept the queue and the seen set in memory:
    # crawl_queue = [seed_url]
    # seen = set([seed_url])
    # multi-process version: the queue lives in MongoDB so every process shares it
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # multi-threaded version de-duplicated in memory:
                            # link = normalize(seed_url, link)
                            # # check whether already crawled this link
                            # if link not in seen:
                            #     seen.add(link)
                            #     # add this new link to queue
                            #     crawl_queue.append(link)
                            # multi-process version: MongoQueue de-duplicates by _id
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                # remove stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # so the main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        # give the download threads a chance to run
        time.sleep(sleep_time)


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))  # absolute link
    rp.read()
    return rp


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    # urldefrag(url) splits the url into (url without fragment, fragment)
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)  # absolute link


def same_domain(url1, url2):
    """Return True if both URLs belong to the same domain
    """
    # urlparse() splits a url into a 6-tuple of components; compare the hosts
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_links(html):
    """Return a list of links from html
    """
    # re.compile() turns the regular expression string into a Pattern object,
    # which is used to extract every href value from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            # one CSV row per country page
            self.writer.writerow(row)
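The listing imports Downloader from a downloader module that is not included in this post. The code above only relies on two behaviours: constructing the object with the keyword arguments shown, and calling D(url) to get the page HTML. A minimal stand-in under those assumptions (the real class would also handle caching, throttling and proxies):

# downloader.py -- a minimal stand-in (an assumption, not the original module);
# process_crawler only needs Downloader(...) for construction and D(url) -> html
import urllib2


class Downloader:
    def __init__(self, cache=None, delay=5, user_agent='wswp', proxies=None,
                 num_retries=1, timeout=60):
        # keep only the settings this sketch actually uses
        self.user_agent = user_agent
        self.num_retries = num_retries
        self.timeout = timeout

    def __call__(self, url):
        request = urllib2.Request(url, headers={'User-agent': self.user_agent})
        html = ''
        for _ in range(self.num_retries + 1):
            try:
                html = urllib2.urlopen(request, timeout=self.timeout).read()
                break
            except urllib2.URLError:
                html = ''
        return html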

3. multiprocessing.py
# note: save this under a name other than multiprocessing.py, otherwise the
# import below would shadow the standard-library multiprocessing module
import multiprocessing
from threaded_crawler import threaded_crawler


def process_link_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        # each process runs its own threaded crawler; the processes share
        # work through the MongoDB-backed crawl queue
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()
        processes.append(p)
    # wait for all crawler processes to finish
    for p in processes:
        p.join()
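A hypothetical launch of this multi-process entry point, mirroring the arguments the test script in the next section passes; it assumes the file above is saved under a non-conflicting name such as process_link_crawler.py and that threaded_crawler accepts the same keyword arguments as process_crawler:

from process_link_crawler import process_link_crawler  # assumed module name
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback

if __name__ == '__main__':
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # one crawler process per CPU core, all sharing the MongoDB crawl queue
    process_link_crawler(scrape_callback.seed_url,
                         scrape_callback=scrape_callback,
                         cache=cache, max_threads=5, timeout=10)
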
4. process_test.py
import sys

from process_crawler import process_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                    cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    # hard-code sys.argv to simulate:  $ time python process_test.py 5
    sys.argv = ['$ time python process_test.py', 5]
    max_threads = int(sys.argv[1])
    main(max_threads)
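The __main__ block overwrites sys.argv to simulate running time python process_test.py 5. To take the thread count from the real command line instead, a small variant (an assumption, not part of the original script) would be:

if __name__ == '__main__':
    # e.g.  time python process_test.py 5
    max_threads = int(sys.argv[1]) if len(sys.argv) > 1 else 5
    main(max_threads)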

