Writing Crawlers in Python, Part 5: Multiprocess Crawler (Scraping 58.com Rental Listings)


The code in this post builds on the previous post, [Writing Crawlers in Python, Part 4: Multithreaded Crawler (Scraping 58.com Rental Listings)]: http://blog.csdn.net/apple9005/article/details/54971151

Multiprocess crawler

The script below downloads the Beijing 58.com rental index page (http://bj.58.com/chuzu/), collects the links to the listing detail pages, and then scrapes each detail page using 2 processes, each keeping up to 4 threads alive. Downloaded pages are cached in MongoDB, so a page that has already been fetched is read from the cache instead of being downloaded again.

#! /usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import lxml.html
import time
from lxml.cssselect import CSSSelector
from MongoCache import MongoCache
import threading
import multiprocessing


def download(url, user_agent='Google', num_retries=2):
    """Download a whole page."""
    print 'Downloading:', url
    # Set the user agent
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Downloading error:', e.reason
        html = None
        # Retry only on 5xx server errors, at most twice
        # (the original passed num_retries-1 as user_agent, which retried forever)
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html


def get_data(url, process_name):
    """Extract the data fields from a detail page."""
    print '------Process Name: %s-----------Thread Name: %s-------' % (
        process_name, threading.current_thread().getName())
    # Use the cached copy of the page if it exists; otherwise download and cache it
    cache = MongoCache()
    html_text_detail = cache[url]
    if not html_text_detail:
        html_text_detail = download(url)
        if not html_text_detail:
            print 'None:', url
            return
        cache[url] = html_text_detail
    else:
        print 'Exists:', url
    try:
        # Extract each field
        tree = lxml.html.fromstring(html_text_detail)
        house_title = CSSSelector('div.main-wrap > div.house-title > h1')
        house_pay_way1 = CSSSelector('div.house-pay-way > span:nth-child(1)')
        house_pay_way2 = CSSSelector('div.house-pay-way > span:nth-child(2)')
        print house_title(tree)[0].text_content()
        print '%s|%s' % (house_pay_way1(tree)[0].text_content(),
                         house_pay_way2(tree)[0].text_content())
        for i in range(7):
            for j in range(2):
                css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)' % (i + 1, j + 1)
                house_info = CSSSelector(css)
                print house_info(tree)[0].text_content().replace(' ', '')
    except TypeError as e:
        print 'HTML text error: %s' % e
    except IndexError as e:
        print 'Error extracting detail data: %s' % e


def get_url(html):
    """Collect the set of detail-page links to crawl."""
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > '
                      'ul.listUl > li > div.des > h2 > a')
    url_list = []
    for i in sel(tree):
        if i.get('href') not in url_list:
            url_list.append(i.get('href'))
    return url_list


def create_thread(url_list, process_name):
    """Keep starting threads, one per URL, until every URL has been crawled.
    active_count() includes this controlling thread, so at most 3 get_data
    threads run concurrently in each process (4 threads in total)."""
    while True:
        if threading.active_count() >= 4:
            time.sleep(1)
        else:
            lock.acquire()  # acquire the lock before touching the URL list
            if len(url_list) > 0:
                thr = threading.Thread(target=get_data,
                                       args=(url_list.pop(), process_name))
                lock.release()  # release the lock
                thr.start()
            else:
                lock.release()  # release the lock
                break


if __name__ == '__main__':
    url_index = 'http://bj.58.com/chuzu/'
    html_text_list = download(url_index)
    lock = threading.Lock()
    url_list = get_url(html_text_list)  # URLs still to be crawled
    processes = []
    for i in range(2):
        # Start 2 processes, each running at most 4 threads. Note: each child
        # process gets its own copy of url_list and lock, so both processes
        # walk the full list; the MongoDB cache is what prevents duplicate
        # downloads (see the shared-list sketch below).
        p = multiprocessing.Process(target=create_thread, args=(url_list, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
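The script imports MongoCache from the earlier posts in this series, which is not reproduced here. For completeness, below is a minimal sketch of a compatible cache, assuming pages are stored in a local MongoDB instance keyed by URL; the collection and field names (cache, webpage, html) are assumptions, not the original module. The one contract it must honor is returning a falsy value on a cache miss, because get_data() tests `if not cache[url]`.

# -*- coding:utf-8 -*-
from pymongo import MongoClient


class MongoCache(object):
    """Minimal URL -> HTML cache backed by MongoDB (sketch)."""

    def __init__(self, client=None):
        # Assumes a MongoDB server listening on localhost:27017
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache

    def __getitem__(self, url):
        # Return the cached HTML, or None if this URL has never been stored
        record = self.db.webpage.find_one({'_id': url})
        return record['html'] if record else None

    def __setitem__(self, url, html):
        # Upsert, so re-crawling a URL refreshes the cached copy
        self.db.webpage.update_one({'_id': url},
                                   {'$set': {'html': html}},
                                   upsert=True)

Using the page URL as the document _id gives a unique index for free, so both the lookup and the upsert stay cheap.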
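One caveat worth knowing: multiprocessing.Process gives each child its own copy of url_list and lock, so the two processes walk the same list and only the MongoDB cache keeps pages from being downloaded twice. If you want the processes to genuinely split the work, one option (a sketch, not part of the original post) is to share the list through a multiprocessing.Manager and pass a multiprocessing.Lock explicitly; create_thread would then take the signature (url_list, lock, process_name) and use the passed-in lock instead of the global one:

# Sketch: a genuinely shared work list across processes, assuming
# create_thread is changed to accept the lock as its second argument.
import multiprocessing

if __name__ == '__main__':
    html_text_list = download('http://bj.58.com/chuzu/')
    manager = multiprocessing.Manager()
    url_list = manager.list(get_url(html_text_list))  # proxy shared by all processes
    lock = multiprocessing.Lock()  # works across processes and their threads
    processes = []
    for i in range(2):
        p = multiprocessing.Process(target=create_thread, args=(url_list, lock, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

With the managed list, a URL popped by one process is gone for both, so each detail page is parsed exactly once rather than relying on the cache to absorb the duplication.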