Python Crawlers, Advanced (Part 4): Multithreading and Multiprocessing
I. Multithreading
Multithreading basics:
The following two articles are the main references:
http://www.cnblogs.com/qq1207501666/p/6709902.html
http://python.jobbole.com/81546/
(1) The complexity multithreading adds
1. Safety of shared resources and data: protect them with locks
2. Atomicity: only a genuinely atomic operation is inherently mutually exclusive; anything larger needs explicit protection
3. Synchronized waiting: wait() / notify() / notifyAll() (a minimal Lock/Condition sketch follows this list)
4. Deadlock: threads holding locks on resources that the others need can lock each other up for good
5. Fault tolerance: a fatal error in any thread can bring the whole process down
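To make items 1 and 3 concrete, here is a minimal sketch (not part of the crawler below; the names counter, producer and consumer are only illustrative) of a threading.Lock protecting shared data and a threading.Condition used for wait/notify:

import threading

counter = 0
counter_lock = threading.Lock()       # protects the shared counter (item 1)

def worker():
    global counter
    for _ in range(100000):
        with counter_lock:            # without the lock, += is not atomic and updates get lost
            counter += 1

condition = threading.Condition()
items = []

def producer():
    with condition:
        items.append('task')
        condition.notify()            # wake up one waiting consumer (item 3)

def consumer():
    with condition:
        while not items:
            condition.wait()          # release the lock and sleep until notified
        print('got', items.pop())

threads = [threading.Thread(target=worker) for _ in range(4)]
threads += [threading.Thread(target=consumer), threading.Thread(target=producer)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print('counter =', counter)           # 400000 only because the lock is in place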
(2) The advantages of multithreading
1. Threads share the process's memory, so exchanging data between them is fast
2. Better CPU utilisation
3. Convenient to develop
4. Lightweight: creating and destroying threads is cheap
(3) Implementing a multithreaded crawler
1. Create a thread pool: threads = []
2. Use a thread-safe URL queue: Queue / deque (a short queue.Queue sketch follows this list)
3. Take a URL off the queue (pop()/get()) and start a threading.Thread to crawl it
4. If the pool is full, loop and wait until some thread finishes: t.is_alive()
5. Remove threads whose download has finished from the pool: threads.remove(t)
6. When every URL of the current level has been dequeued, t.join() waits for all threads to finish, and then the next level is crawled
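As mentioned in step 2, the standard library's queue.Queue is already thread-safe, so it can stand in for the manual thread list used in the full code below. The following is only a sketch of that alternative pattern; fetch and the worker count are placeholders, not part of the original crawler:

import queue
import threading

url_queue = queue.Queue()             # thread-safe: put()/get() use internal locking
url_queue.put('http://www.mafengwo.cn')

def fetch(url):
    print('downloading', url)         # placeholder for the real download logic

def worker():
    while True:
        url = url_queue.get()         # blocks until a URL is available
        try:
            fetch(url)
        finally:
            url_queue.task_done()     # lets url_queue.join() know this item is finished

for _ in range(4):                    # a fixed-size "thread pool"
    t = threading.Thread(target=worker, daemon=True)
    t.start()

url_queue.join()                      # wait until every queued URL has been processed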
The full crawler:
import os
import time
import hashlib
import threading
import http.client
import urllib.request
from collections import deque

from lxml import etree
# pybloom provides the Bloom filters used for URL dedup (on Python 3 the pybloom-live fork may be needed)
from pybloom import BloomFilter


class CrawlBSF:
    request_headers = {
        'host': "www.mafengwo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
    }

    cur_level = 0
    max_level = 5
    dir_name = 'iterate/'
    iter_width = 50
    downloaded_urls = []

    du_md5_file_name = dir_name + 'download.txt'
    du_url_file_name = dir_name + 'urls.txt'

    bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01)
    bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01)

    cur_queue = deque()
    child_queue = deque()

    def __init__(self, url):
        self.root_url = url
        self.cur_queue.append(url)
        os.makedirs(self.dir_name, exist_ok=True)   # make sure the output directory exists
        self.du_file = open(self.du_url_file_name, 'a+')
        try:
            self.dumd5_file = open(self.du_md5_file_name, 'r')
            self.downloaded_urls = self.dumd5_file.readlines()
            self.dumd5_file.close()
            for urlmd5 in self.downloaded_urls:
                self.bloom_downloaded_urls.add(urlmd5[:-2])
        except IOError:
            print("File not found")
        finally:
            self.dumd5_file = open(self.du_md5_file_name, 'a+')

    def enqueueUrl(self, url):
        # only queue urls that are neither queued already nor downloaded before
        if url not in self.bloom_url_queue and hashlib.md5(url.encode('gb2312')).hexdigest() not in self.bloom_downloaded_urls:
            self.child_queue.append(url)
            self.bloom_url_queue.add(url)

    def dequeueUrl(self):
        try:
            url = self.cur_queue.popleft()
            return url
        except IndexError:
            return None

    def close(self):
        self.dumd5_file.close()
        self.du_file.close()


num_downloaded_pages = 0


# download the page content
def get_page_content(cur_url):
    global num_downloaded_pages
    print("downloading %s at level %d" % (cur_url, crawler.cur_level))
    try:
        req = urllib.request.Request(cur_url, headers=crawler.request_headers)
        response = urllib.request.urlopen(req)
        html_page = response.read()
        filename = cur_url[7:].replace('/', '_')
        fo = open("%s%s.html" % (crawler.dir_name, filename), 'wb+')
        fo.write(html_page)
        fo.close()
    except urllib.request.HTTPError as Arguments:
        print(Arguments)
        return
    except http.client.BadStatusLine as Arguments:
        print(Arguments)
        return
    except IOError as Arguments:
        print(Arguments)
        return
    except Exception as Arguments:
        print(Arguments)
        return

    # record the saved page and update the bloom filter
    dumd5 = hashlib.md5(cur_url.encode('gb2312')).hexdigest()
    crawler.downloaded_urls.append(dumd5)
    crawler.dumd5_file.write(dumd5 + '\r\n')
    crawler.du_file.write(cur_url + '\r\n')
    crawler.bloom_downloaded_urls.add(dumd5)
    # note: this counter is updated from several threads without a lock, so the final count may drift slightly
    num_downloaded_pages += 1

    # extract links and queue them for the next level
    html = etree.HTML(html_page.lower().decode('utf-8'))
    hrefs = html.xpath(u"//a")
    for href in hrefs:
        try:
            if 'href' in href.attrib:
                val = href.attrib['href']
                if val.find('javascript:') != -1:
                    continue
                if val.startswith('http://') is False:
                    if val.startswith('/'):
                        val = 'http://www.mafengwo.cn' + val
                    else:
                        continue
                if val[-1] == '/':
                    val = val[0:-1]
                crawler.enqueueUrl(val)
        except ValueError:
            continue


crawler = CrawlBSF("http://www.mafengwo.cn")
start_time = time.time()

# the first page (the start url) is downloaded synchronously in the main thread;
# every following page is crawled asynchronously in a child thread
is_root_page = True
threads = []
max_threads = 10
CRAWL_DELAY = 0.6

while True:
    url = crawler.dequeueUrl()
    # go on to the next level; before that, wait until all crawling on the current level is done
    if url is None:
        crawler.cur_level += 1
        for t in threads:
            t.join()
        if crawler.cur_level == crawler.max_level:
            break
        if len(crawler.child_queue) == 0:
            break
        crawler.cur_queue = crawler.child_queue
        crawler.child_queue = deque()
        continue

    if is_root_page is True:
        get_page_content(url)
        is_root_page = False
    else:
        # look for a free slot in the thread pool
        while True:
            # first drop all threads that have finished running
            threads = [t for t in threads if t.is_alive()]
            if len(threads) >= max_threads:
                time.sleep(CRAWL_DELAY)
                continue
            try:
                t = threading.Thread(target=get_page_content, name=None, args=(url,))
                threads.append(t)
                # daemon threads let the main thread exit on ctrl-c
                t.daemon = True
                t.start()
                time.sleep(CRAWL_DELAY)
                break
            except Exception:
                print("Error: unable to start thread")

print('%d pages downloaded, time cost %0.2f seconds' % (num_downloaded_pages, time.time() - start_time))
(4) Evaluating the multithreaded crawler
Strengths:
1. Uses CPU time effectively
2. Greatly reduces the impact of download errors and blocking on crawl speed, raising overall throughput
3. For sites with no anti-crawling limits, download speed can increase many times over
Limitations:
1. For sites with anti-crawling measures, the speedup is limited
2. Higher complexity and more demanding coding
3. The more threads there are, the less time each thread gets, and thread switching adds extra overhead
4. Threads compete harder for shared resources
Even so, for large websites multithreading is still worth using, and in practice has to be used, despite their anti-crawling measures, because it speeds crawling up considerably.
Large sites generally run servers in several regions (Beijing, Shanghai, Chongqing, and so on), and different threads of the same crawler hitting servers in different regions do not interfere with one another.
II. Multiprocessing
A very detailed multiprocessing tutorial for reference:
http://www.cnblogs.com/smallmars/p/7093603.html
(1) Evaluating a multiprocess crawler
Goals:
1. Control the number of threads
2. Isolate groups of threads from one another to reduce resource contention
3. In some environments, use several IPs on a single machine as a disguise
Limitations:
1. It cannot break through the network bandwidth bottleneck
2. On a single machine with a single IP it is pointless
3. Exchanging data between processes is more expensive
(2) Building a multiprocess crawler
Client/server model
1. One server process enqueues and dequeues URLs; enqueueing checks whether the URL has already been downloaded
2. It also monitors the current crawl status and progress
3. Several crawl processes fetch URLs from the server process and send newly found URLs back to it
4. Sockets are used for the IPC (a minimal sketch follows this list)
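The article gives no code for this client/server variant, so the following is only a sketch of the idea. It uses multiprocessing.managers.BaseManager, which exposes ordinary queue.Queue objects over a TCP socket, in place of a hand-written socket protocol; the file names, port 50000, the authkey and the seed URL are all illustrative:

# queue_server.py -- the single server process that owns the URL queues (items 1 and 2)
from multiprocessing.managers import BaseManager
import queue

task_queue = queue.Queue()      # URLs waiting to be crawled
result_queue = queue.Queue()    # new URLs reported back by the crawl processes

class QueueManager(BaseManager):
    pass

# expose the two queues over a TCP socket (the IPC channel of item 4)
QueueManager.register('get_task_queue', callable=lambda: task_queue)
QueueManager.register('get_result_queue', callable=lambda: result_queue)

if __name__ == '__main__':
    task_queue.put('http://www.mafengwo.cn')
    manager = QueueManager(address=('127.0.0.1', 50000), authkey=b'crawl')
    server = manager.get_server()
    server.serve_forever()

# crawl_worker.py -- run any number of these as separate processes (item 3)
from multiprocessing.managers import BaseManager

class QueueManager(BaseManager):
    pass

QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')

if __name__ == '__main__':
    manager = QueueManager(address=('127.0.0.1', 50000), authkey=b'crawl')
    manager.connect()
    tasks = manager.get_task_queue()
    results = manager.get_result_queue()
    url = tasks.get()                        # pull a URL from the server process
    print('crawling', url)                   # placeholder for the real download logic
    results.put('http://www.mafengwo.cn/i')  # report a newly discovered URL (placeholder)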
Database model
1. The crawl list is read and written through a database
2. Several crawl processes fetch and add URLs purely through database operations
(3) Client/server vs. database
Client/server:
Fast: adding, updating and querying are in-memory bit operations
Easy to extend, for example reordering the URL queue on the fly
Database:
Convenient to develop: a database comes with read/write protection and can serve as the IPC channel out of the box
Only the crawler program itself has to be written
Use MySQLConnectionPool to manage the MySQL connections shared by the crawling threads:
self.cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name="mypool",
                                                           pool_size=max_num_thread,
                                                           **dbconfig)

con = self.cnxpool.get_connection()
cursor = con.cursor()
The code:
Creating the database table and managing connections across threads:
dbmanager.py
import hashlib

import mysql.connector
from mysql.connector import errorcode


class CrawlDatabaseManager:

    DB_NAME = 'mfw_pro_crawl'
    SERVER_IP = 'localhost'

    TABLES = {}
    # the url queue lives in a single table
    TABLES['urls'] = (
        "CREATE TABLE `urls` ("
        "  `index` int(11) NOT NULL AUTO_INCREMENT,"          # position in the queue
        "  `url` varchar(512) NOT NULL,"
        "  `md5` varchar(32) NOT NULL,"                       # an md5 hexdigest is 32 characters
        "  `status` varchar(11) NOT NULL DEFAULT 'new',"      # new, downloading or done
        "  `depth` int(11) NOT NULL,"
        "  `queue_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,"
        "  `done_time` timestamp NOT NULL DEFAULT 0 ON UPDATE CURRENT_TIMESTAMP,"
        "  PRIMARY KEY (`index`),"
        "  UNIQUE KEY `md5` (`md5`)"                          # duplicate urls are rejected by this key
        ") ENGINE=InnoDB")

    def __init__(self, max_num_thread):
        # connect to the mysql server
        try:
            cnx = mysql.connector.connect(host=self.SERVER_IP, user='root')
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
                print("Something is wrong with your user name or password")
            elif err.errno == errorcode.ER_BAD_DB_ERROR:
                print("Database does not exist")
            else:
                print('Create Error ' + err.msg)
            exit(1)

        cursor = cnx.cursor()

        # switch to the database, creating it (and the table) if it does not exist yet
        try:
            cnx.database = self.DB_NAME
        except mysql.connector.Error as err:
            if err.errno == errorcode.ER_BAD_DB_ERROR:
                self.create_database(cursor)
                cnx.database = self.DB_NAME
                self.create_tables(cursor)
            else:
                print(err)
                exit(1)
        finally:
            cursor.close()
            cnx.close()

        dbconfig = {
            "database": self.DB_NAME,
            "user": "root",
            "host": self.SERVER_IP,
        }
        # one pooled connection per crawling thread
        self.cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name="mypool",
                                                                   pool_size=max_num_thread,
                                                                   **dbconfig)

    # create the database
    def create_database(self, cursor):
        try:
            cursor.execute(
                "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(self.DB_NAME))
        except mysql.connector.Error as err:
            print("Failed creating database: {}".format(err))
            exit(1)

    def create_tables(self, cursor):
        for name, ddl in self.TABLES.items():
            try:
                cursor.execute(ddl)
            except mysql.connector.Error as err:
                if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
                    print('create tables error ALREADY EXISTS')
                else:
                    print('create tables error ' + err.msg)
            else:
                print('Tables created')

    # put a url into the queue
    def enqueueUrl(self, url, depth):
        con = self.cnxpool.get_connection()
        cursor = con.cursor()
        try:
            add_url = "INSERT INTO urls (url, md5, depth) VALUES (%s, %s, %s)"
            data_url = (url, hashlib.md5(url.encode('utf-8')).hexdigest(), depth)
            cursor.execute(add_url, data_url)
            # commit this transaction; see "mysql transaction" for more info
            con.commit()
        except mysql.connector.Error:
            # a duplicate md5 (url already queued) ends up here
            return
        finally:
            cursor.close()
            con.close()

    # get a url from the queue
    def dequeueUrl(self):
        con = self.cnxpool.get_connection()
        cursor = con.cursor(dictionary=True)
        try:
            # SELECT ... FOR UPDATE locks the row so two threads cannot grab the same url
            query = ("SELECT `index`, `url`, `depth` FROM urls "
                     "WHERE status='new' ORDER BY `index` ASC LIMIT 1 FOR UPDATE")
            cursor.execute(query)
            row = cursor.fetchone()
            if row is None:
                return None
            update_query = "UPDATE urls SET `status`='downloading' WHERE `index`=%d" % (row['index'])
            cursor.execute(update_query)
            con.commit()
            return row
        except mysql.connector.Error:
            return None
        finally:
            cursor.close()
            con.close()

    def finishUrl(self, index):
        con = self.cnxpool.get_connection()
        cursor = con.cursor()
        try:
            # done_time has ON UPDATE CURRENT_TIMESTAMP, so it is refreshed automatically
            update_query = "UPDATE urls SET `status`='done' WHERE `index`=%d" % (index)
            cursor.execute(update_query)
            con.commit()
        except mysql.connector.Error:
            return
        finally:
            cursor.close()
            con.close()
The main program: the actual crawling, and the multiprocess side of things (several copies of this script can run as separate processes, all sharing the MySQL queue):
multi_process.py
import os
import time
import threading
import http.client
import urllib.error
import urllib.request

from lxml import etree

from dbmanager import CrawlDatabaseManager

request_headers = {
    'host': "www.mafengwo.cn",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
}


def get_page_content(cur_url, index, depth):
    print("downloading %s at level %d" % (cur_url, depth))
    try:
        req = urllib.request.Request(cur_url, headers=request_headers)
        response = urllib.request.urlopen(req)
        html_page = response.read()
        filename = cur_url[7:].replace('/', '_')
        fo = open("%s%s.html" % (dir_name, filename), 'wb+')
        fo.write(html_page)
        fo.close()
        dbmanager.finishUrl(index)
    except urllib.error.HTTPError as Arguments:
        print(Arguments)
        return
    except http.client.BadStatusLine as Arguments:
        print(Arguments)
        return
    except IOError as Arguments:
        print(Arguments)
        return
    except Exception as Arguments:
        print(Arguments)
        return

    # extract links and push them onto the shared queue in the database
    html = etree.HTML(html_page.lower().decode('utf-8'))
    hrefs = html.xpath(u"//a")
    for href in hrefs:
        try:
            if 'href' in href.attrib:
                val = href.attrib['href']
                if val.find('javascript:') != -1:
                    continue
                if val.startswith('http://') is False:
                    if val.startswith('/'):
                        val = 'http://www.mafengwo.cn' + val
                    else:
                        continue
                if val[-1] == '/':
                    val = val[0:-1]
                dbmanager.enqueueUrl(val, depth + 1)
        except ValueError:
            continue


max_num_thread = 5

# create the MySQL database manager, which acts as the crawling queue shared by every process
dbmanager = CrawlDatabaseManager(max_num_thread)

# dir for saving HTML files
dir_name = 'dir_process/'
os.makedirs(dir_name, exist_ok=True)

# put the first page into the queue
dbmanager.enqueueUrl("http://www.mafengwo.cn", 0)

start_time = time.time()
is_root_page = True
threads = []

# delay before a new crawling thread is created,
# to control the crawling rate and avoid visiting the target website too frequently
CRAWL_DELAY = 0.6

while True:
    curtask = dbmanager.dequeueUrl()
    # queue drained: wait for the running threads, then exit
    if curtask is None:
        for t in threads:
            t.join()
        break

    # the first page is crawled synchronously in the main thread, the rest in child threads
    if is_root_page is True:
        get_page_content(curtask['url'], curtask['index'], curtask['depth'])
        is_root_page = False
    else:
        while True:
            # first drop all threads that have finished running
            threads = [t for t in threads if t.is_alive()]
            if len(threads) >= max_num_thread:
                time.sleep(CRAWL_DELAY)
                continue
            try:
                t = threading.Thread(target=get_page_content, name=None,
                                     args=(curtask['url'], curtask['index'], curtask['depth']))
                threads.append(t)
                # daemon threads let the main thread exit on ctrl-c
                t.daemon = True
                t.start()
                time.sleep(CRAWL_DELAY)
                break
            except Exception:
                print("Error: unable to start thread")
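multi_process.py itself only runs a pool of threads; the "multiprocess" part comes from starting several copies of it, which coordinate through the shared urls table in MySQL. As a purely illustrative sketch (launcher.py and the process count are not part of the original material), one way to start them on a single machine is:

# launcher.py -- start several crawl processes that share the MySQL url queue
import subprocess
import sys

NUM_PROCESSES = 3

procs = [subprocess.Popen([sys.executable, 'multi_process.py'])
         for _ in range(NUM_PROCESSES)]

for p in procs:
    p.wait()    # each process exits once it sees the shared url queue as drained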