Python crawler: creating a database connection pool in the Pipeline to batch-write to MySQL


The parameters of the connection pool (from the PooledDB constructor) are described below:

    def __init__(self, creator,
            mincached=0, maxcached=0,
            maxshared=0, maxconnections=0, blocking=False,
            maxusage=None, setsession=None, reset=True,
            failures=None, ping=1,
            *args, **kwargs):
        """Set up the DB-API 2 connection pool.

        creator: either an arbitrary function returning new DB-API 2
            connection objects or a DB-API 2 compliant database module
        mincached: initial number of idle connections in the pool
            (0 means no connections are made at startup)
        maxcached: maximum number of idle connections in the pool
            (0 or None means unlimited pool size)
        maxshared: maximum number of shared connections
            (0 or None means all connections are dedicated)
            When this maximum number is reached, connections are
            shared if they have been requested as shareable.
        maxconnections: maximum number of connections generally allowed
            (0 or None means an arbitrary number of connections)
        blocking: determines behavior when exceeding the maximum
            (if this is set to true, block and wait until the number of
            connections decreases, otherwise an error will be reported)
        maxusage: maximum number of reuses of a single connection
            (0 or None means unlimited reuse)
            When this maximum usage number of the connection is reached,
            the connection is automatically reset (closed and reopened).
        setsession: optional list of SQL commands that may serve to prepare
            the session, e.g. ["set datestyle to ...", "set time zone ..."]
        reset: how connections should be reset when returned to the pool
            (False or None to rollback transcations started with begin(),
            True to always issue a rollback for safety's sake)
        failures: an optional exception class or a tuple of exception classes
            for which the connection failover mechanism shall be applied,
            if the default (OperationalError, InternalError) is not adequate
        ping: determines when the connection should be checked with ping()
            (0 = None = never, 1 = default = whenever fetched from the pool,
            2 = when a cursor is created, 4 = when a query is executed,
            7 = always, and all other bit combinations of these values)
        args, kwargs: the parameters that shall be passed to the creator
            function or the connection constructor of the DB-API 2 module
        """
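As a quick illustration of these parameters, a pool for a local MySQL database could be built roughly as follows. This is only a sketch: the host, credentials, database name and the mincached/maxcached/maxconnections values are placeholders, not settings taken from the article.

import MySQLdb
from DBUtils.PooledDB import PooledDB

# Keep a couple of idle connections ready and ping a connection
# whenever it is fetched from the pool (ping=1 is the default).
pool = PooledDB(creator=MySQLdb, mincached=2, maxcached=5,
                maxconnections=10, blocking=True, ping=1,
                host='localhost', user='root', passwd='root',
                db='vboxdb', port=3306, charset='utf8')

conn = pool.connection()   # borrow a connection from the pool
cursor = conn.cursor()
cursor.execute("SELECT 1")
cursor.close()
conn.close()               # returns the connection to the pool rather than really closing it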

The pipeline code below writes rows in batches of 1,000. The connection pool is created when the spider opens; when the spider closes, any remaining rows that do not make up a full batch are written to the database and the connection is closed:

# -*- coding: utf-8 -*-
import sys
import MySQLdb

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

from DBUtils.PooledDB import PooledDB


class MySQLStorePipeline(object):
    # Buffer of rows waiting to be flushed to MySQL
    pgs = []

    def open_spider(self, spider):
        self.pool = PooledDB(creator=MySQLdb, maxcached=5, maxshared=5,
                             host='localhost', user='root', passwd='root',
                             db='vboxdb', port=3306, charset="utf8")
        self.conn = self.pool.connection()
        self.cursor = self.conn.cursor()
        # Clear the table before crawling
        self.cursor.execute("truncate table epg")
        self.conn.commit()

    # Batch insert
    def bulk_insert_to_mysql(self, bulkdata):
        try:
            print "inserting data in batch--->>>>>", len(self.pgs)
            sql = """INSERT INTO epg (date, channel, time, pg) VALUES (%s, %s, %s, %s)"""
            self.cursor.executemany(sql, bulkdata)
            self.conn.commit()
        except:
            self.conn.rollback()

    def process_item(self, item, spider):
        # print item['date'], item['channel'], item['time'], item['pg']
        self.pgs.append((item['date'], item['channel'], item['time'], item['pg']))
        if len(self.pgs) == 1000:
            self.bulk_insert_to_mysql(self.pgs)
            # Clear the buffer
            del self.pgs[:]
        return item

    # Called when the spider finishes: flush the incomplete batch and clean up
    def close_spider(self, spider):
        print "closing spider, last commit", len(self.pgs)
        self.bulk_insert_to_mysql(self.pgs)
        self.conn.commit()
        self.cursor.close()
        self.conn.close()
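For Scrapy to actually call this pipeline, it still needs to be enabled in the project's settings.py. A minimal sketch, assuming the project package is called myproject and the class above lives in myproject/pipelines.py (both names are placeholders, not taken from the article):

# settings.py
ITEM_PIPELINES = {
    # lower numbers run earlier when several pipelines are enabled
    'myproject.pipelines.MySQLStorePipeline': 300,
}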