第1.5章 scrapy之pipelines

来源:互联网 发布:五年高考三年模拟淘宝 编辑:程序博客网 时间:2024/05/16 08:10

下面的代码是结合pandas和sqlalchemy将数据写入到mysql数据库中。

# -*- coding: utf-8 -*-
# Scrapy item pipeline: cleans/persists scraped items by appending them to MySQL.
from eie.middlewares import udf_config
from sqlalchemy.engine import create_engine
import pandas as pd
from eie import settings
import threading
from scrapy.exceptions import DropItem

logger = udf_config.logger

# One shared engine with a connection pool; credentials come from project settings.
engine = create_engine(
    'mysql+mysqldb://{}:{}@{}:3306/{}'.format(
        settings.MYSQL_USER, settings.MYSQL_PASSWD,
        settings.MYSQL_HOST, settings.MYSQL_DBNAME),
    connect_args={'charset': 'utf8'},
    pool_size=settings.MYSQL_POOL_SIZE)

# NOTE(review): lock is defined but never acquired in this snippet — confirm intent.
mutex = threading.Lock()


class EiePipeline(object):
    """Item pipeline that appends each scraped item to the `eie_ip` MySQL table."""

    def process_item(self, item, spider):
        """Persist *item* to MySQL via pandas ``DataFrame.to_sql``.

        Raises DropItem (which Scrapy treats as discarding the item) when the
        insert fails; otherwise returns the item unchanged so downstream
        pipelines still receive it.
        """
        df = pd.DataFrame([item])
        logger.debug(df)
        try:
            df.to_sql('eie_ip', engine, if_exists='append', index=False)
        except Exception as e:  # bug fix: `except Exception, e` is Python-2-only syntax
            raise DropItem('insert to mysql error! %s, %s' % (item, e))
        return item

    def close_spider(self, spider):
        # Nothing to release here; the engine's connection pool is module-global.
        pass
原创粉丝点击