Python爬虫数据写入操作

来源：互联网 | 发布：阿里云美国服务器翻墙 | 编辑：程序博客网 | 时间：2024/06/03 18:40
Python Scrapy爬虫数据写入操作在我们写完一个爬虫项目，得到了一大堆的数据，为了以后的分析和使用，我们需要把我们得到的数据进行保存。保存数据的方式主要有：保存到数据库，保存到CSV文件，保存为JSON文件。保存到数据库中分为同步和异步的方式：一般小的数据，我们可以选择，同步保存数据库：首先先准备好数据库文件和数据表操作在pipelines.py文件中完成将item写入数据库import MySQLdbclass MysqlPipeine(object):    def __init__(self):        self.conn = MySQLdb.connect(                    host = 'localhost',                    # mysql默认端口号3306                    port = 3306,                    user = 'root',                    passwd = '123456',                    db = 'db_name',                    use_unicode = True,                    charset = 'utf8'        )        self.cursor = self.conn.cursor()            # 处理item的函数    def process_item(self, item, spider):        # 准备sql语句        sql = 'insert into table_name(字段名)VALUES (字段数据,即item对象)'        self.cursor.execute(sql)        self.conn.commit()        def close_spider(self, spider):        self.cursor.close()        self.conn.close()如果数据量较大，异步写入数据库会更高效：同样操作在我们的pipelines文件中，但在操作之前，需要配置我们的setting文件，在其中添加我们的数据# 自己配置一些项目信息# 数据库配置MYSQL_HOST = '127.0.0.1'MYSQL_PORT = 3306MYSQL_USER = 'root'MYSQL_PASSWD = '123456'MYSQL_CHARSET = 'utf8'MYSQL_DBNAME = 'db_name'然后在pipelines中进行我们异步写入数据库的操作：# 将item写入数据库import MySQLdbclass MysqlPipeine(object):    def __init__(self):        self.conn = MySQLdb.connect(                host = 'localhost',                # mysql默认端口号3306                port = 3306,                user = 'root',                passwd = '123456',                db = 'db_name',                use_unicode = True,                charset = 'utf8'    )        self.cursor = self.conn.cursor()    # 处理item的函数    def process_item(self, item, spider):    # 准备sql语句        sql = 'insert into table_name(字段名)VALUES (字段数据,即item对象)'        self.cursor.execute(sql)        self.conn.commit()    def close_spider(self, spider):        self.cursor.close()        self.conn.close()如果数据量较大，异步写入数据库会更高效：同样操作在我们的pipelines文件中，但在操作之前，需要配置我们的setting文件，在其中添加我们的数据# 自己配置一些项目信息# 数据库配置MYSQL_HOST = '127.0.0.1'MYSQL_PORT = 
3306MYSQL_USER = 'root'MYSQL_PASSWD = '123456'MYSQL_CHARSET = 'utf8'MYSQL_DBNAME = 'db_name'然后在pipelines中进行我们异步写入数据库的操作：from twisted.enterprise import adbapifrom MySQLdb import cursorsclass MysqlTwistedPipeline(object):    @classmethod    # 这个函数会自动调用    def from_settings(cls, settings):        # 准备好连接数据库需要的参数        db_params = dict(            host=settings["MYSQL_HOST"],            port=settings["MYSQL_PORT"],            user=settings["MYSQL_USER"],            passwd=settings["MYSQL_PASSWD"],            charset=settings["MYSQL_CHARSET"],            db=settings["MYSQL_DBNAME"],            use_unicode=True,            # 指定游标类型            cursorclass=cursors.DictCursor        )        # 创建连接池        # 1.要连接的名称  2.连接需要的参数        db_pool = adbapi.ConnectionPool('MySQLdb', **db_params)        # 返回当前类的对象，并且把db_pool赋值给该类的对象        return cls(db_pool)    def __init__(self, db_pool):        # 赋值        self.db_pool = db_pool        # 处理item函数    def process_item(self, item, spider):        # 把要处理的事件进行异步处理        # 1.要处理的事件函数        # 2.事件函数需要的参数        query = self.db_pool.runInteraction(self.do_insert, item)        # 执行sql出现错误信息        query.addErrback(self.handle_error, item, spider)        # 错误的原因    def handle_error(self, failure, item, spider):        print failure    # 处理插入数据库的操作    # cursor该函数是连接数据库的函数，并且放在异步去执行，cursor执行sql语句    def do_insert(self, cursor, item):        # 1.准备sql语句        sql = 'insert into table_name(表中各个字段名)VALUES (各个字段对应的数据item)'        # 2.用cursor游标执行sql        cursor.execute(sql)写入CSV文件的操作：同样操作在我们的pipelines.py文件中进行# 写入csv文件import csvimport codecsclass SaveCSVFile(object):    def __init__(self):        file_handle = codecs.open('name.csv', 'w', encoding='utf-8')        # 1. 
创建csv文件        self.csv = csv.writer(file_handle)        self.csv.writerow(表头的信息)    def process_item(self, item, spider):        self.csv.writerow(表头信息所对应的每个item对象)        return item    def __del__(self):        # 关闭文件        self.file_handle.close()写入json文件：import jsonclass JsonPipeline(object):    def __init__(self):        self.file_handle = codecs.open('name.json', 'w', encoding='utf-8')    def process_item(self, item, spider):        data = json.dumps(dict(item), ensure_ascii=False) + "\n"        self.file.write(data)        return item    def __del__(self):        # 关闭文件        self.file_handle.close()在我们完成pipelines中的函数重写后，一定要记得在settings文件中（通过ITEM_PIPELINES）进行配置，以确保其能正常执行。如果同时在pipelines中进行多种方式的写入的话，切记要在process_item函数中返回item，即return item。还有一种简单的保存csv / json文件的方法：在编辑器命令行输入：scrapy crawl spider -o name.json（或 name.csv） -s FEED_EXPORT_ENCODING=UTF8。其中FEED_EXPORT_ENCODING = UTF8用于保证文件中的中文正常显示。
阅读全文
1 0