Scrapy Usage Notes


Link to the official documentation

1. Item definitions

import scrapy

class HistoryItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()   # project id
    name = scrapy.Field()  # project name

class ForecastItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()   # project id
    name = scrapy.Field()  # project name
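A Scrapy Item behaves like a dict, so fields are set and read with the usual subscript syntax. A minimal sketch (the pid and name values are made up for illustration):

item = HistoryItem()
item["pid"] = 1001              # fields must be declared in the Item class, otherwise KeyError
item["name"] = u"sample project"
print(dict(item))               # items convert cleanly to a plain dict, handy for JSON/DB pipelines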

2. File encoding

# coding=utf-8
# at the top of the file
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
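The reload/setdefaultencoding trick is Python 2 only and changes the encoding for the whole process. A less invasive alternative (a sketch, not from the original post) is to open files with an explicit encoding via the standard library's io.open:

import io

# write UTF-8 text without touching the global default encoding
with io.open("items.txt", "a", encoding="utf-8") as f:
    f.write(u"中文内容\n")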

3. Spider example (the original file has been modified and trimmed; for demonstration only)

# coding=utf-8
import scrapy
import sys
import cgi
import time
from wanglu.items import WangluItem, Forecast2Item  # item classes used below

reload(sys)
sys.setdefaultencoding('utf-8')

# run with: scrapy crawl wang
class WangSpider(scrapy.Spider):
    name = "wang"
    base_url = "http://www.sample.com/info?id="  # your own URL
    start_urls = []

    # read the id parameters from seedall.txt, one per line
    # def __init__(self):
    #     ids = open("seedall.txt", "r").readlines()
    #     for did in ids:
    #         self.start_urls.append(self.base_url + did.replace("\n", "").strip())

    # loop over an id range
    def __init__(self):
        for did in range(22312, 23475):
            self.start_urls.append(self.base_url + str(did))

    def parse(self, response):
        try:
            time.sleep(1)
        except BaseException as e:
            e.__str__()
        pid = response.url.split("=")[-1]
        table = response.xpath("/html/body/div[@id='basicInfoDiv']/div/table[@class='jbxx_table'][1]")
        item = WangluItem()
        item["pid"] = pid
        item['company_name'] = table.xpath(".//td/label[@id='lblcname']/text()")[0].extract().strip().replace("\r\n", "")
        if item["company_name"].strip() == '':
            return
        yield item

        forecastTable = response.css("#businessPlanDiv div > table")[1]
        if forecastTable and forecastTable.css("tr"):
            for i, tr in enumerate(forecastTable.css("tr")):
                if i > 1:
                    forecast = Forecast2Item()
                    forecast["pid"] = pid
                    forecast['year'] = cgi.escape(tr.xpath(".//td[1]/text()")[0].extract().strip().replace("\r\n", ""), True)
                    forecast['edu_bg'] = cgi.escape(tr.xpath('.//td[1]')[0].extract().strip().replace("\r\n", ""), True)
                    yield forecast
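Building start_urls in __init__ works, but Scrapy also lets a spider generate its requests lazily by overriding start_requests(). A sketch of the same id loop in that style (base_url and the id range are the same assumptions as in the spider above):

    def start_requests(self):
        # yield one request per project id instead of pre-building start_urls
        for did in range(22312, 23475):
            yield scrapy.Request(self.base_url + str(did), callback=self.parse)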

Link to the selectors documentation
3.1 XPath. Browser developer tools can show an element's XPath; the screenshot below shows copying an XPath in the 360 browser.

[Screenshot: copying an element's XPath from the 360 browser developer tools]

However, an XPath copied this way does not always work, and it is often very long, so you usually need to adapt it to your own page.
tbody is a small trap: the raw HTML returned by the server may not contain a tbody element, but the browser inserts one automatically, which breaks the copied path, so watch out for it.
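For example, a path copied from the browser dev tools might look like the first line below, while the second is what usually works against the raw response (a hypothetical path built from the basicInfoDiv table used in the spider above):

# copied from the browser (tbody inserted by the browser)
/html/body/div[@id='basicInfoDiv']/div/table[1]/tbody/tr[2]/td[1]
# what tends to work against the raw HTML (tbody removed)
/html/body/div[@id='basicInfoDiv']/div/table[1]/tr[2]/td[1]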
3.2 CSS selectors. ">" selects a direct child; I use ":nth-child()" fairly often.

Note: both the xpath() and css() methods return a list ("array") of selectors, not a single value.
cgi.escape(s, True) escapes HTML special characters and quotes, which is very useful when saving data to a database.

Extracting text:

tr.xpath(".//td/text()")    tr.css("td:: text")     

Extracting elements (when the td contains other tags):

tr.xpath(".//td")    tr.css("td")     

Extracting attributes:

supply['file_url'] = tr.xpath(".//td[3]/a/@href")
supply['file_url'] = tr.css("a::attr(href)")
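Since these selectors return a SelectorList rather than plain strings, the actual value has to be pulled out with extract() / extract_first() before storing it. A small sketch combining that with the cgi.escape() call mentioned above (tr and the column positions are assumed from the earlier examples):

import cgi

href = tr.css("a::attr(href)").extract_first()     # first match as a string, or None
texts = tr.xpath(".//td/text()").extract()         # all matches as a list of strings
year = cgi.escape(tr.xpath(".//td[1]/text()").extract_first("").strip(), True)  # escape <, >, & and quotes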

For more detail, see the documentation.

4. Pipelines

Logging

import logging

logger = logging.getLogger('WLog')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
fh = logging.FileHandler("error.log", "a+", "utf-8")
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
# attach the handlers to the logger
# logger.addHandler(ch)
logger.addHandler(fh)

Saving results to a file as JSON

import json

class FilePipeline(object):
    def __init__(self):
        self.file = open('items.jl', 'a+')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
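The resulting .jl (JSON Lines) file holds one item per line, so it can be read back like this:

import json

with open('items.jl') as f:
    for line in f:
        item = json.loads(line)
        print(item.get("pid"))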

Saving to Excel

import time

from openpyxl import Workbook

class ExcelPipeline(object):
    def __init__(self):
        try:
            self.wb = Workbook()
            # grab the active worksheet
            self.ws = self.wb.active
        except BaseException as e:
            logger.warn(e.__str__())

    def process_item(self, item, spider):
        try:
            self.ws.append([item['did'], item["company_name"], item["industry"], item["area"]])
        except BaseException as e:
            logger.warn(item["did"] + ":" + e.__str__())
        return item

    def close_spider(self, spider):
        try:
            self.wb.save(time.strftime("%y%m%d%H%M%S") + ".xlsx")
        except BaseException as e:
            logger.warn(e.__str__())

Saving to a database

import logging
import MySQLdb

from wanglu.items import HistoryItem, ForecastItem

# save items to the database
class MySQLPipeline(object):
    def __init__(self, db):
        self.db = db

    @classmethod
    def from_settings(cls, settings):
        db = MySQLdb.connect(settings["MYSQL_HOST"], settings["MYSQL_USER"],
                             settings["MYSQL_PASSWD"], settings["MYSQL_DBNAME"],
                             charset="utf8")
        return cls(db)

    # called by Scrapy for every item
    def process_item(self, item, spider):
        cursor = self.db.cursor()
        item["pid"] = int(item["pid"])
        if isinstance(item, HistoryItem):
            try:
                cursor.execute('insert into history(pid,name,v2013,v2014,v2015) VALUES (%d, "%s", "%s", "%s", "%s")'
                               % (item['pid'], item["name"], item["v2013"], item["v2014"], item["v2015"]))
                self.db.commit()  # commit so the row is actually written
            except BaseException as e:
                self.db.rollback()
                logger.error(str(item["pid"]) + ":HistoryItem:" + e.__str__())
        elif isinstance(item, ForecastItem):
            cursor.execute("select id from forecast where pid = %d" % (item["pid"]))
            ret = cursor.fetchone()
            try:
                if ret:  # pid already exists, update
                    cursor.execute('update  ..........')
                else:
                    cursor.execute('insert  ..........')
                self.db.commit()  # commit so the row is actually written
            except BaseException as e:
                self.db.rollback()
                logger.error(str(item["pid"]) + ":ForecastItem:" + e.__str__())
        return item

    def close_spider(self, spider):
        self.db.close()

When there are a lot of fields, is there a simpler way to save them to the database? If you know one, please leave a comment, thanks. (Update 2017-08-28: it occurred to me that an ORM might be the answer; a quick search confirms Python has ORM frameworks such as SQLAlchemy and the Django ORM, which I plan to try later.)
Concatenating a table's column names

-- returns the column names as a comma-separated string
SELECT group_concat(COLUMN_NAME) FROM information_schema.COLUMNS WHERE table_name = 'business_plan' AND table_schema = 'test';
-- same, but with a separator that yields "col='%s'," pairs ready to paste into a statement
SELECT group_concat(COLUMN_NAME SEPARATOR "='%s',") FROM information_schema.COLUMNS WHERE table_name = 'business_plan' AND table_schema = 'test';
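Another way to avoid hand-writing every column (a sketch of the idea, not code from the original post; insert_item is a hypothetical helper) is to build the statement from the item's own keys and let the driver fill in the values as a parameterized query, which also removes the need for manual cgi.escape-style quoting:

def insert_item(cursor, table, item):
    # build "insert into <table> (a,b,c) values (%s,%s,%s)" from the item's fields
    data = dict(item)
    columns = ",".join(data.keys())
    placeholders = ",".join(["%s"] * len(data))
    sql = "insert into %s (%s) values (%s)" % (table, columns, placeholders)
    cursor.execute(sql, list(data.values()))  # the driver escapes the values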

5. Settings (only the ones I actually changed)

# -*- coding: utf-8 -*-

BOT_NAME = 'wanglu'

SPIDER_MODULES = ['wanglu.spiders']
NEWSPIDER_MODULE = 'wanglu.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'wanglu.pipelines.SomePipeline': 300,
#}
#ITEM_PIPELINES = {'wanglu.pipelines.WangluPipeline': 300}
#ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300, 'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300}
ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline2': 300}

# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'test'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123123'
# end of MySQL database configure setting
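As an aside, instead of calling time.sleep(1) inside parse() as in the spider above, the same throttling can usually be done with Scrapy's built-in delay settings, for example:

# wait about 1 second between requests (built-in alternative to time.sleep in the spider)
DOWNLOAD_DELAY = 1
# RANDOMIZE_DOWNLOAD_DELAY = True  # optional: add jitter to the delay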