Scrapy Usage Notes
Official documentation link
1. Defining items
import scrapy

class HistoryItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()    # project id
    name = scrapy.Field()   # project name

class ForecastItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()    # project id
    name = scrapy.Field()   # project name
2. File encoding
# coding=utf-8
# put this at the top of the file (Python 2)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
3. Spider example (trimmed and modified from the original file, for demonstration only)
# coding=utf-8
import scrapy
import sys
import time
import cgi
from wanglu.items import WangluItem, Forecast2Item
reload(sys)
sys.setdefaultencoding('utf-8')

# run with: scrapy crawl wang
class WangSpider(scrapy.Spider):
    name = "wang"
    base_url = "http://www.sample.com/info?id="  # your own URL here
    start_urls = []

    # read the id parameters from seedall.txt, one per line
    # def __init__(self):
    #     ids = open("seedall.txt", "r").readlines()
    #     for did in ids:
    #         self.start_urls.append(self.base_url + did.replace("\n", "").strip())

    # or loop over a range of ids
    def __init__(self):
        for did in range(22312, 23475):
            self.start_urls.append(self.base_url + str(did))

    def parse(self, response):
        try:
            time.sleep(1)
        except BaseException as e:
            e.__str__()

        pid = response.url.split("=")[-1]
        table = response.xpath("/html/body/div[@id='basicInfoDiv']/div/table[@class='jbxx_table'][1]")

        item = WangluItem()
        item["pid"] = response.url.split("=")[-1]
        item['company_name'] = table.xpath(".//td/label[@id='lblcname']/text()")[0].extract().strip().replace("\r\n", "")
        if item["company_name"].strip() == '':
            return
        yield item

        forecastTable = response.css("#businessPlanDiv div > table")[1]
        if forecastTable and forecastTable.css("tr"):
            for i, tr in enumerate(forecastTable.css("tr")):
                if i > 1:
                    forecast = Forecast2Item()
                    forecast["pid"] = pid
                    forecast['year'] = cgi.escape(tr.xpath(".//td[1]/text()")[0].extract().strip().replace("\r\n", ""), True)
                    forecast['edu_bg'] = cgi.escape(tr.xpath('.//td[1]')[0].extract().strip().replace("\r\n", ""), True)
                    yield forecast
Selector documentation link
3.1 XPath. You can get an element's XPath from the browser's developer tools (the screenshot in the original post showed copying an XPath in the 360 browser).
However, an XPath copied this way is not always valid, and it can be very long; you usually need to adapt it to your own situation.
tbody is a small pitfall: the raw HTML returned by the server may not contain a tbody, but browsers add one automatically, so a path copied from the browser can fail to match. Watch out for this.
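A minimal sketch of the pitfall, reusing the jbxx_table class from the spider above (the exact page structure is an assumption): the path copied from the browser includes tbody, which the raw response may lack, so dropping that step makes the selector match.

# Copied from browser dev tools -- contains tbody, which the server's raw
# HTML may not have, so this can match nothing:
rows = response.xpath("//table[@class='jbxx_table']/tbody/tr")

# Drop the tbody step (or skip over it with //) to match the raw HTML:
rows = response.xpath("//table[@class='jbxx_table']//tr")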
3.2 CSS selectors: ">" selects a direct child, and I use ":nth-child()" fairly often (a combined example follows the snippets below).
Note: both the xpath() and css() methods return a "list" (a SelectorList), not a single element.
cgi.escape(s, True) escapes HTML special characters and double quotes, which is very useful when saving data to a database.
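A quick illustration of what cgi.escape does when the second argument is True (Python 2's cgi module):

import cgi

print(cgi.escape('1 < 2 & "quoted"', True))
# output: 1 &lt; 2 &amp; &quot;quoted&quot;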
Extracting text:
tr.xpath(".//td/text()") tr.css("td:: text")
Extracting elements (when the td contains other tags):
tr.xpath(".//td") tr.css("td")
Extracting attributes:
supply['file_url'] = tr.xpath(".//td[3]/a/@href")
supply['file_url'] = tr.css("a::attr(href)")
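A small combined sketch of the points above, against the forecast table from the spider (the row/column layout here is assumed for illustration): ">" limits matching to direct children, ":nth-child()" picks a row by position, and since css()/xpath() return lists you index before calling extract().

# second table that is a direct child of a div inside #businessPlanDiv
forecastTable = response.css("#businessPlanDiv div > table")[1]

# third row of that table (:nth-child() counts from 1)
third_row = forecastTable.css("tr:nth-child(3)")

# first cell's text; css()/xpath() return a SelectorList, so index first
if third_row:
    year = third_row.css("td::text")[0].extract().strip()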
See the documentation for more details.
4. Pipelines
Logging
import logging

logger = logging.getLogger('WLog')
logger.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

fh = logging.FileHandler("error.log", "a+", "utf-8")
fh.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)

# attach handlers to the logger
# self.logger.addHandler(ch)
logger.addHandler(fh)
Saving results to a file as JSON
import json

class FilePipeline(object):
    def __init__(self):
        self.file = open('items.jl', 'a+')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
Saving to Excel
import time
from openpyxl import Workbook

class ExcelPipeline(object):
    def __init__(self):
        try:
            self.wb = Workbook()
            # grab the active worksheet
            self.ws = self.wb.active
        except BaseException as e:
            logger.warn(e.__str__())

    def process_item(self, item, spider):
        try:
            self.ws.append([item['did'], item["company_name"], item["industry"], item["area"]])
        except BaseException as e:
            logger.warn(item["did"] + ":" + e.__str__())
        return item

    def close_spider(self, spider):
        try:
            self.wb.save(time.strftime("%y%m%d%H%M%S") + ".xlsx")
        except BaseException as e:
            logger.warn(e.__str__())
Saving to a database
import logging
import MySQLdb
from wanglu.items import HistoryItem, ForecastItem

# save to the database
class MySQLPipeline(object):
    def __init__(self, db):
        self.db = db

    @classmethod
    def from_settings(cls, settings):
        db = MySQLdb.connect(settings["MYSQL_HOST"], settings["MYSQL_USER"],
                             settings["MYSQL_PASSWD"], settings["MYSQL_DBNAME"],
                             charset="utf8")
        return cls(db)

    # called by the pipeline for every item
    def process_item(self, item, spider):
        cursor = self.db.cursor()
        item["pid"] = int(item["pid"])
        if isinstance(item, HistoryItem):
            try:
                cursor.execute('insert into history(pid,name,v2013,v2014,v2015) VALUES (%d, "%s", "%s", "%s", "%s")'
                               % (item['pid'], item["name"], item["v2013"], item["v2014"], item["v2015"]))
            except BaseException as e:
                self.db.rollback()
                logger.error(str(item["pid"]) + ":HistoryItem:" + e.__str__())
        elif isinstance(item, ForecastItem):
            cursor.execute("select id from forecast where pid = %d" % (item["pid"]))
            ret = cursor.fetchone()
            try:
                if ret:
                    # pid already exists, update it
                    cursor.execute('update ..........')
                else:
                    cursor.execute('insert ..........')
            except BaseException as e:
                self.db.rollback()
                logger.error(str(item["pid"]) + ":ForecastItem:" + e.__str__())
        return item

    def close_spider(self, spider):
        self.db.close()
When an item has many fields, is there a simpler way to save it to the database? If you know one, please leave a comment, thanks. (Update 2017-08-28: it occurred to me to use an ORM; a quick search confirms Python also has ORM frameworks such as SQLAlchemy and the Django ORM, which I can try later.)
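As a rough sketch of that ORM idea (the model, column types, and pipeline name below are assumptions, not the original schema), SQLAlchemy lets you declare the table once and then assign item fields by name instead of hand-writing SQL strings:

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

# hypothetical mapping for the forecast table
class Forecast(Base):
    __tablename__ = 'forecast'
    id = Column(Integer, primary_key=True)
    pid = Column(Integer)
    year = Column(String(32))
    edu_bg = Column(String(255))

engine = create_engine('mysql://root:123123@localhost/test?charset=utf8')
Session = sessionmaker(bind=engine)

class ORMPipeline(object):
    def __init__(self):
        self.session = Session()

    def process_item(self, item, spider):
        # dict(item) maps the Scrapy item fields straight onto the ORM columns,
        # so adding a field means touching the model, not a pile of SQL strings
        self.session.merge(Forecast(**dict(item)))
        self.session.commit()
        return item

    def close_spider(self, spider):
        self.session.close()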
Concatenating a table's column names
SELECT group_concat(COLUMN_NAME) from information_schema.COLUMNS
  where table_name = 'business_plan' and table_schema = 'test';
-- returns the column names as one comma-separated string

SELECT group_concat(COLUMN_NAME SEPARATOR "='%s',") from information_schema.COLUMNS
  where table_name = 'business_plan' and table_schema = 'test';
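A minimal sketch of where that generated string goes (the column names and the stand-in item below are made up for illustration): either paste the group_concat output into the statement, or build the column list in Python and let a parameterized execute handle the quoting:

import MySQLdb

# hypothetical column names; in practice take them from the group_concat
# query above, or from the item's own keys
columns = ["pid", "year", "edu_bg"]
item = {"pid": 22312, "year": "2015", "edu_bg": "bachelor"}  # stand-in item

db = MySQLdb.connect("localhost", "root", "123123", "test", charset="utf8")
sql = "insert into business_plan (%s) values (%s)" % (
    ", ".join(columns),
    ", ".join(["%s"] * len(columns)),
)
# parameterized execute lets MySQLdb do the quoting, no manual escaping needed
db.cursor().execute(sql, [item[c] for c in columns])
db.commit()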
5. Settings (only the ones I actually changed)
# -*- coding: utf-8 -*-

BOT_NAME = 'wanglu'

SPIDER_MODULES = ['wanglu.spiders']
NEWSPIDER_MODULE = 'wanglu.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'wanglu.pipelines.SomePipeline': 300,
#}
#ITEM_PIPELINES = {'wanglu.pipelines.WangluPipeline': 300}
#ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300, 'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300}
ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline2': 300}

# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'test'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123123'
# end of MySQL database configure setting