Crawler-04: Writing Scrapy Code
4-8~9 Writing a spider to crawl all articles on jobbole
# -*- coding: utf-8 -*-
import re
import datetime

import scrapy
from scrapy.http import Request
from urllib import parse  # on Python 2 this would be: import urlparse

from g0xukr.ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from g0xukr.ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["python.jobbole.com"]
    start_urls = ['http://python.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs from the list page and hand them to scrapy to download and parse.
        2. Extract the next-page URL and hand it to scrapy; once downloaded it is passed back to parse.
        """
        # Parse all article URLs on the list page and hand them to scrapy for download and parsing
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            # parse.urljoin(response.url, post_url) completes a relative URL with the domain
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Extract the next page and hand it to scrapy for download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        article_item = JobBoleArticleItem()

        # Extract the fields with CSS selectors
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        title = response.css(".entry-header h1::text").extract()[0]
        create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
        praise_nums = response.css(".vote-post-up h10::text").extract()[0]

        fav_nums = response.css(".bookmark-btn::text").extract()[0]
        match_re = re.match(".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0

        comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        match_re = re.match(".*?(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0

        content = response.css("div.entry").extract()[0]

        tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)

        article_item["url_object_id"] = get_md5(response.url)
        article_item["title"] = title
        article_item["url"] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()
        article_item["create_date"] = create_date
        article_item["front_image_url"] = [front_image_url]
        article_item["praise_nums"] = praise_nums
        article_item["comment_nums"] = comment_nums
        article_item["fav_nums"] = fav_nums
        article_item["tags"] = tags
        article_item["content"] = content

        yield article_item
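The spider can be started with scrapy crawl jobbole from the project root. For debugging inside an IDE, a small entry script is a common convenience; this is only a sketch (not from the original post), and the file name main.py is an assumption:

# main.py, placed in the project root (sketch; not part of the original post)
import os
import sys
from scrapy.cmdline import execute

# make sure the project package can be imported no matter where the script is started from
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])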
4-10~12 Designing the items
Some scattered notes:
1. Passing values through meta so they reach the item
# Example: yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url}, callback=self.parse_detail)
2. Using extract_first('')
extract_first('') is preferable to extract()[0]: the latter is risky because it raises an error when the selector matches nothing, while the former simply returns the default you pass in (here the empty string '').
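A minimal illustration of the difference; the HTML snippet here is made up for the example:

from scrapy.selector import Selector

sel = Selector(text="<div class='post'></div>")   # no <h1> in this snippet

# extract()[0] would raise IndexError because the result list is empty:
# title = sel.css("h1::text").extract()[0]

# extract_first("") returns the supplied default instead of raising
title = sel.css("h1::text").extract_first("")
print(repr(title))   # ''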
3. Usage of response.meta.get()
response.meta.get('front_image_url', '') works like dict.get(): the first argument is the key you set when building the Request, the second is a default, so no exception is raised when the key is missing.
4. Scrapy's built-in image-download pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
Configuration:
import os

IMAGES_URLS_FIELD = 'front_image_url'   # must name an item field that holds a list of image URLs
project_dir = os.path.abspath(os.path.dirname(__file__))   # resolved relative to this file, so it works on other machines too
IMAGES_STORE = os.path.join(project_dir, 'image_folder_name')   # folder for downloaded images, alongside settings.py
# To implement your own requirements you can also override the relevant methods:
# create a class in pipelines.py that inherits from ImagesPipeline.
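Along those lines, here is a minimal sketch of such a subclass that writes the local file path of the downloaded cover image back into the item. The class name ArticleImagePipeline is an assumption; the front_image_url and front_image_path fields do appear on the item defined later in this post:

from scrapy.pipelines.images import ImagesPipeline


class ArticleImagePipeline(ImagesPipeline):   # hypothetical class name
    # record the local path of the downloaded cover image on the item
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:   # results is a list of (success, info_dict) tuples
                if ok:
                    item["front_image_path"] = value["path"]
        return item

To use it, register this class in ITEM_PIPELINES instead of the stock scrapy.pipelines.images.ImagesPipeline.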
5. MD5 hashing: a digest algorithm with fixed-length output
Python 3 template:
import hashlib


def get_md5(url):
    # the url comes in as str; in Python 3 str is Unicode by default,
    # and hashlib only accepts bytes, so encode it as UTF-8 first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()
Python 2 template:
# -*- coding:utf-8 -*-
import hashlib


def get_md5(url='123'):
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()
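A quick usage check (Python 3 version): whatever the URL length, the digest is always a 32-character hex string, which is what the spider stores as url_object_id.

print(get_md5("http://python.jobbole.com/all-posts/"))        # 32-character hex digest
print(len(get_md5("http://python.jobbole.com/all-posts/")))   # 32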
4-13 Designing the data table and saving items to a JSON file
Template:
import codecs
import json
from scrapy.exporters import JsonItemExporter


class JsonWithEncodingPipeline(object):
    # Custom export of items to a JSON file
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # called when the spider closes
        self.file.close()


class JsonExporterPipleline(object):
    # Use scrapy's built-in JSON exporter to write the JSON file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
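These pipelines only run once they are registered in settings.py, in the same way the image pipeline was registered above. A minimal sketch, assuming the project package is named ArticleSpider (lower numbers run first):

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.JsonExporterPipleline': 2,   # module path is an assumption
}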
4-14~15 Saving data to MySQL through pipelines
Template:
pip install mysqlclient (mysqlclient is one of the MySQL drivers)
import pymysql
import pymysql.cursors
from twisted.enterprise import adbapi


class MysqlPipeline(object):
    # Write into MySQL synchronously
    def __init__(self):
        self.conn = pymysql.connect('192.168.0.106', 'root', 'root', 'article_spider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"],
                                         item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item


class MysqlTwistedPipline(object):
    # Insert into MySQL asynchronously
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Read the connection parameters from settings
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # the module name must match the driver in use; here pymysql is imported above
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to turn the MySQL insert into an asynchronous operation
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Execute the actual insert
        # Build a different SQL statement for each item type and insert it into MySQL
        insert_sql, params = item.get_insert_sql()
        print(insert_sql, params)
        cursor.execute(insert_sql, params)
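from_settings above reads its connection parameters from settings.py. A sketch of the matching settings, using the same placeholder values as the synchronous pipeline and assuming the project package is named ArticleSpider:

# settings.py -- adjust the values to your own database
MYSQL_HOST = "192.168.0.106"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipline': 3,   # module path is an assumption
}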
4-16~17 The Scrapy Item Loader mechanism
Template:
Using Scrapy's Item Loader mechanism makes the code easier to maintain later on.
In the items.py file:
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from w3lib.html import remove_tags

from utils.common import extract_num
from settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT


def add_jobbole(value):
    return value + "-bobby"


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    match_re = re.match(".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def return_value(value):
    return value


def remove_comment_tags(value):
    # drop the "评论" (comment-count) entry that gets extracted together with the tags
    if "评论" in value:
        return ""
    else:
        return value


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums,
                front_image_url, front_image_path, praise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), fav_nums=VALUES(fav_nums)
        """
        front_image_url = ""
        # content = remove_tags(self["content"])
        if self["front_image_url"]:
            front_image_url = self["front_image_url"][0]
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"],
                  front_image_url, self["front_image_path"], self["praise_nums"],
                  self["comment_nums"], self["tags"], self["content"])
        return insert_sql, params
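The processors can also be tried standalone, which makes it clearer what happens when load_item() is eventually called. A small sketch, assuming the helper functions above are in scope; the sample values are made up:

from scrapy.loader.processors import MapCompose, TakeFirst, Join

# MapCompose applies each function, in order, to every value extracted for a field
print(MapCompose(str.strip, get_nums)([" 8 收藏 "]))      # [8]

# TakeFirst (the default output processor above) returns the first non-empty value
print(TakeFirst()(["", "2017/04/18"]))                     # '2017/04/18'

# Join concatenates the remaining values with the given separator
print(Join(",")(["职场", "面试"]))                          # '职场,面试'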
Part of the code in spider.py:
def parse_detail(self, response):
    article_item = JobBoleArticleItem()
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
Author: 今孝
Source: http://www.cnblogs.com/jinxiao-pu/p/6721848.html