2. Initializing the Lagou project and reading the CrawlSpider source code
scrapy genspider --list
List the available initialization templates. Output:
Available templates:
- basic
- crawl
- csvfeed
- xmlfeed
scrapy genspider -t crawl lagou www.lagou.com
Running from cmd differs from PyCharm: in PyCharm, mark the project directory as the sources root.
Set the project directory in settings.py.
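A debug entry point is a common way to bridge the cmd/PyCharm gap; a minimal sketch, assuming a main.py at the project root (the file name and its location are assumptions, not part of the generated project):

```python
# A sketch of a debug entry point so the spider can be started from PyCharm
# instead of the cmd shell. Assumes this file lives at the project root.
import os
import sys

from scrapy.cmdline import execute

sys.path.append(os.path.dirname(os.path.abspath(__file__)))  # put the project on the import path
execute(['scrapy', 'crawl', 'lagou'])
```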
The crawl template:
```python
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['http://www.lagou.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = {}
        return i
```
Reading and dissecting the source code
https://doc.scrapy.org/en/1.3/topics/spiders.html#crawlspider
CrawlSpider provides rules that let us follow links and crawl iteratively with very little code.
rules:
The rules that the CrawlSpider reads and executes while following links.
parse_start_url(response): a hook for processing the start_urls responses (covered below).
Example:
rules is an iterable of Rule instances; each Rule wraps a LinkExtractor, for example:
allow=('category\.php', ), callback='parse_item',
allow is the URL pattern to crawl; callback is the name of the function to call back.
The callback is given by name (a string) because rules is defined at class level, where there is no self, so the method object cannot be referenced directly.
```python
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MySpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
        Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = scrapy.Item()
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item
```
Analyzing the Lagou template code
1. Add the s to http in start_urls.
2. Rename parse_item to our own parse_job (see the sketch after this list).
3. Ctrl-click CrawlSpider in class LagouSpider(CrawlSpider): to open the crawl spider source.
4. class CrawlSpider(Spider): shows that it inherits from Spider.
5. The entry point is def start_requests(self):
6. Alt + left/right arrow jumps between code locations.
7. After step 5 the default callback is parse. CrawlSpider defines its own parse, so this time we must not override it the way we did before.
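After steps 1 and 2 the generated template looks roughly like this (a sketch; the real rules are written later in this note):

```python
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']   # step 1: http -> https

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_job', follow=True),
    )

    def parse_job(self, response):             # step 2: renamed from parse_item
        i = {}
        return i
```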
The core function in crawl.py is parse.
parse delegates to _parse_response:
```python
def parse(self, response):
    return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)
```
_parse_response
- It checks whether a callback was passed in, i.e. whether self.parse_start_url exists.
- We can override parse_start_url to add our own handling.
- It passes the arguments on to that callback and runs the result through process_results.
The _parse_response function:
```python
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
```
The return value of parse_start_url is received and handled by process_results.
If neither is overridden, the return value is empty, so effectively nothing happens:
```python
def process_results(self, response, results):
    return results
```
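A minimal sketch of overriding parse_start_url in our own spider (the logging here is purely illustrative):

```python
class LagouSpider(CrawlSpider):
    # ...
    def parse_start_url(self, response):
        # called with the responses for start_urls, before the rules take over;
        # whatever is returned here flows through process_results
        self.logger.info('start url downloaded: %s', response.url)
        return []   # an empty iterable keeps the default "do nothing" behaviour
```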
Click through on _follow_links to see where it is set:
```python
def set_crawler(self, crawler):
    super(CrawlSpider, self).set_crawler(crawler)
    self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
```
If this setting is enabled (it defaults to True), parse goes on to follow links.
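For example, to turn rule-based link following off for all CrawlSpiders, the setting can be flipped in settings.py:

```python
# settings.py -- CrawlSpider link following is on by default
CRAWLSPIDER_FOLLOW_LINKS = False
```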
_requests_to_follow
1. Check that what was passed in is an HtmlResponse; if not, return immediately.
2. Create an empty set per response to deduplicate links.
3. Turn self._rules into an enumerated iterable.
4. Jump into the Rule definition for the details.
5. Extract the concrete links with rule.link_extractor.extract_links.
6. Run our process_links hook, if one was defined.
7. Once a link is ready, issue a Request with _response_downloaded as its callback.
8. That callback then runs _parse_response.
```python
def _requests_to_follow(self, response):
    if not isinstance(response, HtmlResponse):
        return
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = Request(url=link.url, callback=self._response_downloaded)
            r.meta.update(rule=n, link_text=link.text)
            yield rule.process_request(r)
```
_compile_rules
- _compile_rules is called when the spider is initialized.
- It copies our rules ([copy.copy(r) for r in self.rules]) and resolves each callback via get_method.
- It resolves the process_links we defined in the rules.
- It resolves the process_request we defined in the rules.
```python
def _compile_rules(self):
    def get_method(method):
        if callable(method):
            return method
        elif isinstance(method, six.string_types):
            return getattr(self, method, None)

    self._rules = [copy.copy(r) for r in self.rules]
    for rule in self._rules:
        rule.callback = get_method(rule.callback)
        rule.process_links = get_method(rule.process_links)
        rule.process_request = get_method(rule.process_request)
```
```python
self.process_links = process_links
self.process_request = process_request
```
By passing our own processing functions into the rules we can customize the extracted URLs, for example rewriting them so requests are spread over different hosts/IPs for load balancing.
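A sketch of such a process_links hook (the mirror host list is hypothetical); it is referenced by name in the Rule, and _compile_rules resolves the string to the spider method:

```python
import random

class LagouSpider(CrawlSpider):
    # hypothetical mirror hosts -- substitute whatever alternative entry points exist
    mirror_hosts = ['www.lagou.com', 'm.lagou.com']

    rules = (
        Rule(LinkExtractor(allow=r'jobs/\d+\.html'),
             callback='parse_job', follow=True,
             process_links='rotate_host'),
    )

    def rotate_host(self, links):
        # rewrite each extracted link so requests are spread across hosts
        for link in links:
            link.url = link.url.replace('www.lagou.com',
                                        random.choice(self.mirror_hosts))
        return links
```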
_response_downloaded
- Looks up the concrete Rule via response.meta['rule'].
- Calls our own callback through _parse_response.
```python
def _response_downloaded(self, response):
    rule = self._rules[response.meta['rule']]
    return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
```
- allow: crawl URLs that match these patterns
- deny: give up on URLs that match these patterns
- allow_domains: only handle URLs under these domains
- deny_domains: do not handle URLs under these domains
- restrict_xpaths: further restrict extraction to regions matched by these XPaths
```python
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'), attrs=('href',),
             canonicalize=True, unique=True, process_value=None,
             deny_extensions=None, restrict_css=()):
```
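A short usage sketch combining a few of these parameters (the deny pattern and the XPath container id are hypothetical):

```python
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

rule = Rule(
    LinkExtractor(
        allow=r'jobs/\d+\.html',                 # crawl job detail pages
        deny=r'jobs/list_',                      # but skip listing pages (hypothetical pattern)
        allow_domains=('www.lagou.com',),        # stay on this domain
        restrict_xpaths=('//div[@id="s_position_list"]',),  # only extract links from this block
    ),
    callback='parse_job',
    follow=True,
)
```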
extract_links
If restrict_xpaths is set, only the matching document fragments are read and searched for links:
```python
def extract_links(self, response):
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        docs = [subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    return unique_list(all_links)
```
get_base_url:
urllib.parse.urljoin stitches the URLs together for us.
```python
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.
    """
    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return moves.urllib.parse.urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)
```
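For reference, the joining behaviour it relies on (plain standard-library urljoin):

```python
from urllib import parse

print(parse.urljoin('https://www.lagou.com/zhaopin/', 'jobs/123.html'))
# https://www.lagou.com/zhaopin/jobs/123.html
print(parse.urljoin('https://www.lagou.com/zhaopin/', '/jobs/123.html'))
# https://www.lagou.com/jobs/123.html
```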
Writing our Rule set
```python
rules = (
    Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True),
    Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True),
    Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
)
```
- Our spider yields a Request, which is sent to the engine.
- The engine does nothing with it and hands it to the scheduler.
- The scheduler later returns a request to the engine.
- The engine passes it through the downloader middlewares to the downloader.
- The downloader sends the response back to the engine.
- The engine hands the response to the spider.
- The spider processes it, parsing out items and requests.
- Items go to the item pipeline; requests go back to step 2.
path: articlespider3\Lib\site-packages\scrapy\core
- engine.py
- scheduler.py
- downloader
- item pipeline
- spider
engine.py: the key function is schedule.
- enqueue_request: puts the request into the scheduler.
- _next_request_from_scheduler: takes the next request from the scheduler.
```python
def schedule(self, request, spider):
    self.signals.send_catch_log(signal=signals.request_scheduled,
                                request=request, spider=spider)
    if not self.slot.scheduler.enqueue_request(request):
        self.signals.send_catch_log(signal=signals.request_dropped,
                                    request=request, spider=spider)
```
articlespider3\Lib\site-packages\scrapy\core\downloader\handlers
The download handlers support file, FTP and HTTP(S) downloads.
Middlewares we will customize later (a bare skeleton follows below):
- spider middleware
- downloader middleware
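A bare-bones downloader middleware skeleton (the class name is made up; it would be registered under DOWNLOADER_MIDDLEWARES in settings.py to take effect):

```python
class RandomUserAgentMiddleware(object):
    """Hypothetical downloader middleware: tweak every request before download."""

    def process_request(self, request, spider):
        # e.g. set a header; returning None lets processing continue normally
        request.headers.setdefault('User-Agent', 'Mozilla/5.0')
        return None
```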
Django and Scrapy have similar overall structures.
3. Two important Scrapy classes: Request and Response
Request is similar to Django's HttpRequest.
yield Request(url=parse.urljoin(response.url, post_url))
Request parameters:
```python
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):
```
cookies:
Lib\site-packages\scrapy\downloadermiddlewares\cookies.py
cookiejarkey = request.meta.get("cookiejar")
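The cookiejar meta key is the documented way to keep several cookie sessions apart; a small sketch (the URLs and the parse_page callback are placeholders):

```python
def start_requests(self):
    urls = ['https://www.lagou.com/'] * 3          # placeholder URLs
    for i, url in enumerate(urls):
        # every distinct 'cookiejar' value gets its own cookie session
        yield scrapy.Request(url, meta={'cookiejar': i}, callback=self.parse_page)
```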
- priority: scheduling priority; requests with a higher value are processed earlier.
- dont_filter: identical requests will not be dropped by the duplicate filter.
- errback: the callback invoked when an error occurs.
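A tiny sketch of priority and dont_filter in use, echoing the urljoin snippet above (the callback name is illustrative; errback is covered by the docs example below):

```python
yield scrapy.Request(
    url=parse.urljoin(response.url, post_url),
    callback=self.parse_job,     # illustrative callback
    priority=10,                 # higher value -> scheduled earlier
    dont_filter=True,            # bypass the duplicate filter for this request
)
```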
https://doc.scrapy.org/en/1.2/topics/request-response.html?highlight=response
errback example:
```python
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class ErrbackSpider(scrapy.Spider):
    name = "errback_example"
    start_urls = [
        "http://www.httpbin.org/",
        "http://www.httpbin.org/status/404",
        "http://www.httpbin.org/status/500",
        "http://www.httpbin.org:12345/",
        "http://www.httphttpbinbin.org/",
    ]

    def start_requests(self):
        for u in self.start_urls:
            yield scrapy.Request(u, callback=self.parse_httpbin,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)

    def parse_httpbin(self, response):
        self.logger.info('Got successful response from {}'.format(response.url))

    def errback_httpbin(self, failure):
        self.logger.error(repr(failure))

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

        elif failure.check(DNSLookupError):
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
```
The Response class
```python
def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
    self.headers = Headers(headers or {})
```
Response parameters:
- request: the Request that was yielded is attached to the resulting Response, so we can tell where the response came from.
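A small sketch of using that attribute inside a callback (the meta key and returned fields are hypothetical):

```python
def parse_job(self, response):
    # the originating Request travels with the response
    self.logger.info('response %s came from request %s',
                     response.url, response.request.url)
    # response.meta is a shortcut for response.request.meta
    referer_url = response.meta.get('referer_url')   # hypothetical meta key
    return {'url': response.url, 'referer': referer_url}
```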