Scrapy爬虫实战(四)——登录51job并使用cookies进行爬取

来源:互联网 发布:php 模拟发送post请求 编辑:程序博客网 时间:2024/05/29 05:53

本文章代码仅供学习使用,如有侵权请联系作者删除,多谢。


本文通过一个Scrapy爬虫示例,讲解如何登录网站,并使用登录后的cookies继续爬取。

登录所用的用户名和密码在代码中均以xxxx代替。

# -*- coding: utf-8 -*-
"""CrawlSpider example: log in to ehire.51job.com and keep crawling with the
logged-in session cookies.

All credentials below are the placeholder string 'xxxx'; replace them with
real values before running.
"""
import os

import scrapy
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Identifier of the cookie jar shared by every request of this spider, so the
# login page, the login POST and all followed links use one session.
COOKIEJAR_ID = 1


def add_cookie(r, response=None):
    """Attach the shared cookie jar to a request produced by a crawl rule.

    Used as ``process_request`` on every ``Rule`` so that links extracted
    after login are fetched with the logged-in session.

    :param r: the request built by the rule's link extractor.
    :param response: the response the request was extracted from. Unused;
        accepted because newer Scrapy versions call ``process_request``
        with ``(request, response)``, while older ones pass only the request.
    :return: a copy of ``r`` whose meta carries the ``cookiejar`` key.
    """
    r.meta.update(cookiejar=COOKIEJAR_ID)
    return r.replace(meta=r.meta)


class ExampleSpider(CrawlSpider):
    """Log in through the 51job employer login form, then follow links."""

    name = "example1"

    rules = (
        # Resume pages: handed to parse_one_candidate and followed further.
        Rule(
            LinkExtractor(allow='ResumeViewFolder'),
            process_request=add_cookie,
            callback='parse_one_candidate',
            follow=True,
        ),
        # Any other link on the site: followed only, no callback.
        Rule(
            LinkExtractor(allow='ehire.51job.com'),
            process_request=add_cookie,
            follow=True,
        ),
    )

    def start_requests(self):
        """Fetch the login page first; its hidden inputs feed the login POST."""
        yield Request(
            'http://ehire.51job.com/MainLogin.aspx',
            # Same jar as the login POST, so cookies set by the login page
            # are sent along when the form is submitted.
            meta={'cookiejar': COOKIEJAR_ID},
            callback=self.parse_login_page,
        )

    def parse_login_page(self, response):
        """Build and return the login form submission.

        Reads the ``value`` attribute of each hidden input (looked up by
        element id) on the login page, adds the credential fields, and posts
        everything to the login endpoint.

        :param response: response for the login page.
        :return: a one-element list holding the login ``FormRequest``.
        """
        hidden_field_ids = [
            'hidLangType', 'hidAccessKey', 'hidEhireGuid',
            'hidRetUrl', 'fksc', '__VIEWSTATE',
        ]

        form_data = {}
        for field_id in hidden_field_ids:
            css_value = "#" + field_id + "::attr(value)"
            value = response.css(css_value).extract_first()
            if value is None:
                # Best effort: a missing hidden input is submitted as an
                # empty string instead of aborting the login attempt.
                self.logger.warning("cookies value err %s", css_value)
                value = ''
            form_data[field_id] = value

        # Placeholder credentials -- replace before running.
        form_data['txtMemberNameCN'] = "xxxx"
        form_data['txtUserNameCN'] = 'xxxx'
        form_data['txtPasswordCN'] = 'xxxx'
        form_data['ctmName'] = "xxxx"
        form_data['userName'] = 'xxxx'
        form_data['password'] = 'xxxx'
        form_data['checkCode'] = ''

        # These fields repeat values already read from the hidden inputs.
        form_data['oldAccessKey'] = form_data['hidAccessKey']
        form_data['langtype'] = form_data['hidLangType']
        form_data['isRememberMe'] = 'false'
        form_data['sc'] = form_data['fksc']
        form_data['ec'] = form_data['hidEhireGuid']
        form_data['returl'] = ''
        form_data['referrurl'] = ''

        return [
            scrapy.FormRequest(
                "https://ehirelogin.51job.com/Member/UserLogin.aspx?",
                formdata=form_data,
                meta={'cookiejar': COOKIEJAR_ID},
                callback=self.login_in,
            )
        ]

    def login_in(self, response):
        """Handle the login response: save it, then start rule-based crawling.

        :param response: response to the login POST.
        :return: generator of the requests the crawl rules extract from it.
        """
        self.recored2file(response)
        # NOTE: _requests_to_follow is a private CrawlSpider method; it applies
        # ``rules`` to this response so crawling continues from the page
        # reached after login.
        for request in self._requests_to_follow(response):
            yield request

    def recored2file(self, response):
        """Write the raw response body to ./login.html for manual inspection."""
        with open('./login.html', 'wb') as f:
            f.write(response.body)

    def parse_one_candidate(self, response):
        """Callback for resume pages; intentionally left unimplemented."""
        pass

1 0
原创粉丝点击