文章标题

来源:互联网 发布:怎样挑选淘宝剪标衣服 编辑:程序博客网 时间:2024/06/04 01:31
import time

import scrapy

# The captcha prompt was written against Python 2's raw_input(); fall back to
# input() so the same prompt also works on Python 3.
try:
    _read_line = raw_input
except NameError:
    _read_line = input


class ZhihuSpider(scrapy.Spider):
    """Log in to zhihu.com with an e-mail account, then save the settings page.

    Request flow:
      1. ``parse``        - GET the sign-in page and read the ``_xsrf`` token.
      2. ``zhihu_login``  - receives the captcha image, asks the operator to
                            type it in, and POSTs the login form.
      3. ``after_login``  - requests the account settings page.
      4. ``parse_item``   - writes that page to ``my_zhihu.html``.

    ``email`` and ``password`` are class attributes so they can be overridden
    per run with spider arguments (``-a email=... -a password=...``).
    """

    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/#signin']

    # NOTE(review): credentials should not live in source control. The
    # defaults are kept only so existing invocations keep working; prefer
    # passing them as spider arguments.
    email = "123636274@qq.com"
    password = "ALARMCHIME"

    def parse(self, response):
        """Read the ``_xsrf`` token from the sign-in page and fetch a captcha.

        Yields a request for the captcha image, carrying the token in
        ``meta`` so ``zhihu_login`` can use it. Yields nothing (after logging
        an error) when the page has no ``_xsrf`` input.
        """
        # 1. The initial GET supplies the hidden login parameter.
        #    XPath equivalent: //input[@name='_xsrf']/@value
        tokens = response.css("input[name=_xsrf]::attr(value)").extract()
        if not tokens:
            # Without the token the login POST cannot be built, so stop
            # with a clear message instead of an IndexError.
            self.logger.error("No _xsrf input found on %s", response.url)
            return
        _xsrf = tokens[0]

        # 2. Request the captcha image; the millisecond timestamp is sent as
        #    the ``r`` query parameter.
        captcha_url = (
            "https://www.zhihu.com/captcha.gif?r="
            + str(int(time.time() * 1000))
            + "&type=login"
        )
        yield scrapy.Request(
            captcha_url,
            meta={"_xsrf": _xsrf},
            callback=self.zhihu_login,
        )

    def parse_captcha(self, response):
        """Save the captcha image and return the text typed by the operator.

        The image bytes are written to ``captcha.png`` in the working
        directory. The prompt blocks until a line is entered on stdin.
        """
        with open("captcha.png", "wb") as f:
            f.write(response.body)
        captcha = _read_line("请输入验证码:")
        # Return the captcha string as typed.
        return captcha

    def zhihu_login(self, response):
        """Build and submit the login form.

        ``response`` here is the response for the captcha image request; the
        ``_xsrf`` token comes from ``response.meta``.
        """
        _xsrf = response.meta["_xsrf"]
        data = {
            "_xsrf": _xsrf,
            "email": self.email,
            "password": self.password,
            "remember_me": "True",
            "captcha": self.parse_captcha(response),
        }
        # 3. All required fields are collected; send the login POST.
        yield scrapy.FormRequest(
            "https://www.zhihu.com/login/email",
            formdata=data,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Request the account settings page after the login POST returns.

        The login response itself is not inspected here, so a failed login
        is not detected at this step.
        """
        # 4. Follow-up requests in this crawl reuse the cookies set by the
        #    login response.
        yield scrapy.Request(
            "https://www.zhihu.com/settings/account",
            callback=self.parse_item,
        )

    def parse_item(self, response):
        """Write the raw body of the settings page to ``my_zhihu.html``."""
        # response.body is bytes, so the file must be opened in binary mode.
        with open("my_zhihu.html", "wb") as f:
            f.write(response.body)
原创粉丝点击