Scrapy: logging in to Zhihu with a captcha and crawling it


Asynchronous MySQL storage plus an incremental crawler, with a simple captcha login.

spider

# -*- coding: utf-8 -*-
import json
import re
import time

import scrapy
from zheye import zheye
from PIL import Image
from urllib import parse
from scrapy.loader import ItemLoader
from article_spider.items import ZhihuQuestionItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    # start_answer_url = 'https://www.zhihu.com/api/v4/questions/{}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={}&offset={}'
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def parse(self, response):
        # collect the href of every <a> tag on the page
        all_urls = response.css('a::attr(href)').extract()
        # Zhihu links are relative paths, so join them against the current URL with urllib
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # keep only the URLs starting with https://www.zhihu.com/question/; everything else is filtered out
        all_urls = filter(lambda x: True if x.startswith('https://www.zhihu.com/question/') else False, all_urls)
        for url in all_urls:
            # split each matching URL into two groups
            match_obj = re.match('(.*question/(\d+))(/|$)', url)
            if match_obj:
                request_url = match_obj.group(1)    # request URL
                question_id = match_obj.group(2)    # question id
                yield scrapy.Request(request_url, headers=self.headers,
                                     meta={'question_id': question_id}, callback=self.parse_question)
            else:
                # not a question URL: hand it back to the downloader and keep crawling
                yield scrapy.Request(url, headers=self.headers)

    def parse_question(self, response):
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        question_id = response.meta.get('question_id', '')
        # debugging leftovers:
        # a = response.text
        # a.encode('utf-8')
        item_loader.add_xpath('title', '//h1[@class="QuestionHeader-title"]/text()')
        item_loader.add_xpath('content', '//span[@class="RichText"]/text()')
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_value('url', response.url)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
        item_loader.add_xpath('topics', '//div[@class="Popover"]/div/text()')
        article_item = item_loader.load_item()
        # the answer-parsing callback below still has a bug; disabled until there is time to fix it
        # yield scrapy.Request(self.start_answer_url.format(question_id, 20, 40), headers=self.headers, callback=self.parse_answer)
        yield article_item

    # def parse_answer(self, response):
    #     ans_json = json.loads(response.text)
    #     is_end = ans_json['paging']['is_end']
    #     next_url = ans_json['paging']['next']
    #     for answer in ans_json['data']:
    #         answer_item = ZhihuAnswerItem()
    #         answer_item['zhihu_id'] = answer['id']
    #         answer_item['url'] = answer['url']
    #         answer_item['question_id'] = answer['question']['id']
    #         answer_item['author_id'] = answer['author']['id'] if 'id' in answer['author'] else None
    #         answer_item['content'] = answer['content'] if 'content' in answer else None
    #         answer_item['parise_num'] = answer['voteup_count']
    #         answer_item['comments_num'] = answer['comment_count']
    #         answer_item['create_time'] = answer['created_time']
    #         answer_item['update_time'] = answer['updated_time']
    #         answer_item['crawl_time'] = datetime.datetime.now()
    #         yield answer_item
    #     if not is_end:
    #         yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

    def start_requests(self):
        """
        Entry point of the spider.
        Starts the form-submission flow by passing the login page to the next
        callback, so the session cookies are preserved.
        """
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

    def login(self, response):
        """
        Submitting a wrong account and password shows that the login form also needs _xsrf.
        Extract _xsrf and pass the login parameters plus the captcha on to the next callback.
        """
        response_text = response.text
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)
            post_data = {
                '_xsrf': xsrf,
                'phone_num': '18645959590',
                'password': '6388815',
                'captcha': ''
            }
            t = str(int(time.time()))
            # entry point for the manually typed captcha
            captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            yield scrapy.Request(captcha_url, headers=self.headers,
                                 meta={'post_data': post_data}, callback=self.login_after_captcha)
            # entry point for the inverted-Chinese-character captcha
            # captcha_url_cn = "https://www.zhihu.com/captcha.gif?r={}&type=login&lang=cn".format(t)
            # yield scrapy.Request(captcha_url_cn, headers=self.headers, meta={'post_data': post_data}, callback=self.login_after_captcha_cn)

    def login_after_captcha_cn(self, response):
        with open('captcha.jpg', 'wb') as f:
            # the image must be written in binary mode
            f.write(response.body)
        # zheye: a library that recognises Zhihu's inverted-Chinese-character captcha
        z = zheye()
        positions = z.Recognize('captcha.jpg')
        """
        The coordinates zheye returns are in reverse order, and the values inside
        each pair are reversed too: the first pair recognised belongs to the last
        inverted character in the image, and each pair is (y, x).
        """
        pos_arr = []
        # sometimes one character is inverted, sometimes two; branch on how many pairs came back
        if len(positions) == 2:
            # if the second value of the first pair is larger than that of the second pair
            # (zheye's default output), e.g. [[43, 101], [52, 194]]   # before sorting
            if positions[0][1] > positions[1][1]:
                # move the second pair to the front and swap its values into (x, y) order
                pos_arr.append([positions[1][1], positions[1][0]])
                # pos_arr is now [[194, 52]]
                # then the first pair goes last, with [0][1] before [0][0]
                pos_arr.append([positions[0][1], positions[0][0]])
                # pos_arr is now [[194, 52], [101, 43]]   # after sorting
            # the pairs are already in order; only the values inside each pair need swapping
            else:
                # [[101, 43], [194, 52]]   # before sorting
                pos_arr.append([positions[0][1], positions[0][0]])
                pos_arr.append([positions[1][1], positions[1][0]])
                # [[43, 101], [52, 194]]   # after sorting
        # only one pair returned
        else:
            pos_arr.append([positions[0][1], positions[0][0]])
        post_data = response.meta.get('post_data', {})
        if len(positions) == 2:
            post_data['captcha'] = '{"img_size": [200, 44], "input_points": [[%.2f, %f], [%.2f, %f]]}' % \
                                   (pos_arr[0][0] / 2, pos_arr[0][1] / 2, pos_arr[1][0] / 2, pos_arr[1][1] / 2)
        else:
            post_data['captcha'] = '{"img_size": [200, 44], "input_points": [[%.2f, %f]]}' % \
                                   (pos_arr[0][0] / 2, pos_arr[0][1] / 2)
        post_data['captcha_type'] = 'cn'
        post_url = 'https://www.zhihu.com/login/phone_num'
        # the final submit uses scrapy.FormRequest, with the parameters in formdata
        return [scrapy.FormRequest(
            post_url,
            headers=self.headers,
            formdata=post_data,
            callback=self.check_login
        )]

    def login_after_captcha(self, response):
        # A session is one conversation over a persistent connection: whatever the first
        # visit stores is sent along automatically on later requests.
        # Whether or not you are logged in, Zhihu puts some values into the session
        # cookies on every visit, _xsrf among them, plus a few values set by the server.
        # Fetching the captcha with a separate requests call would open a second,
        # different session, so the captcha image would no longer match the login request.
        with open('captcha.jpg', 'wb') as f:
            # the image must be written in binary mode
            f.write(response.body)
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            pass
        captcha = input('请输入验证码:\n')
        post_data = response.meta.get('post_data', {})
        post_data['captcha'] = captcha
        post_url = 'https://www.zhihu.com/login/phone_num'
        # the final submit uses scrapy.FormRequest, with the parameters in formdata
        return [scrapy.FormRequest(
            post_url,
            headers=self.headers,
            formdata=post_data,
            callback=self.check_login
        )]

    def check_login(self, response):
        text_json = json.loads(response.text)
        print(text_json)
        # '登录成功' is the success message returned by the login API
        if 'msg' in text_json and text_json['msg'] == '登录成功':
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
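For debugging it is convenient to start the spider from a small entry script instead of the shell, so breakpoints work in an IDE. This is not part of the original post; a minimal sketch assuming the standard Scrapy project layout, with scrapy.cfg at the project root and a hypothetical main.py next to it:

# main.py -- hypothetical entry script, not in the original post
import os
import sys

from scrapy.cmdline import execute

# make sure the project root (the folder holding scrapy.cfg) is on sys.path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# equivalent to running `scrapy crawl zhihu` from the shell
execute(['scrapy', 'crawl', 'zhihu'])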

settings.py

# -*- coding: utf-8 -*-
import os

# Scrapy settings for article_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'article_spider'

SPIDER_MODULES = ['article_spider.spiders']
NEWSPIDER_MODULE = 'article_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'article_spider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'article_spider.middlewares.ArticleSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'article_spider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   # asynchronous writes to MySQL
   'article_spider.pipelines.MysqlTwistedPipeline': 1,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'mysql'

SQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
SQL_DATE_FORMAT = '%Y-%m-%d'
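Before running the crawl it is worth checking that the MYSQL_* values above actually connect. A quick standalone sketch using MySQLdb directly (the same driver the pipeline uses); this script is not part of the project, and the credentials simply mirror the settings shown above:

# standalone connectivity check (hypothetical helper, run once by hand)
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='mysql',
                       db='article_spider', charset='utf8')
cursor = conn.cursor()
cursor.execute('SELECT VERSION()')
print(cursor.fetchone())   # prints the server version if the settings are correct
cursor.close()
conn.close()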

items.py

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader
from article_spider.utils.common import exreact_num
from article_spider.settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT
import datetime
import re


class ZhihuQuestionItem(scrapy.Item):
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
          insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
          watch_user_num, click_num, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
          on DUPLICATE KEY UPDATE answer_num = VALUES(answer_num), comments_num = VALUES(comments_num)
        """
        zhihu_id = int(''.join(self['zhihu_id']))
        topics = ','.join(self['topics'])
        url = self['url'][0]
        title = ''.join(self['title'])
        content = '空'.join(self['content'])
        answer_num = exreact_num(''.join(self['answer_num']))
        comments_num = exreact_num(''.join(self['comments_num']))
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        # .NumberBoard-value usually yields two numbers: followers and views
        if len(self['watch_user_num']) == 2:
            watch_user_num = int(self['watch_user_num'][0])
            click_num = int(self['watch_user_num'][1])
        else:
            watch_user_num = int(self['watch_user_num'][0])
            click_num = 0
        params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
                  watch_user_num, click_num, crawl_time)
        return insert_sql, params
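items.py imports exreact_num from article_spider.utils.common, but the post does not include that module. A plausible sketch of the helper, assuming it only pulls the first run of digits out of strings such as "1,234 个回答" and falls back to 0; the real implementation may differ:

# article_spider/utils/common.py -- reconstruction of the missing helper
import re

def exreact_num(value):
    # drop thousands separators, then grab the first group of digits
    match_re = re.match('.*?(\d+).*', value.replace(',', ''), re.DOTALL)
    if match_re:
        return int(match_re.group(1))
    return 0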

pipelines.py

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):
    """Asynchronous MySQL writes through the Twisted connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        # handle exceptions raised by the asynchronous insert
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # deal with failures from the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert; each item class builds its own SQL statement
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
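For contrast, a plain synchronous pipeline doing the same job would block the engine on every insert, which is exactly what the adbapi connection pool above avoids. A minimal sketch, not part of the project, using the same MySQLdb driver and the same settings values:

# synchronous variant, shown for comparison only (hypothetical)
import MySQLdb

class MysqlPipeline(object):
    def __init__(self):
        # MySQLdb.connect(host, user, passwd, db, ...)
        self.conn = MySQLdb.connect('localhost', 'root', 'mysql', 'article_spider',
                                    charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql, params = item.get_insert_sql()
        # execute() and commit() block the crawl until MySQL answers
        self.cursor.execute(insert_sql, params)
        self.conn.commit()
        return item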