Scrapy: logging into Zhihu with a captcha and crawling it
Asynchronous MySQL storage, an incremental crawl (re-crawled questions are updated via ON DUPLICATE KEY UPDATE instead of being inserted again), and a simple captcha login.
spider
# -*- coding: utf-8 -*-
import json
import re
import time

import scrapy
from PIL import Image
from urllib import parse
from scrapy.loader import ItemLoader
from zheye import zheye

from article_spider.items import ZhihuQuestionItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    # start_answer_url = 'https://www.zhihu.com/api/v4/questions/{}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={}&offset={}'
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def parse(self, response):
        # Grab the href of every <a> tag on the page
        all_urls = response.css('a::attr(href)').extract()
        # Zhihu links are relative paths, so join them with the current URL
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # Keep only URLs that start with https://www.zhihu.com/question/; everything else is filtered out
        all_urls = filter(lambda x: x.startswith('https://www.zhihu.com/question/'), all_urls)
        for url in all_urls:
            # Split each question URL into the request URL and the question id
            match_obj = re.match(r'(.*question/(\d+))(/|$)', url)
            if match_obj:
                request_url = match_obj.group(1)   # request url
                question_id = match_obj.group(2)   # question id
                yield scrapy.Request(request_url, headers=self.headers,
                                     meta={'question_id': question_id},
                                     callback=self.parse_question)
            else:
                # Not a question URL: hand it back to the downloader and keep crawling
                yield scrapy.Request(url, headers=self.headers)

    def parse_question(self, response):
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        question_id = response.meta.get('question_id', '')
        # a = response.text        # for testing
        # a.encode('utf-8')
        item_loader.add_xpath('title', '//h1[@class="QuestionHeader-title"]/text()')
        item_loader.add_xpath('content', '//span[@class="RichText"]/text()')
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_value('url', response.url)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
        item_loader.add_xpath('topics', '//div[@class="Popover"]/div/text()')
        article_item = item_loader.load_item()
        # The answer-parsing callback below still has a bug; no time to fix it now, will come back to it later
        # yield scrapy.Request(self.start_answer_url.format(question_id, 20, 40),
        #                      headers=self.headers, callback=self.parse_answer)
        yield article_item

    # def parse_answer(self, response):
    #     ans_json = json.loads(response.text)
    #     is_end = ans_json['paging']['is_end']
    #     next_url = ans_json['paging']['next']
    #     for answer in ans_json['data']:
    #         answer_item = ZhihuAnswerItem()
    #         answer_item['zhihu_id'] = answer['id']
    #         answer_item['url'] = answer['url']
    #         answer_item['question_id'] = answer['question']['id']
    #         answer_item['author_id'] = answer['author']['id'] if 'id' in answer['author'] else None
    #         answer_item['content'] = answer['content'] if 'content' in answer else None
    #         answer_item['parise_num'] = answer['voteup_count']
    #         answer_item['comments_num'] = answer['comment_count']
    #         answer_item['create_time'] = answer['created_time']
    #         answer_item['update_time'] = answer['updated_time']
    #         answer_item['crawl_time'] = datetime.datetime.now()
    #         yield answer_item
    #     if not is_end:
    #         yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

    def start_requests(self):
        """
        Entry point of the spider. It lets us submit the login form first
        and pass the login page on to the next callback while keeping the
        session cookies alive.
        """
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

    def login(self, response):
        """
        Submitting a wrong account and password shows that the login form
        also requires an _xsrf token. Extract _xsrf, then pass the login
        fields and the captcha on to the next callback.
        """
        response_text = response.text
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
        if match_obj:
            xsrf = match_obj.group(1)
            if xsrf:
                post_data = {
                    '_xsrf': xsrf,
                    'phone_num': '18645959590',
                    'password': '6388815',
                    'captcha': ''
                }
                t = str(int(time.time()))
                # Entry point for the manually typed digits/letters captcha
                captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
                yield scrapy.Request(captcha_url, headers=self.headers,
                                     meta={'post_data': post_data},
                                     callback=self.login_after_captcha)
                # Entry point for the inverted-Chinese-character captcha
                # captcha_url_cn = "https://www.zhihu.com/captcha.gif?r={}&type=login&lang=cn".format(t)
                # yield scrapy.Request(captcha_url_cn, headers=self.headers,
                #                      meta={'post_data': post_data},
                #                      callback=self.login_after_captcha_cn)

    def login_after_captcha_cn(self, response):
        # The image must be written in binary mode
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)
        # zheye ("者也") recognizes Zhihu's inverted-character captcha
        z = zheye()
        positions = z.Recognize('captcha.jpg')
        # zheye returns the coordinates in reverse: the first pair belongs to the
        # last inverted character in the image, and each pair is (y, x) rather than (x, y).
        pos_arr = []
        # Sometimes one character is inverted, sometimes two; branch on how many pairs came back
        if len(positions) == 2:
            if positions[0][1] > positions[1][1]:
                # The right-hand character was recognized first: put the second pair
                # first and swap each (y, x) pair into (x, y) order
                pos_arr.append([positions[1][1], positions[1][0]])
                pos_arr.append([positions[0][1], positions[0][0]])
            else:
                # Already in left-to-right order; just swap each (y, x) into (x, y)
                pos_arr.append([positions[0][1], positions[0][0]])
                pos_arr.append([positions[1][1], positions[1][0]])
        else:
            # Only one inverted character
            pos_arr.append([positions[0][1], positions[0][0]])
        post_data = response.meta.get('post_data', {})
        if len(positions) == 2:
            post_data['captcha'] = '{"img_size": [200, 44], "input_points": [[%.2f, %f], [%.2f, %f]]}' % (
                pos_arr[0][0] / 2, pos_arr[0][1] / 2, pos_arr[1][0] / 2, pos_arr[1][1] / 2)
        else:
            post_data['captcha'] = '{"img_size": [200, 44], "input_points": [[%.2f, %f]]}' % (
                pos_arr[0][0] / 2, pos_arr[0][1] / 2)
        post_data['captcha_type'] = 'cn'
        post_url = 'https://www.zhihu.com/login/phone_num'
        # The final submit uses scrapy.FormRequest, so the payload goes in formdata
        return [scrapy.FormRequest(
            post_url,
            headers=self.headers,
            formdata=post_data,
            callback=self.check_login
        )]

    def login_after_captcha(self, response):
        # A session is one conversation over a keep-alive connection: whatever the server sets
        # on the first visit is carried along on every later request.
        # When you visit Zhihu, logged in or not, it drops some values into the session cookies,
        # including _xsrf and a few other server-set values.
        # Fetching the captcha with a separate requests call would create a second, independent
        # session, so the captcha image and the login request would no longer match. That is why
        # the image is downloaded through the same Scrapy session here.
        with open('captcha.jpg', 'wb') as f:
            # The image must be written in binary mode
            f.write(response.body)
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            pass
        captcha = input('Please enter the captcha:\n')
        post_data = response.meta.get('post_data', {})
        post_data['captcha'] = captcha
        post_url = 'https://www.zhihu.com/login/phone_num'
        return [scrapy.FormRequest(
            post_url,
            headers=self.headers,
            formdata=post_data,
            callback=self.check_login
        )]

    def check_login(self, response):
        text_json = json.loads(response.text)
        print(text_json)
        # '登录成功' is the server's "login successful" message
        if 'msg' in text_json and text_json['msg'] == '登录成功':
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)
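A note on login_after_captcha_cn: zheye returns one (y, x) pair per inverted character, last-recognized character first, while Zhihu's captcha field expects (x, y) points ordered left to right and scaled to the declared img_size of 200x44; the division by 2 in the spider assumes the downloaded gif is exactly twice that size. The sketch below pulls that conversion into a standalone helper so it is easier to reason about and test. The function name build_cn_captcha_payload and the fixed scale factor are my own assumptions, not part of the original spider.

import json

def build_cn_captcha_payload(positions, scale=2):
    """Hypothetical helper: convert zheye (y, x) pairs into Zhihu's captcha JSON.

    positions: list of [y, x] pairs as returned by zheye.Recognize()
    scale:     assumed ratio between the downloaded gif and the declared img_size
    """
    # Swap each (y, x) pair into (x, y) and sort the points left to right
    points = sorted([[x, y] for y, x in positions], key=lambda p: p[0])
    # Scale down to the coordinate system Zhihu expects (img_size 200x44)
    points = [[round(x / scale, 2), round(y / scale, 2)] for x, y in points]
    return json.dumps({"img_size": [200, 44], "input_points": points})

# Example with two inverted characters recognized at (y=43, x=101) and (y=52, x=194):
# build_cn_captcha_payload([[43, 101], [52, 194]])
# -> '{"img_size": [200, 44], "input_points": [[50.5, 21.5], [97.0, 26.0]]}'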
settings.py
# -*- coding: utf-8 -*-
import os

# Scrapy settings for article_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'article_spider'

SPIDER_MODULES = ['article_spider.spiders']
NEWSPIDER_MODULE = 'article_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'article_spider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'article_spider.middlewares.ArticleSpiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'article_spider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # asynchronous insert into MySQL
    'article_spider.pipelines.MysqlTwistedPipeline': 1,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'mysql'

SQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
SQL_DATE_FORMAT = '%Y-%m-%d'
items.py
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join

from article_spider.settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT
from article_spider.utils.common import exreact_num


class ZhihuQuestionItem(scrapy.Item):
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num,
                comments_num, watch_user_num, click_num, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            on DUPLICATE KEY UPDATE answer_num = VALUES(answer_num), comments_num = VALUES(comments_num)
        """
        zhihu_id = int(''.join(self['zhihu_id']))
        topics = ','.join(self['topics'])
        url = self['url'][0]
        title = ''.join(self['title'])
        content = '空'.join(self['content'])
        answer_num = exreact_num(''.join(self['answer_num']))
        comments_num = exreact_num(''.join(self['comments_num']))
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        # The .NumberBoard-value selector usually yields two values: the follower
        # count and the view count. The raw strings may contain thousands
        # separators, so run them through the same numeric extraction helper.
        if len(self['watch_user_num']) == 2:
            watch_user_num = exreact_num(self['watch_user_num'][0])
            click_num = exreact_num(self['watch_user_num'][1])
        else:
            watch_user_num = exreact_num(self['watch_user_num'][0])
            click_num = 0

        params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
                  watch_user_num, click_num, crawl_time)
        return insert_sql, params
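items.py imports exreact_num from article_spider.utils.common, which is not shown in this post. The counts scraped from the question page arrive as strings such as '1,234 个回答', so the helper presumably just pulls the digits out and returns an int. A minimal sketch under that assumption:

# article_spider/utils/common.py (sketch; the original file is not shown in this post)
import re

def exreact_num(value):
    """Pull the first integer out of a string like '1,234 个回答'; return 0 if there is none."""
    match_obj = re.match(r'.*?(\d+)', value.replace(',', ''))
    if match_obj:
        return int(match_obj.group(1))
    return 0

As long as the real helper returns an int (and 0 when no number is present), the get_insert_sql code above works unchanged.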
pipelines.py
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):
    """Asynchronous MySQL writes via Twisted's adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert in a pool thread so the reactor is never blocked
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Handle insertion errors asynchronously
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # Log any failure raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert; each item class builds its own SQL,
        # so one pipeline handles different kinds of items
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
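One thing the post never shows is the zhihu_question table itself. For the insert built in get_insert_sql to behave incrementally, zhihu_id needs to be the primary (or a unique) key, so that re-crawling a question fires the ON DUPLICATE KEY UPDATE clause instead of raising a duplicate-key error. Below is a hypothetical one-off setup script: the column types are my guesses; only the column names and the key on zhihu_id follow from items.py, and the connection parameters mirror the MYSQL_* values in settings.py.

# Hypothetical setup script; the real schema is not shown in the original post.
import MySQLdb

create_sql = """
CREATE TABLE IF NOT EXISTS zhihu_question (
    zhihu_id        BIGINT NOT NULL,
    topics          VARCHAR(255),
    url             VARCHAR(300),
    title           VARCHAR(200),
    content         LONGTEXT,
    answer_num      INT,
    comments_num    INT,
    watch_user_num  INT,
    click_num       INT,
    crawl_time      DATETIME,
    PRIMARY KEY (zhihu_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
"""

conn = MySQLdb.connect(host='localhost', db='article_spider', user='root',
                       passwd='mysql', charset='utf8')
cur = conn.cursor()
cur.execute(create_sql)
conn.commit()
conn.close()

With that key in place, re-running scrapy crawl zhihu simply refreshes answer_num and comments_num for questions that are already stored.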