知乎抓图 Python 脚本

来源:互联网 发布:vscode php格式化插件 编辑:程序博客网 时间:2024/06/07 06:29
  1. 在脚本中填写知乎问题的 question_id
  2. 在脚本中填写图片保存的路径 pic_path
  3. 运行脚本
#! /usr/bin/env python
# Download the images embedded in the answers to one Zhihu question.
#
# Usage:
#   1. Set question_id to the id of the Zhihu question.
#   2. Set pic_path to the directory the pictures should be saved in.
#   3. Run the script.
#
# Originally written for Python 2 (urllib2 / urlparse).  The import fallback
# below lets the same file run under Python 3; only single-argument print()
# calls are used so the output is identical on both.
import json
import os
import re
import sys
from os.path import basename

import requests

try:  # Python 2
    from urllib2 import urlopen
    from urlparse import urlsplit
except ImportError:  # Python 3
    from urllib.request import urlopen
    from urllib.parse import urlsplit

# --- user configuration ---------------------------------------------------
question_id = '30137203'
pic_path = '/Users/xxx/Desktop/pic/'

# --- derived / fixed settings ---------------------------------------------
url = 'https://www.zhihu.com/question/' + question_id
page_size = 50
post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
header = {
    'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
    'Host': "www.zhihu.com",
    'Referer': url,
}

# Captures the src of <img> tags whose URL contains "_b"
# (presumably Zhihu's large-size variant -- TODO confirm).
IMG_SRC_RE = re.compile('img .*?src="(.*?_b.*?)"')
ANSWER_COUNT_RE = re.compile('h3 data-num="(.*?)"')


def count_answers():
    """Return the answer count scraped from the question page.

    Exits with an explicit message when the page does not contain the
    expected ``h3 data-num="..."`` marker, instead of failing with a bare
    IndexError.
    """
    url_content = urlopen(url).read()
    if not isinstance(url_content, str):
        # Python 3 returns bytes; Python 2 already returns str here.
        url_content = url_content.decode('utf-8', 'replace')
    answers = ANSWER_COUNT_RE.findall(url_content)
    print(answers)
    if not answers:
        raise SystemExit('no answer count found on ' + url)
    return int(answers[0])


def fetch_answer_page(offset):
    """Return the list of answer HTML fragments starting at *offset*."""
    params = json.dumps({
        'url_token': question_id,
        'pagesize': page_size,
        'offset': offset,
    })
    data = {
        '_xsrf': '',
        'method': 'next',
        'params': params,
    }
    response = requests.post(post_url, data=data, headers=header)
    return response.json()["msg"]


def save_image(img_url):
    """Download *img_url* into ``pic_path``, named after the URL's basename."""
    # Fetch first so a failed download never leaves an empty file behind.
    img_data = urlopen(img_url).read()
    file_name = basename(urlsplit(img_url)[2])
    with open(os.path.join(pic_path, file_name), 'wb') as output:
        print(file_name)
        output.write(img_data)


def main():
    """Page through every answer and save each matching image."""
    if not os.path.exists(pic_path):
        # makedirs (not mkdir) so a nested target directory also works.
        os.makedirs(pic_path)

    limits = count_answers()
    offset = 0
    while offset < limits:
        answer_list = fetch_answer_page(offset)
        for img_url in IMG_SRC_RE.findall(''.join(answer_list)):
            try:
                save_image(img_url)
            except Exception as err:
                # Best effort: one bad image must not stop the run, but say
                # which one was skipped.  KeyboardInterrupt still propagates.
                sys.stderr.write('skipped %r: %r\n' % (img_url, err))
        offset += page_size


if __name__ == '__main__':
    main()
0 0
原创粉丝点击