分析AJAX抓取今日头条街拍美图(下)

来源:互联网 发布:淘宝无线端是什么意思 编辑:程序博客网 时间:2024/05/21 17:43

视频学习

构造AJAX请求

import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException


def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    # urlencode turns the dict into a URL query string.
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block forever;
        # timeout errors are RequestException subclasses, so they are
        # handled by the except clause below.
        response = requests.get(url, timeout=10)
    except RequestException:
        print("请求索引页出错!")
        return None
    if response.status_code == 200:
        return response.text
    return None

根据返回的JSON数据提取article_url

import json


def parse_page_index(html):
    """Yield the ``article_url`` of every entry in an index-page response.

    Args:
        html: JSON text of the search response, or ``None`` when the
            index request failed.

    Yields:
        Each non-empty ``article_url`` value found in the ``data`` list.
        Nothing is yielded when the input is empty, is not valid JSON,
        or carries no usable ``data`` list.
    """
    # A failed index request hands us None; json.loads(None) would raise.
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print("解析索引页出错!")
        return
    if not isinstance(payload, dict):
        return
    # "data" may be missing or null; treat both as "no entries".
    entries = payload.get('data') or []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        article_url = entry.get('article_url')
        # Skip entries without a URL so callers never receive None.
        if article_url:
            yield article_url

requests请求某篇文章

def get_page_detail(url):
    """Download the HTML of a single article page.

    Args:
        url: Address of the article to request.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    try:
        # Without a timeout a stalled connection would block forever;
        # timeout errors are RequestException subclasses, so they are
        # handled by the except clause below.
        response = requests.get(url, timeout=10)
    except RequestException:
        print("请求详情页出错!")
        return None
    if response.status_code == 200:
        return response.text
    return None

gallery遍历中提取图像集的image_url

import json
import re

from bs4 import BeautifulSoup

# Captures the text assigned to the page's inline ``var gallery`` variable.
# NOTE(review): the non-greedy match stops at the first ';', so a gallery
# blob that itself contains ';' would be truncated — confirm against real
# pages.
_GALLERY_PATTERN = re.compile('var gallery = (.*?);', re.S)


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Args:
        html: HTML text of the article page.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` (a list of the
        ``url`` field of every ``sub_images`` entry) when the page embeds
        a gallery; ``None`` when no gallery is found or the gallery data
        cannot be parsed.
    """
    print('详情页解析结果:')
    soup = BeautifulSoup(html, 'lxml')
    # Pages without a <title> element get an empty title instead of an
    # IndexError.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None
    # The captured text is expected to be JSON describing the image set.
    try:
        gallery = json.loads(match.group(1))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(gallery, dict):
        return None
    sub_images = gallery.get('sub_images')
    if not sub_images:
        return None
    images = [item.get('url') for item in sub_images]
    return {
        'title': title,
        'url': url,
        'images': images,
    }

综上

def main():
    """Crawl the first index page for '街拍' and print each gallery found.

    Fetches the index page, then for every article URL it lists downloads
    the article and prints the parsed result.
    """
    index_html = get_page_index(0, '街拍')
    # The index request returns None on failure; there is nothing to parse.
    if not index_html:
        return
    for url in parse_page_index(index_html):
        print(url)
        # A separate name keeps the index text from being shadowed.
        detail_html = get_page_detail(url)
        if detail_html:
            result = parse_page_detail(detail_html, url)
            print(result)


if __name__ == '__main__':
    main()

运行结果

(此处原为运行结果截图,图片缺失)

py文件下载

    • 构造AJAX请求
    • 根据返回的JSON数据提取article_url
    • requests请求某篇文章
    • gallery遍历中提取图像集的image_url
    • 综上
    • 运行结果

0 0