python 抓取头条街拍图片

来源:互联网 发布:建模软件哪个好知乎 编辑:程序博客网 时间:2024/05/22 10:31

#抓取头条图片,存入文本文件
#根据崔大庆视频整理
# Scrape Toutiao (头条) street-snap image galleries: query the search API,
# walk each article's detail page, download the gallery images into ./img,
# and append one JSON record per article to toutiaojiepai.txt.
# Based on Cui Qingcai's video tutorial.
import requests
import re
import json
import os
from requests.exceptions import RequestException
from multiprocessing import Pool
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from json.decoder import JSONDecodeError

headers = {
    # FIX: original value was 'User-Agent  Mozilla/...' — the header name was
    # duplicated inside the header value, producing a malformed User-Agent.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'application/json, text/javascript',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN'
}


def get_page_index(offset, keyword):
    """Fetch one page of search results from the Toutiao search API.

    Returns the raw response text (a JSON string) on HTTP 200, else None.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页面错误')
        return None


def parse_page_index(html):
    """Yield each article URL found in the search-API JSON payload.

    Silently yields nothing if the payload is not valid JSON.
    """
    try:
        data = json.loads(html)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    """Fetch an article detail page; return its HTML text or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # FIX: original printed the index-page error message here.
        print('请求详情页面错误')
        return None


def parse_page_detail(html, url):
    """Extract the gallery from a detail page, download its images, and
    return {'title', 'images', 'url'}; returns None if no gallery is found.

    Requires the lxml parser package (see
    https://www.zhihu.com/question/49221958/answer/114914375).
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    # The gallery data lives in an inline JS assignment: var gallery = {...};
    img_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(img_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                down_image(image)
            return {
                'title': title,
                'images': images,
                'url': url
            }


def down_image(url):
    """Download one image URL and hand its bytes to save_image()."""
    print('正在下载图片', url)
    names = re.split('/', url)
    # NOTE(review): assumes the image name is always the 5th path segment of
    # the CDN URL (e.g. http://host/large/<name>) — verify against live URLs.
    print('图片名称:', names[4])
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content is the binary payload (response.text is str).
            save_image(response.content, name=names[4])
        return None
    except RequestException:
        print('请求图片错误', url)
        return None


def save_image(content, name):
    """Write image bytes to ./img/<name>.jpg, skipping files that exist."""
    file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(), 'img', name, 'jpg')
    print('图片路径', file_path)
    # FIX: original required the user to create the img folder by hand;
    # create it on demand instead of crashing with FileNotFoundError.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def writeToFile(content):
    """Append one record as a JSON line to toutiaojiepai.txt."""
    with open("toutiaojiepai.txt", 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(offset):
    html = get_page_index(offset, '街拍')
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            # FIX: original wrote literal "null" lines when no gallery matched.
            if result:
                writeToFile(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(1, 21)]
    pool = Pool()
    pool.map(main, groups)
原创粉丝点击