Python 抓取头条街拍图片
来源:互联网 发布:建模软件哪个好知乎 编辑:程序博客网 时间:2024/05/22 10:31
# Scrape Toutiao "street snap" (街拍) gallery images and log each gallery to a
# text file.
# Adapted from Cui Daqing's video tutorial.
import json
import os
import re
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode, urlparse

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

headers = {
    # The value must be the bare UA string; the header name does not belong
    # inside it.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'application/json, text/javascript',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN'
}

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block a pool worker indefinitely.
REQUEST_TIMEOUT = 10

# Directory (relative to the current working directory) that receives images.
IMAGE_DIR = 'img'

# Text file that receives one JSON line per parsed gallery.
OUTPUT_FILE = 'toutiaojiepai.txt'

# Captures the JavaScript object literal assigned to ``var gallery`` in a
# detail page.
GALLERY_PATTERN = re.compile(r'var gallery = (.*?);', re.S)


def _fetch(url, error_message):
    """Return the response for a GET of *url*, or None on failure.

    A request exception prints *error_message* followed by the URL. A
    non-200 status returns None silently.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print(error_message, url)
        return None
    if response.status_code == 200:
        return response
    return None


def _image_name(url):
    """Return the last non-empty path segment of *url* ('' if there is none)."""
    return urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword.

    Returns:
        The response body as text, or None if the request failed or did not
        return HTTP 200.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    response = _fetch(url, '请求索引页面错误')
    return response.text if response is not None else None


def parse_page_index(html):
    """Yield the ``article_url`` of every entry in an index response.

    Args:
        html: JSON text returned by ``get_page_index``; may be None or empty.

    Yields:
        Non-empty article URLs. Nothing is yielded when *html* is missing,
        is not valid JSON, or has no usable ``data`` list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # ``data`` may be present but null, hence the ``or []``.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch a detail page.

    Returns:
        The page HTML as text, or None if the request failed or did not
        return HTTP 200.
    """
    response = _fetch(url, '请求详情页面错误')
    return response.text if response is not None else None


def parse_page_detail(html, url):
    """Extract the gallery from a detail page and download its images.

    Args:
        html: HTML text of the detail page.
        url: URL the page was fetched from; echoed in the result.

    Returns:
        A dict with keys ``title``, ``images`` (list of image URLs) and
        ``url``, or None when the page has no parseable ``var gallery``
        object containing ``sub_images``.
    """
    # Run the cheap regex first so pages without a gallery skip HTML parsing.
    match = GALLERY_PATTERN.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    images = [
        item.get('url')
        for item in data.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        down_image(image)

    # Requires the lxml package (the author installed the 32-bit wheel
    # lxml-4.0.0-cp36-cp36m-win32.whl); see
    # https://www.zhihu.com/question/49221958/answer/114914375
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.get_text() if soup.title else ''
    return {
        'title': title,
        'images': images,
        'url': url
    }


def down_image(url):
    """Download one image and store it through ``save_image``.

    The file name is the last path segment of *url*. Always returns None.
    """
    print('正在下载图片', url)
    name = _image_name(url)
    if not name:
        print('无法解析图片名称', url)
        return None
    print('图片名称:', name)
    response = _fetch(url, '请求图片错误')
    if response is not None:
        # response.content is the raw bytes; response.text would be decoded
        # text.
        save_image(response.content, name=name)
    return None


def save_image(content, name):
    """Write *content* to ``<cwd>/img/<name>.jpg`` unless it already exists.

    The ``img`` directory is created on demand.
    """
    directory = os.path.join(os.getcwd(), IMAGE_DIR)
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, '{0}.jpg'.format(name))
    print('图片路径', file_path)
    try:
        # 'x' fails if the file exists, so two pool workers cannot both
        # write the same image.
        with open(file_path, 'xb') as f:
            f.write(content)
    except FileExistsError:
        pass


def writeToFile(content):
    """Append *content* to the output text file as one JSON line."""
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(offset):
    """Process the index page at *offset*: fetch, parse and record galleries."""
    index_html = get_page_index(offset, '街拍')
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        # Pages without a gallery return None; do not log them as "null".
        if result:
            writeToFile(result)


if __name__ == '__main__':
    # Offsets 20, 40, ..., 400.
    # NOTE(review): the range starts at 1, so offset 0 is never requested;
    # confirm this is intended.
    groups = [x * 20 for x in range(1, 21)]
    with Pool() as pool:
        pool.map(main, groups)
阅读全文
0 0
- python 抓取头条街拍图片
- python抓取网页图片
- Python抓取图片
- python图片抓取
- python爬虫抓取图片
- python抓取网页图片
- python抓取百度图片
- Python抓取图片Demo
- Python 抓取图片示例
- Python多线程抓取图片
- python抓取图片示例
- python美女图片抓取
- Python抓取网页图片
- 利用Python抓取图片
- python 图片抓取
- python抓取网页图片
- python 抓取图片
- python 抓取图片
- 关于反射,动态加载,静态加载
- 微信小程序要完败APP了?
- 利用ajax给html动态拼接html代码
- linux常用命令
- 从未停止进化的MaxCompute
- python 抓取头条街拍图片
- 见招拆招 八大物联网安全关键技术盘点
- react写城市(省市区)联动
- 还在抱怨4G太贵? 别急,5G又要涨价了
- 路径
- PHP严格模式 Strict standards: Declaration of xxxxxx should be compatible with yyyyyy
- hdu 4099
- 一文掌握物联网开发技能树
- Linux服务器安全策略实战