【Python】asyncio异步爬虫
来源:互联网 发布:java性能优化 编辑:程序博客网 时间:2024/06/05 03:56
第一步:抓取图片URL(结果逐行写入 img_url.txt)
# -*- coding: utf-8 -*-
"""Collect image URLs from the mzitu.com listing pages into img_url.txt.

Every listing page in ``urls`` is fetched concurrently (at most
MAX_CONCURRENCY requests in flight), the ``data-original`` attribute of each
thumbnail ``<img>`` is extracted, and the URLs are written one per line to
OUTPUT_PATH.
"""
import asyncio

import aiohttp
from pyquery import PyQuery as pq

urls = ["http://www.mzitu.com/page/{}/".format(i) for i in range(1, 157)]

# Limit the number of concurrent requests to 5.
MAX_CONCURRENCY = 5
OUTPUT_PATH = "img_url.txt"

COOKIE = """Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1507966069,1509850072,1509851337,1509851651; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1509851653"""
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"
)


async def get_html(url, session, semaphore):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Listing page to fetch; also sent as the ``Referer`` header.
        session: Shared ``aiohttp.ClientSession`` used for the request.
        semaphore: ``asyncio.Semaphore`` that caps concurrent requests.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status.
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Referer": url,
        "Cookie": COOKIE,
        "Host": "www.mzitu.com",
    }
    async with semaphore:
        async with session.get(url, headers=headers) as resp:
            # Fail loudly instead of parsing an error page as a listing.
            resp.raise_for_status()
            return await resp.text(encoding="utf-8")


async def parse(url, session, semaphore, out):
    """Write every thumbnail URL found on listing page ``url`` to ``out``.

    Args:
        url: Listing page to fetch and parse.
        session: Shared ``aiohttp.ClientSession``.
        semaphore: ``asyncio.Semaphore`` that caps concurrent requests.
        out: Writable text file; receives one image URL per line.

    Returns:
        The number of image URLs written for this page.
    """
    html = await get_html(url, session, semaphore)
    doc = pq(html)
    written = 0
    for img in doc(".postlist ul li a img").items():
        img_url = img.attr("data-original")
        # An <img> without the attribute yields None; skip it rather than
        # crash on ``None + "\n"`` and lose the rest of the page.
        if not img_url:
            continue
        out.write(img_url + "\n")
        written += 1
    return written


async def main():
    """Crawl all listing pages and report the pages that failed."""
    # Created inside the running loop so it is bound to the right loop on
    # every Python version.
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
    with open(OUTPUT_PATH, "w") as out:
        # One session for the whole crawl so connections are pooled.
        async with aiohttp.ClientSession() as session:
            # return_exceptions=True keeps one bad page from aborting the
            # remaining ones.
            results = await asyncio.gather(
                *(parse(url, session, semaphore, out) for url in urls),
                return_exceptions=True
            )
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print("failed: {} ({!r})".format(url, result))


if __name__ == "__main__":
    asyncio.run(main())
第二步:下载图片(从 img_url.txt 逐行读取上一步保存的URL)
# -*- coding: utf-8 -*-
"""Download every image listed in img_url.txt into the current directory.

URL_LIST_PATH is expected to hold one image URL per line. At most
MAX_CONCURRENCY downloads run at the same time.
"""
import asyncio
import time

import aiofiles
import aiohttp

URL_LIST_PATH = "img_url.txt"

# Limit the number of concurrent downloads to 5.
MAX_CONCURRENCY = 5

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"
    ),
    "Referer": 'http://i.meizitu.net',
}


def read_urls(path):
    """Return the non-blank, stripped lines of the text file at ``path``."""
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


async def download(url, session, semaphore, index):
    """Download one image and save it as ``<timestamp>_<index>.jpg``.

    Args:
        url: Image URL to fetch.
        session: Shared ``aiohttp.ClientSession`` used for the request.
        semaphore: ``asyncio.Semaphore`` that caps concurrent downloads.
        index: Position of ``url`` in the list; appended to the file name so
            two downloads finishing on the same clock tick cannot overwrite
            each other.

    Returns:
        True if the image was saved, False if the server did not answer 200.
    """
    async with semaphore:
        async with session.get(url, headers=HEADERS) as resp:
            print(resp.status)
            # Do not store an error page under a .jpg name.
            if resp.status != 200:
                return False
            img = await resp.read()
        fname = "{}_{}.jpg".format(time.time(), index)
        # The context manager closes (and flushes) the file even on error.
        async with aiofiles.open(fname, 'wb') as fp:
            await fp.write(img)
    return True


async def main():
    """Download all listed images and report the URLs that failed."""
    urls = read_urls(URL_LIST_PATH)
    # Created inside the running loop so it is bound to the right loop on
    # every Python version.
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
    # One session for the whole run so connections are pooled.
    async with aiohttp.ClientSession() as session:
        # gather() accepts an empty list and, with return_exceptions=True,
        # keeps one failed download from aborting the others.
        results = await asyncio.gather(
            *(
                download(url, session, semaphore, index)
                for index, url in enumerate(urls)
            ),
            return_exceptions=True
        )
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print("failed: {} ({!r})".format(url, result))


if __name__ == "__main__":
    asyncio.run(main())
阅读全文
0 0
- 【Python】asyncio异步爬虫
- 【asyncio】python异步IO
- python的异步IO模块asyncio学习
- python的异步IO(asyncio aiohttp)
- Python之异步IO( asyncio) 协程
- Python基础-异步任务IO-asyncio
- python asyncio
- Python学习:异步IO:协程和asyncio
- Python黑魔法 --- 异步IO( asyncio) 协程
- python异步爬虫
- Python 异步网络爬虫
- Python asyncio文档翻译--asyncio.Future
- Python标准模块--asyncio
- Python asyncio文档翻译
- python asyncio学习记录
- Python -asyncio笔记
- python asyncio模块
- Python asyncio使用范例
- cocos2dx 持续学习(三) 场景切换、弹出对话框
- TCP/IP 详解:arp 学习笔记
- 习题5.5 5.6
- Python的Cookie详解
- Python 如何用列表实现栈和队列
- 【Python】asyncio异步爬虫
- HSTS 网站http跳转到https
- 消息队列技术之基本概念
- 二进制位运算
- x$bh找到buffer cache属于哪个pool
- 2017年 代做安卓毕业设计 Android毕业设计
- 第五届电气学院比赛之XXX——整体设计
- Exception 异常
- magento 开发 -- 入门深入理解第六章 – 高级Magento模型