On the asyncio "ValueError: too many file descriptors in select()" error


Recently I have been writing a crawler with asyncio + aiohttp. The code looks like this:

from bs4 import BeautifulSoup  # used in cc() below; missing from the original snippet
import aiohttp
import asyncio

headers = {
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

async def ss(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            print(resp.status)
            d = await resp.text("utf-8", "ignore")
            cc(d)

def cc(v):
    print(v)
    soup = BeautifulSoup(v, "lxml")
    contents = soup.select("div.content")
    for conten in contents:
        articleAuthor = conten.select("div.blog_info > a")
        if articleAuthor:
            articleAuthor = articleAuthor[0]
        else:
            articleAuthor = ""
        print(articleAuthor)

loop = asyncio.get_event_loop()
tasks = [ss(url) for url in ["http://www.iteye.com/blogs/tag/java?page=" + str(x) for x in range(1, 2)]]
loop.run_until_complete(asyncio.gather(*tasks))

At first glance the code looks fine, and it runs fine too, but once the number of URLs grows to a thousand or more it raises ValueError: too many file descriptors in select().

Why does this happen?

Because the event loop asyncio creates here is built on select() (the default selector on Windows), and select() can only watch a limited number of file descriptors, typically around 512 on Windows and 1024 on Linux. The code above wraps every URL in its own coroutine and throws them all into one huge list at once, so thousands of sockets end up registered with select() at the same time and the limit is exceeded. A large-scale crawler cannot be written this way.
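To make the trigger concrete, here is a minimal sketch (the range(1, 2000) figure is an illustrative assumption, not from the original post): gathering a few thousand ss() coroutines at once means a few thousand sockets are open and registered with select() at the same time.

# Illustrative sketch (assumed URL count): every ss() coroutine opens its own
# ClientSession and socket, and asyncio.gather() starts them all at once, so the
# select()-based event loop must watch thousands of file descriptors together.
urls = ["http://www.iteye.com/blogs/tag/java?page=" + str(x) for x in range(1, 2000)]
tasks = [ss(url) for url in urls]                  # ~2000 coroutines scheduled at once
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))    # ValueError: too many file descriptors in select()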

So what can we do?

Use callbacks.

The code is as follows:

from bs4 import BeautifulSoup
import aiohttp
import asyncio
import time

urlss = []
headers = {
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

async def ss(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            print(resp.status)
            return await resp.text("utf-8", "ignore")

def cc(v):
    # v is the finished Task; result() retrieves the page text
    soup = BeautifulSoup(v.result(), "lxml")
    contents = soup.select("div.content")
    for conten in contents:
        articleUrl = conten.select("h3 > a")
        if articleUrl:
            articleUrl = articleUrl[0].get("href")
            urlss.append(articleUrl)

def cc2(v):
    # extract the first image URL from the article page
    soup = BeautifulSoup(v.result(), "lxml")
    articleImages_list = soup.select("img")
    if articleImages_list:
        articleImages_list = articleImages_list[0].get("src")
    else:
        articleImages_list = []
    print(articleImages_list)

now = lambda: time.time()
start = now()

loop = asyncio.get_event_loop()

# first pass: fetch the list pages and collect the article URLs
for url in ["http://www.iteye.com/blogs/tag/java?page=" + str(x) for x in range(1, 2)]:
    coroutine = ss(url)
    # create the task
    task = asyncio.ensure_future(coroutine)
    # attach the callback
    task.add_done_callback(cc)
    # drive the event loop until this single task is done
    loop.run_until_complete(task)

# second pass: fetch each collected article page and extract its first image
for url in urlss:
    coroutine = ss(url)
    task = asyncio.ensure_future(coroutine)
    task.add_done_callback(cc2)
    loop.run_until_complete(task)

print('TIME: ', now() - start)
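Note that what actually keeps this version under the limit is that each loop.run_until_complete(task) call drives a single task to completion before the next one starts, so only one connection is registered with select() at any moment; the callback just processes the result. If you still want concurrent downloads, a commonly used alternative is to keep asyncio.gather() but cap the number of simultaneous connections. The sketch below is my own illustration under assumptions (the limit of 100 and the fetch/main names are made up, not from the original code); it reuses the headers dict from above.

# A sketch of an alternative (not what the post does): keep gather(), but cap
# concurrency so select() never has to watch too many sockets at once.
import asyncio
import aiohttp

sem = asyncio.Semaphore(100)                      # at most 100 requests in flight (assumed value)

async def fetch(session, url):
    async with sem:                               # wait here if 100 requests are already running
        async with session.get(url, headers=headers) as resp:
            return await resp.text("utf-8", "ignore")

async def main(urls):
    # one shared session with a bounded connection pool instead of one session per URL
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=100)) as session:
        return await asyncio.gather(*[fetch(session, url) for url in urls])

# usage: pages = loop.run_until_complete(main(list_of_urls))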


