Crawling blog detail-page titles (Python 3.5+, async/await, aiohttp)


My company is still on Python 2.x, so I taught myself some Python 3.x in my spare time and found it quite interesting. So I wrote a crawler to see how it performs.

aiohttp is a third-party asynchronous HTTP library and it works quite well; the main point is that requests is blocking, while aiohttp is not.
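To show only the non-blocking part, here is a minimal sketch of a single GET request with aiohttp (the fetch function name and the example.com URL are just placeholders, not part of the crawler below):

import asyncio
import aiohttp

# minimal sketch: one asynchronous GET (Python 3.5+ syntax)
async def fetch(url):
    async with aiohttp.ClientSession() as session:      # the session manages the connection pool
        async with session.get(url) as resp:            # the request is awaited, it never blocks the loop
            return await resp.text()                    # the body is read asynchronously as well

loop = asyncio.get_event_loop()
html = loop.run_until_complete(fetch('http://example.com'))  # placeholder URL
print(len(html))
loop.close()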

The code is simple. Again, my own blog serves as the example:

# -*-coding:utf-8-*-
"""ayou"""
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import time

# asyncio.Semaphore() limits how many coroutines run at the same time;
# it has to be shared by all tasks, so it is created once at module level
sem = asyncio.Semaphore(5)

# async defines a coroutine
async def getPage(url, res_list, callback=None):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                # assert that the response status is 200
                assert resp.status == 200
                # dispatch according to the callback
                if callback == grabPage:
                    body = await resp.text()
                    callback(res_list, body)
                elif callback == grabPage1:
                    body = await resp.text()
                    callback(body)
                else:
                    return await resp.text()
                # the session is closed automatically by the async with block

# parse a list page and collect the blog post URLs
def grabPage(res_list, body):
    page = bs(body, "lxml")
    articles = page.find_all('div', attrs={'class': 'article_title'})
    for a in articles:
        x = a.find('a')['href']
        # print('http://blog.csdn.net' + x)
        res_list.add('http://blog.csdn.net' + x)

# grab the title of a blog detail page
def grabPage1(body):
    page = bs(body, "lxml")
    articles = page.find("title")
    print(articles.text)

start = time.time()
# total number of list pages
page_num = 4
# base URL of the list pages
page_url_base = 'http://blog.csdn.net/u013055678/article/list/'
# build the list-page URLs
page_urls = [page_url_base + str(i+1) for i in range(page_num)]
# asyncio.get_event_loop() creates the event loop
loop = asyncio.get_event_loop()
# set that stores all blog detail-page URLs
ret_list = set()
# coroutine tasks: fetch every list page and store the detail URLs in the set
tasks = [getPage(host, ret_list, callback=grabPage) for host in page_urls]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# coroutine tasks: fetch the title of every detail page
tasks = [getPage(url, ret_list, callback=grabPage1) for url in ret_list]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# close the event loop
loop.close()
print("Elapsed Time: %s" % (time.time() - start))

Elapsed time:



Compared with the multi-threaded crawler from my earlier post, the speed is about the same; it is a bit slower than the multi-process version.

async replaces Python 3.4's @asyncio.coroutine decorator, and await replaces yield from.

Similarly, async replaces Tornado's @gen.coroutine, and await replaces yield.
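As a rough illustration of the asyncio case, here is the same coroutine written in the Python 3.4 generator style and in the Python 3.5 native style (asyncio.sleep just stands in for real I/O here):

import asyncio

# Python 3.4 style: generator-based coroutine
@asyncio.coroutine
def old_style():
    yield from asyncio.sleep(1)   # await used to be written as yield from
    return "done"

# Python 3.5+ style: native coroutine
async def new_style():
    await asyncio.sleep(1)        # async/await replaces the decorator and yield from
    return "done"

loop = asyncio.get_event_loop()
print(loop.run_until_complete(old_style()))
print(loop.run_until_complete(new_style()))
loop.close()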


Now let's bring in Redis:

# -*-coding:utf-8-*-
"""ayou"""
import redis

class redisTools(object):
    def __init__(self, **key):
        self.pool = redis.ConnectionPool(**key)
        self.r = redis.StrictRedis(connection_pool=self.pool)

    # store a value of any format
    def setData(self, keyname, data):
        data = self.r.set(keyname, data)
        return data

    # get a value
    def getData(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        return data

    # get a value, then delete the key
    def getDataDel(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        # delete the key
        self.r.delete(keyname)
        return data

    # store values only; one key holds multiple values (a list)
    def setValue(self, keyname, data):
        data = self.r.lpush(keyname, data)
        return data

    # pop one value (blocking) and return it
    def getValue(self, keyname, coding="utf-8"):
        data = self.r.brpop(keyname, 0)[1]
        data = data.decode(coding)
        return data

    # store field/value pairs in a hash; one key holds multiple fields
    def setKeyValue(self, keyname, datakey, data):
        state = self.r.hset(keyname, datakey, data)
        if state == 0:
            return True
        else:
            return False

    # get a field's value
    def getKeyValue(self, keyname, datakey, coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        return data

    # get a field's value, then delete the field
    def getKeyValueDel(self, keyname, datakey, coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        # delete the field
        self.r.hdel(keyname, datakey)
        return data

    # delete a field by name
    def delAttribute(self, keyname, datakey):
        hdel = self.r.hdel(keyname, datakey)
        if hdel == 1:
            return True
        else:
            return False

    # get all field names under a key
    def getKeyAllAttribute(self, keyname):
        hkeys = self.r.hkeys(keyname)
        return hkeys

    # get the names of all keys
    def getKey(self):
        keys = self.r.keys()
        return keys

    # get how many items are left under one key (list length)
    def getLen(self, keyname):
        llen = self.r.llen(keyname)
        return llen

    # check whether a key exists
    def getExists(self, keyname):
        exists = self.r.exists(keyname)
        return exists

    # get the number of keys
    def getDbsize(self):
        dbsize = self.r.dbsize()
        return dbsize

    # delete a key
    def deleteKy(self, keyname):
        delete = self.r.delete(keyname)
        if delete == 1:
            return True
        else:
            return False

    # flush all data in the current database
    def flushDB(self):
        flushdb = self.r.flushdb()
        return flushdb

    # ====== sets ==========
    # add data; the set removes duplicates; returns how many members were added
    def setSets(self, keyname, *data):
        return self.r.sadd(keyname, *data)

    # get the set; returns a string if it has one member, otherwise a list
    def getSets(self, keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        if len(data) == 1:
            return list(data)[0].decode(coding)
        else:
            data = [d.decode(coding) for d in data]
            return data

    # get the set (string if one member, list otherwise), then remove the members
    def getSetsDel(self, keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        if len(data) == 1:
            data = list(data)[0].decode(coding)
            self.r.srem(keyname, data)
            return data
        else:
            data = [d.decode(coding) for d in data]
            [self.r.srem(keyname, d) for d in data]
            return data

    # remove members from the set; returns how many were removed
    def setsDel(self, keyname, *data):
        return self.r.srem(keyname, *data)

    # check whether a member exists in the set
    def isExist(self, keyname, data):
        return self.r.sismember(keyname, data)

    # set length
    def setsLen(self, keyname):
        return self.r.scard(keyname)

    # intersection of several sets, returned as a list
    def setsIntersection(self, *keyname):
        data = self.r.sinter(keyname)
        data = [d.decode("utf-8") for d in data]
        return data

    # union of several sets, returned as a list
    def setsAndSet(self, *keyname):
        data = self.r.sunion(keyname)
        data = [d.decode("utf-8") for d in data]
        return data
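A quick usage sketch of the class above, assuming a Redis server is running locally with the default connection settings (the key name "urls" and the two URLs are just examples):

from redisTools import redisTools

rt = redisTools()                                       # local Redis, default connection settings
rt.setSets("urls", "http://blog.csdn.net/a", "http://blog.csdn.net/b")
print(rt.setsLen("urls"))                               # 2, the set removes duplicates
print(rt.isExist("urls", "http://blog.csdn.net/a"))     # True
urls = rt.getSetsDel("urls")                            # a list here (two members), removed after reading
print(urls)
print(rt.getExists("urls"))                             # the key disappears once all members are removed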

The crawler rewritten to use Redis:

# -*-coding:utf-8-*-
"""ayou"""
from bs4 import BeautifulSoup as bs
import asyncio
import aiohttp
import time
from redisTools import redisTools

# asyncio.Semaphore() limits how many coroutines run at the same time;
# it has to be shared by all tasks, so it is created once at module level
sem = asyncio.Semaphore(5)

# async defines a coroutine
async def getPage(url, res_list, body_list, callback=None):
    print(url)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as resp:
                # assert that the response status is 200
                assert resp.status == 200
                # dispatch according to the callback
                if callback == grabPage:
                    body = await resp.text()
                    body_list.setSets("bodylist", body)
                    callback(res_list)
                elif callback == grabPage1:
                    body = await resp.text()
                    body_list.setSets("bodyxx", body)
                    callback(body_list)
                else:
                    return await resp.text()
                # the session is closed automatically by the async with block

# parse a list page (pulled from Redis) and collect the blog post URLs
def grabPage(res_list):
    body = res_list.getSetsDel("bodylist")
    # print(type(body))
    page = bs(body, "lxml")
    articles = page.find_all('div', attrs={'class': 'article_title'})
    for a in articles:
        x = a.find('a')['href']
        # print('http://blog.csdn.net' + x)
        res_list.setSets("xxurl", 'http://blog.csdn.net' + x)

# grab the title of a blog detail page (pulled from Redis)
def grabPage1(res_list):
    body = res_list.getSetsDel("bodyxx")
    # print(body)
    page = bs(body, "lxml")
    articles = page.find("title")
    print(articles.text)
    res_list.setSets("title", articles.text)

start = time.time()
# total number of list pages
page_num = 4
# base URL of the list pages
page_url_base = 'http://blog.csdn.net/u013055678/article/list/'
# build the list-page URLs
page_urls = [page_url_base + str(i+1) for i in range(page_num)]
# asyncio.get_event_loop() creates the event loop
loop = asyncio.get_event_loop()
# Redis stores all the blog detail-page URLs
rt = redisTools()
# coroutine tasks: fetch every list page and store the detail URLs in Redis
tasks = [getPage(host, rt, rt, callback=grabPage) for host in page_urls]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# pull the collected detail URLs out of Redis
ret_list = rt.getSetsDel("xxurl")
# coroutine tasks: fetch the title of every detail page
tasks = [getPage(url, ret_list, rt, callback=grabPage1) for url in ret_list]
# run the coroutines in the event loop
loop.run_until_complete(asyncio.gather(*tasks))
# close the event loop
loop.close()
print("Elapsed Time: %s" % (time.time() - start))
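Note that the titles now end up in the "title" set in Redis rather than in a Python variable; a small sketch to read them back afterwards (the key name follows the code above):

from redisTools import redisTools

rt = redisTools()                 # the same local Redis instance the crawler wrote to
titles = rt.getSets("title")      # a single string if there is one title, otherwise a list
if isinstance(titles, str):
    titles = [titles]
for t in titles:
    print(t)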




