Multithreaded Queue Crawler


Crawling Douban Movies

Approach

(Pipeline diagram from the original post omitted.) The crawler is a five-stage pipeline connected by queues: url_pool → res_pool → detail_pool → dres_pool → save_pool, with a small pool of worker threads moving items from each queue to the next.
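Every hop in that pipeline is the same producer/consumer pattern: a worker thread blocks on Queue.get(), processes the item, pushes the result into the next queue, and exits when it receives an 'end' sentinel. A minimal sketch of a single hop, independent of the crawler below (in_q, out_q, and the upper-casing stand-in for real work are illustrative names only):

    from queue import Queue
    from threading import Thread

    in_q, out_q = Queue(), Queue()

    def worker():
        while True:
            item = in_q.get()        # blocks until an item is available
            if item == 'end':        # sentinel: upstream is finished
                break
            out_q.put(item.upper())  # stand-in for real processing

    t = Thread(target=worker)
    t.start()
    in_q.put('hello')
    in_q.put('end')
    t.join()
    print(out_q.get())  # prints: HELLO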

Implementation

    # 1 Imports
    from queue import Queue
    from threading import Thread
    import requests
    from lxml import etree
    import pymongo
    import json
    # 2 Create the queues
    url_pool = Queue()     # initial urls
    res_pool = Queue()     # responses for the initial urls
    detail_pool = Queue()  # detail-page urls
    dres_pool = Queue()    # detail-page responses
    save_pool = Queue()    # records waiting to be saved to the database
    # 3 Produce the initial urls
    class Init_Url(Thread):
        def __init__(self, name):
            Thread.__init__(self)
            self.start_url = 'https://m.douban.com/rexxar/api/v2/subject_collection/' \
                             'filter_movie_score_hot/items?os=ios&' \
                             'for_mobile=1&start={}&count=50&' \
                             'loc_id=108288&_=1504077041884'
            self.name = name  # the start offset, also reused as the thread name

        # build one url and put it into the queue
        def run(self):
            url = self.start_url.format(str(self.name))
            url_pool.put(url)
    # 4 Fetch the initial urls
    class Ires(Thread):
        def __init__(self):
            Thread.__init__(self)
            self.headers = {
                'Host': 'm.douban.com',
                'User-Agent': 'any mobile User-Agent string',
                'Cookie': 'your local cookies',
            }

        def run(self):
            while True:
                # take one url from the initial url queue
                url = url_pool.get()
                # end condition (all later stages work the same way)
                if url != 'end' and url:
                    try:
                        # send the request and collect the response
                        res = requests.get(url, headers=self.headers)
                        # put the response body into the response queue
                        res_pool.put(res.text)
                    except Exception as e:
                        print(e)
                        break
                else:
                    print('initial url stage finished')
                    break
    # 5 Parse the initial responses
    class Detail(Thread):
        def __init__(self):
            Thread.__init__(self)

        def run(self):
            while True:
                # take one response
                res = res_pool.get()
                # check the end condition
                if res != 'end' and res:
                    try:
                        # parse the json response
                        response = json.loads(res)
                        # list of movie items
                        item_list = response['subject_collection_items']
                        # build the url of each detail page
                        for item in item_list:
                            detail_url = 'https://m.douban.com/movie/subject/' + item['id']
                            # put it into the detail url queue
                            detail_pool.put(detail_url)
                    except Exception as e:
                        print(e)
                        break
                else:
                    print('all detail urls queued')
                    break
    # 6 Take a detail url and fetch it (same pattern as above)
    class Dreq(Thread):
        def __init__(self):
            Thread.__init__(self)

        def run(self):
            while True:
                detail_url = detail_pool.get()
                if detail_url != 'end' and detail_url:
                    try:
                        res = requests.get(detail_url)
                        response = etree.HTML(res.content)
                        dres_pool.put(response)
                    except Exception as e:
                        print(e)
                        break
                else:
                    print('detail page requests finished')
                    break
    # 7 Parse the detail pages and queue the results
    class Dres(Thread):
        def __init__(self):
            Thread.__init__(self)

        def run(self):
            while True:
                response = dres_pool.get()
                # test against None: an lxml element without children is falsy,
                # so a plain truthiness check would misbehave here
                if response != 'end' and response is not None:
                    try:
                        # movie title
                        name = ''.join(response.xpath(r"//div[@class='card']/h1/text()"))
                        # rating
                        score = ''.join(response.xpath(r"//p[@class='rating']/strong/text()"))
                        # number of ratings
                        number = ''.join(response.xpath(r"//p[@class='rating']/span/text()"))
                        # cast and other metadata
                        meta = ''.join(response.xpath(r"//p[@class='meta']/text()")).strip()
                        # synopsis
                        synopsis = ''.join(response.xpath(r"//div[@class='bd']/p/text()")).strip()
                        # urls of recommended movies
                        link_list = response.xpath(r"/html/body/div[2]/div[1]/section[8]/div/ul/li/a/@href")
                        # titles of recommended movies
                        movie_list = response.xpath(r"//div[@class='wp']/h3/text()")
                        if link_list and movie_list:
                            link_list = ['https://m.douban.com' + link for link in link_list]
                            link_movie = dict(zip(movie_list, link_list))
                        else:
                            link_movie = None
                        # queue the record for the database stage
                        if link_movie:
                            save_pool.put({name: [score, number, meta, synopsis, link_movie]})
                        else:
                            save_pool.put({name: [score, number, meta, synopsis]})
                    except Exception as e:
                        print(e)
                        break
                else:
                    print('parsing finished')
                    break
    # 8 Save to the database
    class Save(Thread):
        def __init__(self):
            Thread.__init__(self)

        def run(self):
            # connect to the database (local connection by default)
            client = pymongo.MongoClient()
            # select the database
            db = client['douban']
            # select the collection
            collection = db['movie']
            while True:
                result = save_pool.get()
                if result != 'end' and result:
                    try:
                        # insert one record (insert_one replaces the removed insert())
                        collection.insert_one(result)
                        print('saved one record')
                    except Exception as e:
                        print(e)
                        break
                else:
                    print('database stage finished')
                    break
    # 9 Main function
    def main():
        # producers of the initial urls (start offsets 0, 50, ..., 500)
        init_threads = [Init_Url(i) for i in range(0, 501, 50)]
        # five worker threads per pipeline stage
        b_threads = [Ires() for _ in range(5)]
        c_threads = [Detail() for _ in range(5)]
        d_threads = [Dreq() for _ in range(5)]
        e_threads = [Dres() for _ in range(5)]
        f_threads = [Save() for _ in range(5)]
        for t in init_threads + b_threads + c_threads + d_threads + e_threads + f_threads:
            t.start()
        # Once every producer feeding a queue has finished, push one 'end'
        # sentinel per consumer of that queue so each worker shuts down
        # cleanly. (Checking is_alive() right after start() would race:
        # the threads are almost always still running at that point.)
        stages = [(init_threads, url_pool, b_threads),
                  (b_threads, res_pool, c_threads),
                  (c_threads, detail_pool, d_threads),
                  (d_threads, dres_pool, e_threads),
                  (e_threads, save_pool, f_threads)]
        for producers, pool, consumers in stages:
            for t in producers:
                t.join()
            for _ in consumers:
                pool.put('end')
        for t in f_threads:
            t.join()
    # 10 Entry point
    if __name__ == '__main__':
        main()
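As a design note, manual 'end' sentinels are only one way to detect completion; the standard library's Queue can also track it with task_done() and join(), which removes the need to push one sentinel per worker. A minimal sketch of that alternative, not taken from the original post:

    from queue import Queue
    from threading import Thread

    q = Queue()

    def worker():
        while True:
            item = q.get()
            try:
                print('processing', item)  # stand-in for real work
            finally:
                q.task_done()              # mark the item as handled

    # daemon workers exit automatically once the main thread finishes
    for _ in range(5):
        Thread(target=worker, daemon=True).start()

    for i in range(20):
        q.put(i)
    q.join()  # blocks until every queued item has been marked done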

Conclusion

The stages are essentially repetitive: each thread gets from one queue and puts into the next, blocking while it waits, with an 'end' sentinel as the termination condition. One caveat: MongoDB document keys cannot contain '.', so inserting a record keyed by a movie title that contains '.' will fail.
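One way to guard against that failure is sketched below; it assumes the snippet replaces the save_pool.put(...) calls inside Dres.run, where name, score, number, meta, and synopsis are in scope. The safe_key helper and the '·' replacement character are illustrative choices, not part of the original code:

    # Option 1: sanitize the title before using it as a document key
    def safe_key(name):
        # MongoDB forbids '.' in document keys (and a leading '$'),
        # so swap it for an arbitrary stand-in character
        return name.replace('.', '·')

    save_pool.put({safe_key(name): [score, number, meta, synopsis]})

    # Option 2 (more robust): store the title as a field value,
    # where MongoDB's key restrictions do not apply
    save_pool.put({'name': name, 'score': score, 'number': number,
                   'meta': meta, 'synopsis': synopsis})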