python爬虫爬取糗百成人图片多线程版本

来源:互联网 发布:ubuntu redis 绑定ip 编辑:程序博客网 时间:2024/04/30 15:45

下面给出一个简单的多线程版本，爬取速度比单线程版本快很多，代码如下：

# encoding: utf-8
# Multithreaded crawler: page-fetching threads collect image links from the
# listing pages while downloader threads save the images to disk.
import os
import threading
import urllib.request

import requests
from bs4 import BeautifulSoup

FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'
IMAGE_DIR = 'images'  # directory the downloaded files are written to
REQUEST_TIMEOUT = 10  # seconds to wait for a listing page before giving up
PAGE_URL_LIST = []  # listing pages still to be fetched
IMG_URL_LIST = []  # all image links
NAME_LIST = []  # target filenames, kept index-aligned with IMG_URL_LIST
gLock = threading.Lock()  # guards the three shared lists above
# Set by main() once every get_page thread has finished, so that the
# download_image threads know an empty queue means "no more work".
_PRODUCERS_DONE = threading.Event()

for x in range(1, 100):
    page_url = FIRST_PAGE_URL.format(x)
    PAGE_URL_LIST.append(page_url)


def _build_filename(link, title):
    """Return the local filename for an image.

    The name is ``<title>.<suffix>``, where the suffix is the text after the
    last ``.`` in the final path segment of *link*. When *title* is missing
    or empty, the final path segment of *link* is used as the filename.
    Path separators inside *title* are replaced with ``_`` so the result is
    always a plain filename rather than a nested path.
    """
    basename = link.rsplit('/', 1)[-1]
    if not title:
        return basename
    suffix = basename.rsplit('.', 1)[-1]
    for sep in (os.sep, os.altsep):
        if sep:
            title = title.replace(sep, '_')
    return title + '.' + suffix  # name plus extension


def get_page():
    """Worker: fetch listing pages and queue every image found on them.

    Pops URLs from PAGE_URL_LIST until it is empty. For each page, every
    ``<div class="mala-text">`` that contains an ``<img>`` with a ``src``
    contributes one entry to IMG_URL_LIST and NAME_LIST. A page that cannot
    be fetched is reported and skipped so the thread keeps working.
    """
    while True:
        with gLock:
            if not PAGE_URL_LIST:
                break
            page_url = PAGE_URL_LIST.pop()
        try:
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('failed to fetch {}: {}'.format(page_url, exc))
            continue
        soup = BeautifulSoup(response.content, 'lxml')
        for block in soup.find_all('div', class_='mala-text'):
            img = block.find('img')
            if img is None:
                continue  # container without an image
            link = img.get('src')
            if not link:
                continue
            filename = _build_filename(link, img.get('alt'))
            # Append both entries under one lock hold so the two lists stay
            # index-aligned for the consumers.
            with gLock:
                NAME_LIST.append(filename)
                IMG_URL_LIST.append(link)


def download_image():
    """Worker: download queued images into IMAGE_DIR.

    Pops (url, filename) pairs from IMG_URL_LIST / NAME_LIST. While the
    queue is empty and the producers are still running, it waits briefly
    instead of spinning. It returns once _PRODUCERS_DONE is set and the
    queue is empty. A failed download is reported and skipped.
    """
    while True:
        # Read the flag BEFORE looking at the queue: if it was already set,
        # no producer can add items afterwards, so "empty" really means done.
        producers_done = _PRODUCERS_DONE.is_set()
        with gLock:
            if IMG_URL_LIST:
                url = IMG_URL_LIST.pop()
                filename = NAME_LIST.pop()
            else:
                url = None
        if url is None:
            if producers_done:
                break
            # Sleep up to 0.1 s, waking early if the producers finish.
            _PRODUCERS_DONE.wait(0.1)
            continue
        path = os.path.join(IMAGE_DIR, filename)
        try:
            urllib.request.urlretrieve(url, filename=path)
        except (OSError, ValueError) as exc:
            # OSError covers network and filesystem errors; ValueError is
            # raised for URLs urllib cannot interpret (e.g. relative links).
            print('failed to download {}: {}'.format(url, exc))


def main():
    """Run 4 page-fetching threads and 5 download threads to completion."""
    os.makedirs(IMAGE_DIR, exist_ok=True)
    _PRODUCERS_DONE.clear()
    producers = [threading.Thread(target=get_page) for _ in range(4)]
    consumers = [threading.Thread(target=download_image) for _ in range(5)]
    for th in producers + consumers:
        th.start()
    for th in producers:
        th.join()
    # All links are queued; let the downloaders drain the queue and exit.
    _PRODUCERS_DONE.set()
    for th in consumers:
        th.join()


if __name__ == "__main__":
    main()


原创粉丝点击