Python 爬虫:爬取糗百成人图片(单线程版本)

来源:互联网 发布:阿里云ecs 数据库管理 编辑:程序博客网 时间:2024/04/30 08:49

爬取糗百成人图片站的所有图片,先来一个单线程版本:

# Simple single-threaded image crawler.
# (Python 3 source files are UTF-8 by default, so no encoding declaration
# is required here.)
import requests
import os
import re
from bs4 import BeautifulSoup
import threading
import urllib.parse
import urllib.request

FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'
# Directory (relative to the current working directory) that receives the
# downloaded images.
IMAGE_DIR = 'images'
# Seconds to wait for the server before giving up on a listing page; without
# a timeout a single stalled connection blocks the crawl forever.
REQUEST_TIMEOUT = 10

# Listing pages 1..99 of the site.
PAGE_URL_LIST = [FIRST_PAGE_URL.format(x) for x in range(1, 100)]

# Not used by this single-threaded version; kept so that code importing this
# name from the module keeps working.
gLock = threading.Lock()

# Characters that are path separators or are rejected in file names on common
# file systems, plus ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _build_filename(title, link):
    """Return a safe local file name built from an image title and its URL.

    The extension is taken from the path component of *link* (so a query
    string never leaks into it).  Characters in *title* that cannot appear
    in a file name are replaced with ``_``.  When *title* is missing or
    empty after cleaning, the base name from the URL is used instead.
    """
    url_path = urllib.parse.urlsplit(link).path
    basename = url_path.rsplit('/', 1)[-1]
    stem, ext = os.path.splitext(basename)
    # Strip surrounding spaces/dots so names such as '..' cannot escape the
    # target directory or be rejected by the file system.
    name = _UNSAFE_FILENAME_CHARS.sub('_', title or '').strip(' .')
    if not name:
        name = stem or 'image'
    return name + ext


def get_page(page_url):
    """Download every image found in the ``div.mala-text`` blocks of a page.

    Args:
        page_url: URL of the listing page to fetch.

    For each matching ``div`` the first ``<img>`` is taken; its ``src`` is
    resolved against *page_url* and its ``alt`` text becomes the file name.
    Blocks without an image or without a ``src`` are skipped.  A failed
    download is reported and the remaining images are still processed.
    Errors raised while fetching the listing page itself propagate to the
    caller.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, 'lxml')
    for block in soup.find_all('div', class_='mala-text'):
        img = block.find('img')
        if img is None:
            continue
        src = img.get('src')
        if not src:
            continue
        # Absolute URLs pass through unchanged; relative ones are resolved.
        link = urllib.parse.urljoin(page_url, src)
        filename = _build_filename(img.get('alt'), link)
        print(link)
        try:
            download_image(link, filename)
        except OSError as err:
            # urllib's URLError/HTTPError and local file errors are all
            # OSError subclasses; one bad image must not stop the crawl.
            print('download failed: {} ({})'.format(link, err))


def download_image(url, filename):
    """Save the resource at *url* as ``images/<filename>``.

    Args:
        url: Absolute URL of the image.
        filename: File name (not a path) to store the image under.

    The target directory is created on first use.

    Raises:
        OSError: if the download or the local write fails (this includes
            ``urllib.error.URLError``).
    """
    os.makedirs(IMAGE_DIR, exist_ok=True)
    path = os.path.join(IMAGE_DIR, filename)
    urllib.request.urlretrieve(url, filename=path)


def main():
    """Crawl every listing page in ``PAGE_URL_LIST`` in order."""
    for pageurl in PAGE_URL_LIST:
        print(pageurl)
        get_page(pageurl)


if __name__ == "__main__":
    main()


原创粉丝点击