Scraping All Featured-Thread Photos from a Baidu Tieba Bar

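The script below walks a bar's featured ("good") listing page by page, collects the URL and title of every featured thread, then pages through each thread and saves every posted image (an img of class BDE_Image wider than 400 pixels) into a directory named after the thread title under D:\pic. Every page request carries a User-Agent picked at random from a pool of browser strings.
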
# coding=utf-8
import os
import random
import re

import requests
from lxml import etree
from urllib.parse import urlparse
import urllib.request as urllib

# Pool of desktop browser User-Agent strings; one is picked at random per
# request so the crawler does not always identify itself the same way.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


class Crawler(object):
    def __init__(self, start_url):
        self.index = 1      # running number of the image inside the current thread
        self.tag = 0        # index of the current thread title in self.tagname
        self.tagname = []   # thread titles, used as directory names
        self.start_url = start_url
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    @staticmethod
    def request(url, **kwargs):
        try:
            page = requests.get(url, **kwargs)
            return page.text
        except requests.RequestException:
            return ''

    def get_max_page(self, html, regular):
        '''Extract the highest page number via the given XPath; default to 1.'''
        html = etree.HTML(html)
        pages = html.xpath(regular)
        try:
            max_page, *_ = pages
        except ValueError:
            max_page = '1'
        return max_page

    def get_good_pages(self, html):
        '''
        Yield every page of the bar's featured-thread listing, e.g.
        https://tieba.baidu.com/f?kw=图片&ie=utf-8&tab=good&cid=&pn=50
        '''
        max_page = self.get_max_page(html, '//a[@class="last"]//@href')
        if max_page != '1':
            max_page = max_page.split('=')[-1]
        max_page = int(max_page) + 1
        for i in range(0, max_page, 50):  # the listing advances 50 threads per page
            yield self.start_url.split('&')[0] + '&ie=utf-8&cid=0&pn={}'.format(i)

    def get_good_urls(self, html):
        '''
        Yield the URL of every featured thread, e.g.
        https://tieba.baidu.com/p/3868212854
        '''
        pages = self.get_good_pages(html)
        for page in pages:
            html = self.request(page)
            html = etree.HTML(html)
            self.tagname.extend(html.xpath('//a[@class = "j_th_tit"]//@title'))
            urls = html.xpath('//a[@class = "j_th_tit"]//@href')
            for url in urls:
                url = url.split('?')[0]
                url = self.domain + url
                yield url
                self.tag += 1   # move on to the next thread title
                self.index = 1  # restart image numbering for the new thread

    def get_post_urls(self, url, max_page):
        '''
        Yield every page of one thread, e.g.
        https://tieba.baidu.com/p/3868212854?pn=2
        '''
        for i in range(1, max_page + 1):
            yield url + '?pn={}'.format(i)

    def get_single_urls(self, html):
        urls = self.get_good_urls(html)
        for url in urls:
            html = self.request(url)
            max_page = self.get_max_page(html, '//li[@class = "l_reply_num"]//@max-page')
            max_page = int(max_page) + 1
            single_urls = self.get_post_urls(url, max_page)
            yield single_urls

    def get_imgs(self, html):
        '''Download every image on one page of a featured thread.'''
        # Strip characters that are illegal in Windows directory names.
        dirname = re.sub(r'[\\/:*?"<>|]', '_', self.tagname[self.tag])
        jpgdir = r'D:\pic\{}'.format(dirname)
        if not os.path.exists(jpgdir):
            os.makedirs(jpgdir)
        html = etree.HTML(html)
        # Only posted images (class BDE_Image) wider than 400px, skipping emoticons.
        img_urls = html.xpath('//img[@class = "BDE_Image" and @width > "400"]//@src')
        for img in img_urls:
            print("Downloading image {}".format(self.index))
            urllib.urlretrieve(img, r'{}\{}.jpg'.format(jpgdir, self.index))
            self.index += 1

    def run(self, html):
        single_urls = self.get_single_urls(html)
        for single_url in single_urls:
            for url in single_url:
                user_agent = random.choice(user_agent_list)  # disguise the request
                headers = {'User-Agent': user_agent}
                html = self.request(url, headers=headers)
                self.get_imgs(html)


if __name__ == '__main__':
    post_bar = input("Enter the bar name: ")
    start_url = 'https://tieba.baidu.com/f/good?kw={}&ie=utf-8&cid=0&pn=0'.format(post_bar)
    crawler = Crawler(start_url)
    html = crawler.request(start_url)
    if '本吧暂不开放' in html:  # the bar is blocked for policy reasons
        print("Sorry, this bar is closed in accordance with relevant laws, regulations, and policies.")
    elif 'page404' in html:
        print('Sorry, the page you are looking for does not exist.')
    else:
        print("Downloading all featured-thread images from the {} bar".format(post_bar))
        crawler.run(html)
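
One caveat: thread pages are fetched with a randomized User-Agent, but the images themselves go through urlretrieve, which identifies itself with urllib's default User-Agent and gets no disguise at all. Below is a minimal sketch of a replacement downloader, assuming it lives in the same module as user_agent_list above; the name download_img and the 10-second timeout are illustrative, not part of the original.

import random
import requests

# Illustrative helper (not part of the original script): fetch one image with
# a User-Agent drawn from the same pool used for the page requests, and write
# the raw bytes to disk. Assumes user_agent_list is defined in this module.
def download_img(img_url, path, timeout=10):
    headers = {'User-Agent': random.choice(user_agent_list)}
    resp = requests.get(img_url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # fail loudly instead of saving an HTML error page
    with open(path, 'wb') as f:
        f.write(resp.content)

Inside get_imgs, the urlretrieve line would then read download_img(img, r'{}\{}.jpg'.format(jpgdir, self.index)).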

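The image selection also hinges on an XPath 1.0 detail: in a relational comparison like @width > "400", the attribute value is converted to a number, so the predicate keeps only images wider than 400 pixels and drops emoticon-sized ones. A quick standalone check of that predicate (the two-image snippet here is made up):

from lxml import etree

snippet = ('<div>'
           '<img class="BDE_Image" width="560" src="a.jpg"/>'
           '<img class="BDE_Image" width="120" src="b.jpg"/>'
           '</div>')
tree = etree.HTML(snippet)
print(tree.xpath('//img[@class = "BDE_Image" and @width > "400"]//@src'))
# prints ['a.jpg'] -- the 120px-wide image is filtered out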