爬取百度贴吧所有精品贴照片
来源:互联网 发布:同安教育网络平台oa 编辑:程序博客网 时间:2024/04/29 01:59
# coding=utf-8
"""Crawl and save every large image from the "good" (featured/精品) threads of a
Baidu Tieba forum.

Flow: forum good-thread index pages -> thread URLs -> per-thread page URLs ->
image URLs -> files on disk under <save_dir>/<thread title>/<n>.jpg.
"""
import os
import random
import urllib.request as urllib
from urllib.parse import urlparse

import requests
from lxml import etree

# Pool of desktop browser User-Agent strings used to vary request headers.
# NOTE: in the original each UA was accidentally split into two list entries
# (missing implicit string concatenation), so random.choice mostly returned a
# half fragment such as '(KHTML, like Gecko) ...'. Each pair is now joined
# into one complete UA string.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


class Crawler(object):
    """Crawler for the featured ("good") threads of one Tieba forum.

    :param start_url: first page of the forum's good-thread listing, e.g.
        https://tieba.baidu.com/f/good?kw=XX&ie=utf-8&cid=0&pn=0
    :param save_dir: root directory for downloaded images; defaults to the
        original hard-coded ``D:\\pic`` for backward compatibility.
    """

    def __init__(self, start_url, save_dir=r'D:\pic'):
        self.index = 1          # running image number inside the current thread
        self.tag = 0            # index of the current thread in self.tagname
        self.tagname = []       # thread titles, used as sub-directory names
        self.start_url = start_url
        self.save_dir = save_dir
        # e.g. 'https://tieba.baidu.com' — used to absolutize relative hrefs.
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    @staticmethod
    def request(url, **kwargs):
        """GET *url* and return the response body, or '' on any network error.

        A default timeout is injected so a stalled connection cannot hang the
        crawl forever (the original call had none). Callers may override it.
        """
        kwargs.setdefault('timeout', 10)
        try:
            page = requests.get(url, **kwargs)
            return page.text
        except requests.RequestException:   # was a bare except: — too broad
            return ''

    def get_max_page(self, html, regular):
        """Return the first XPath match of *regular* in *html*, or '1'.

        Used both for the listing's "last page" link and a thread's max-page
        attribute; '1' is the safe fallback meaning "single page".
        """
        if not html:
            # etree.HTML('') raises; a failed fetch should degrade to 1 page.
            return '1'
        tree = etree.HTML(html)
        pages = tree.xpath(regular)
        return pages[0] if pages else '1'

    def get_good_pages(self, html):
        """Yield every page of the forum's good-thread listing, e.g.
        https://tieba.baidu.com/f?kw=图片&ie=utf-8&tab=good&cid=&pn=50
        """
        max_page = self.get_max_page(html, '//a[@class="last"]//@href')
        if max_page != '1':
            # The 'last' link ends in ...&pn=<offset>; keep only the offset.
            max_page = max_page.split('=')[-1]
        max_page = int(max_page) + 1
        for offset in range(0, max_page, 50):   # listing pages step by 50 threads
            yield (self.start_url.split('&')[0] + '&ie=utf-8&cid=0&pn={}'.format(offset))

    def get_good_urls(self, html):
        """Yield the absolute URL of every good thread, e.g.
        https://tieba.baidu.com/p/3868212854

        Side effects: records each thread title in self.tagname, advances
        self.tag after a thread's URL has been consumed and resets the
        per-thread image counter self.index.
        """
        for page in self.get_good_pages(html):
            listing = self.request(page)
            if not listing:
                continue
            tree = etree.HTML(listing)
            self.tagname.extend(tree.xpath('//a[@class = "j_th_tit"]//@title'))
            for href in tree.xpath('//a[@class = "j_th_tit"]//@href'):
                yield self.domain + href.split('?')[0]   # strip query string
                self.tag += 1
                self.index = 1

    def get_post_urls(self, url, max_page):
        """Yield every page of one thread, e.g.
        https://tieba.baidu.com/p/3868212854?pn=2
        """
        for page_no in range(1, max_page + 1):
            yield (url + '?pn={}'.format(page_no))

    def get_single_urls(self, html):
        """For each good thread, yield a generator over its page URLs."""
        for url in self.get_good_urls(html):
            thread_html = self.request(url)
            max_page = self.get_max_page(
                thread_html, '//li[@class = "l_reply_num"]//@max-page')
            # max-page is already the last page number; the original added 1
            # here AND get_post_urls iterates inclusively, so it requested one
            # page past the end of every thread.
            yield self.get_post_urls(url, int(max_page))

    def get_imgs(self, html):
        """Download every large in-post image from one page of a thread.

        Images are the BDE_Image <img> tags wider than 400px; files go to
        <save_dir>/<thread title>/<index>.jpg.
        """
        if not html:
            return
        jpgdir = os.path.join(self.save_dir, self.tagname[self.tag])
        if not os.path.exists(jpgdir):
            os.makedirs(jpgdir)
        tree = etree.HTML(html)
        img_urls = tree.xpath('//img[@class = "BDE_Image" and @width > "400"]//@src')
        for img in img_urls:
            print("正在下载第{}张图片".format(self.index))
            # urllib is bound to urllib.request above, so urlretrieve is valid.
            urllib.urlretrieve(img, os.path.join(jpgdir, '{}.jpg'.format(self.index)))
            self.index += 1

    def run(self, html):
        """Crawl every page of every good thread and download its images."""
        for thread_pages in self.get_single_urls(html):
            for url in thread_pages:
                # Rotate the User-Agent per request to look less like a bot.
                headers = {'User-Agent': random.choice(user_agent_list)}
                page_html = self.request(url, headers=headers)
                self.get_imgs(page_html)


if __name__ == '__main__':
    post_bar = input("请输入贴吧名称:")
    start_url = 'https://tieba.baidu.com/f/good?kw={}&ie=utf-8&cid=0&pn=0'.format(post_bar)
    crawler = Crawler(start_url)
    html = crawler.request(start_url)
    if '本吧暂不开放' in html:
        print("抱歉,根据相关法律法规和政策,本吧暂不开放。")
    elif 'page404' in html:
        print('很抱歉,您要访问的页面不存在。')
    else:
        print("开始爬取{}吧所有精品帖子图片".format(post_bar))
        crawler.run(html)
阅读全文
0 0
- 爬取百度贴吧所有精品贴照片
- 爬取百度贴吧照片
- python爬虫爬取NBA贴吧的所有精品贴
- 爬取百度贴吧某帖子的所有照片
- python3 多页爬虫爬取百度贴吧帖子所有图片
- 爬取百度贴吧帖子
- python爬取百度贴吧图片
- python爬取百度贴吧
- 爬取整个百度贴吧
- python爬取百度贴吧小说
- python爬取百度贴吧小说
- Python爬取百度贴吧帖子
- python:爬取百度贴吧图片
- Python爬取百度贴吧数据
- 爬取百度贴吧图片
- python 爬取百度贴吧 帖子
- 利用Python3爬取百度贴吧
- BeautifulSoup简单爬取百度贴吧
- html5网页布局
- javascript关于数组的几个方法
- IjkPlayer的使用
- Spring 拦截器
- Day1T1
- 爬取百度贴吧所有精品贴照片
- 【备忘】Java菜鸟到大牛学习路线培训教程
- Centos7安装Mysql
- 汉明距离
- 什么是P问题、NP问题和NPC问题
- Android IjkPlayer播放视屏
- 今日主力净流入十大个股20171116
- python3之opencv安装
- 模拟实现strlen函数