百度贴吧爬虫 2017-11-9 版 Python 3.x
来源:互联网 发布:linux rpm安装命令 编辑:程序博客网 时间:2024/05/18 08:01
学于:http://cuiqingcai.com/993.html
# -*- coding: utf-8 -*-
"""Baidu Tieba thread crawler.

Downloads every page of a Tieba thread, strips the HTML markup from each
post and writes the plain text to ``<thread title>.txt`` in the current
directory.
"""
import re
import urllib.error
import urllib.parse
import urllib.request


class Tool:
    """Convert the HTML of a single post into plain text."""

    # Remove <img> tags and runs of seven spaces.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Remove hyperlink tags (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are turned into a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become tab characters.
    replaceTD = re.compile(r'<td>')
    # A paragraph opening becomes a line break followed by an indent.
    replacePara = re.compile(r'<p.*?>')
    # Single or double <br> becomes one line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left at the end is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags removed or replaced by whitespace.

        The substitutions run in a fixed order because the last one strips
        every remaining tag; the specific replacements must happen first.
        Leading and trailing whitespace is removed from the result.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


class BDTB:
    """Crawler for one Baidu Tieba thread.

    Args:
        baseUrl: Full thread URL, e.g. ``http://tieba.baidu.com/p/<id>``.
        seeLz: Value appended to the ``see_lz`` query parameter.
        floorTag: ``'1'`` (or ``1``) to write a numbered separator line
            before every post; anything else disables the separators.
    """

    # Characters replaced in the output filename because common file
    # systems reject them.
    _unsafeFileChars = re.compile(r'[\\/:*?"<>|\r\n]')

    def __init__(self, baseUrl, seeLz, floorTag):
        self.baseURL = baseUrl
        self.seeLz = '?see_lz=' + str(seeLz)
        self.tool = Tool()
        self.file = None
        self.floor = 1
        self.defaultTitle = "百度贴吧"
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` of the thread.

        Returns:
            The page HTML decoded as UTF-8, or ``None`` when the request
            fails with a ``URLError``.
        """
        url = self.baseURL + self.seeLz + '&pn=' + str(pageNum)
        request = urllib.request.Request(url)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            print('link BDTB error', getattr(e, 'reason', e))
            return None

    def getTitle(self, page):
        """Return the thread title found in ``page``, or ``None``.

        ``page`` may be ``None`` (a failed download); the result is then
        ``None`` as well.
        """
        if page is None:
            return None
        pattern = re.compile(
            '<h3 class="core_title_txt.*?title="(.*?)" style', re.S)
        result = pattern.search(page)
        if result is None:
            return None
        print(result.group(1))
        return result.group(1).strip()

    def getNum(self, page):
        """Return the page count shown in ``page`` as a string, or ``None``.

        ``page`` may be ``None`` (a failed download); the result is then
        ``None`` as well.
        """
        if page is None:
            return None
        pattern = re.compile(
            '<li class="l_reply_num".*?margin-right:3px">.*?</span>'
            '.*?<span class="red">(.*?)</span>', re.S)
        result = pattern.search(page)
        if result is None:
            return None
        print(result.group(1))
        return result.group(1).strip()

    def getContent(self, page):
        """Extract the text of every post in ``page``.

        Returns:
            A list with one string per post, each wrapped in a leading and
            a trailing newline, or ``None`` when ``page`` is ``None`` or
            contains no post.
        """
        if page is None:
            return None
        pattern = re.compile(
            '<cc>.*?<div id=.*?j_d_post_content ">(.*?)</div>', re.S)
        contents = []
        for line in pattern.findall(page):
            # Clean each post once and reuse the result for echo and output.
            text = self.tool.replace(line.strip())
            print(text)
            contents.append("\n" + text + "\n")
        return contents if contents else None

    def setFileTitle(self, title):
        """Open the output file ``<title>.txt`` and store it in ``self.file``.

        Falls back to the default title when ``title`` is ``None`` or empty.
        Characters that are invalid in file names are replaced by ``_``.
        The file is written as UTF-8 so that Chinese text does not depend
        on the platform's default encoding.
        """
        name = self._unsafeFileChars.sub("_", title) if title else ""
        if not name:
            name = self.defaultTitle
        self.file = open(name + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write the posts in ``contents`` to the open output file.

        ``contents`` may be ``None`` or empty (nothing is written). The
        floor counter advances once per post whether or not separator
        lines are enabled.
        """
        if not contents:
            return
        withFloor = str(self.floorTag) == '1'
        for item in contents:
            if withFloor:
                floorLine = "\n" + str(self.floor) + "-----------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download the whole thread and write it to a text file."""
        indexPage = self.getPage(1)
        pageNum = self.getNum(indexPage)
        title = self.getTitle(indexPage)
        try:
            totalPages = int(pageNum)
        except (TypeError, ValueError):
            # No usable page count: the thread was not found or the
            # download failed. Bail out before creating an output file.
            print("找不到")
            return
        self.setFileTitle(title)
        try:
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, totalPages + 1):
                print("正在写入第" + str(i) + "页数据")
                # Page 1 has already been downloaded to read the metadata.
                page = indexPage if i == 1 else self.getPage(i)
                self.writeData(self.getContent(page))
        except IOError as e:
            print("异常", e)
        finally:
            self.file.close()
            self.file = None
            print("下载结束")


if __name__ == "__main__":
    print("请输入帖子代号:")
    baseURL = 'http://tieba.baidu.com/p/' + str(input('http://tieba.baidu.com/p/'))
    seeLz = input("是否只获取楼主发言,是输入1,否输入0\n")
    floorTag = input("是否写入楼层信息,是输入1,否输入0\n")
    dbtb = BDTB(baseURL, seeLz, floorTag)
    dbtb.start()
阅读全文
0 0
- 百度贴吧爬虫 2017 -11 -9版 python3.x
- python3.x百度贴吧图片爬虫(附知乎图片爬虫)
- python3百度贴吧爬虫案例
- Python3 简单爬虫爬取百度贴吧帖子
- Python3.x爬虫
- python3.4 百度贴吧小爬虫
- 第一个Python3.0 爬虫程序, 爬取百度贴吧图片
- python3 多页爬虫爬取百度贴吧帖子所有图片
- [python3]爬虫实战二之爬取百度贴吧帖子
- python3.X爬虫-图片获取
- Python爬虫(1),Python3.x
- Python爬虫(2),Python3.x
- Python爬虫(3),Python3.x
- python3.x之爬虫学习
- 百度贴吧爬虫
- 百度贴吧爬虫
- Python3爬虫百度图片搜索的图片
- Python3 百度图片 美女 下载 爬虫 多线程
- NOIP注意事项
- GMM混合高斯模型的EM算法及Python实现
- Python 漫游记
- vue学习-指令
- 知识点
- 百度贴吧爬虫 2017 -11 -9版 python3.x
- mybatis使用foreach批次插入,解决sequence只查询一次的问题(在此,我只看union all 部分)
- 大唐天下商城系统开发
- HDU 5718 Oracle(大数)
- 12cR2 下手工建库 CDB
- 数论[模板]
- 二叉搜索树的操作集(30 分)
- 线程池
- 数据结构单链表应用