爬取百度贴吧任意帖子

来源:互联网 发布:减肥时晚餐吃什么知乎 编辑:程序博客网 时间:2024/06/08 10:29

本文代码使用了 urllib2 和正则表达式(re 模块)。

# coding=utf-8
"""Save every post of a Baidu Tieba thread to a UTF-8 text file.

The script fetches the thread page by page, strips the HTML of each post
with regular expressions and appends the plain text to ``<title>.txt`` in
the current directory.  It runs on Python 2 (``urllib2``) as well as on
Python 3 (``urllib.request``).
"""
import io
import re
import urllib

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3: urllib2 was split into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


class Tool(object):
    """Turn the HTML of one post body into plain text."""

    # <img> tags and every run of seven consecutive spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are dropped; the link text itself is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a visual line become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # A paragraph start becomes a newline followed by an indent.
    replacePara = re.compile(r'<p.*?>')
    # Single or doubled <br> becomes one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with its HTML markup converted to plain text.

        The substitutions run in a fixed order (specific tags first, the
        catch-all tag remover last) and the result is stripped of leading
        and trailing whitespace.
        """
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n    "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        return x.strip()


class BDTB(object):
    """Download one Tieba thread and write its posts to a text file."""

    # Path separators and other characters that are unsafe in a file name.
    # The title comes from the downloaded page, so it must not be trusted
    # to be a valid (or harmless) path component.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, baseURL, seeLZ, floorTag):
        """Store the thread settings.

        baseURL  -- thread URL without query string.
        seeLZ    -- value appended as the ``see_lz`` query parameter.
        floorTag -- the string ``'1'`` to write a numbered separator line
                    before every post; any other value disables it.
        """
        self.baseURL = baseURL
        self.seeLZ = "?see_lz=" + str(seeLZ)
        self.tool = Tool()
        self.file = None
        self.floor = 1
        self.defaultTitle = u"百度贴吧"
        self.floorTag = floorTag
        self.user_agent = "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)"
        self.headers = {'User-Agent': self.user_agent}

    def getPage(self, pageNum):
        """Return the decoded HTML of page *pageNum*, or None on URLError."""
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            response = urlopen(Request(url, headers=self.headers))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if reading/decoding fails.
                response.close()
        except URLError as e:
            if hasattr(e, 'reason'):
                print(u"连接百度贴吧失败,失败原因 %s" % (e.reason,))
            return None

    def getTitle(self, page):
        """Return the stripped thread title found in *page*, or None."""
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = pattern.search(page)
        if result is None:
            return None
        return result.group(1).strip()

    def getPageNum(self, page):
        """Return the page count found in *page* as a string, or None."""
        pattern = re.compile(
            r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = pattern.search(page)
        if result is None:
            return None
        return result.group(1).strip()

    def getContent(self, page):
        """Return the posts in *page* as a list of UTF-8 encoded byte strings.

        Every post is converted to plain text and wrapped in one leading
        and one trailing newline.
        """
        pattern = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        return [
            ("\n" + self.tool.replace(item) + "\n").encode('utf-8')
            for item in pattern.findall(page)
        ]

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing and store it in ``self.file``.

        ``self.defaultTitle`` is used when *title* is None or empty.
        Characters that are unsafe in a file name are replaced by ``_`` so
        that a title taken from the web page cannot point outside the
        current directory.  A previously opened file is closed first.
        """
        name = self.defaultTitle if title is None else title
        name = self._UNSAFE_FILENAME_CHARS.sub("_", name).strip()
        if not name:
            name = self.defaultTitle
        self.close()
        # Binary mode: getContent() already delivers UTF-8 encoded bytes.
        self.file = io.open(name + ".txt", "wb+")

    def writeData(self, contents):
        """Append every post in *contents* to ``self.file``.

        When ``self.floorTag`` is ``'1'`` a separator line carrying the
        running post number is written before each post.  Text items are
        encoded as UTF-8; byte strings are written unchanged.
        """
        for item in contents:
            if self.floorTag == '1':
                floorLine = "\n" + str(self.floor) + "------------------------------------------------------------------\n"
                self.file.write(floorLine.encode('utf-8'))
            if not isinstance(item, bytes):
                item = item.encode('utf-8')
            self.file.write(item)
            self.floor += 1

    def close(self):
        """Close the output file if one is open."""
        if self.file is not None and not self.file.closed:
            self.file.close()

    def start(self):
        """Download the whole thread and write it to the output file.

        Nothing is created on disk when the first page cannot be fetched
        or does not contain a page count.  A later page that cannot be
        fetched is skipped.  The output file is always closed on exit.
        """
        indexPage = self.getPage(1)
        pageNum = None if indexPage is None else self.getPageNum(indexPage)
        if pageNum is None:
            print("URL已失效,请重试")
            return
        self.setFileTitle(self.getTitle(indexPage))
        try:
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页")
                # Page 1 is already in memory; do not download it twice.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    print("第" + str(i) + "页获取失败,已跳过")
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            # str(e) carries errno/strerror; e.message is empty for those
            # errors on Python 2 and does not exist on Python 3.
            print("写入异常,原因" + str(e))
        finally:
            self.close()
            print("写入任务完成")


def main():
    """Ask for the thread id and options on stdin, then run the download."""
    print(u"请输入帖子代号")
    baseURL = 'https://tieba.baidu.com/p/' + str(_read_line(u'http://tieba.baidu.com/p/')).strip()
    seeLZ = _read_line("是否只获取楼主发言,是输入1,否输入0\n")
    floorTag = _read_line("是否写入楼层信息,是输入1,否输入0\n")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


if __name__ == "__main__":
    main()

参考:崔庆才《Python爬虫实战二之爬取百度贴吧帖子》

原创粉丝点击