Python爬虫_获取贴吧内容

来源:互联网 发布:中国矿工软件 编辑:程序博客网 时间:2024/06/05 10:16

参考http://cuiqingcai.com/993.html,写的第一个比较完整的python爬虫程序。

对正则表达式要有更加深刻的研究,本程序对百度贴吧使用,但是截取不了表情。如果想把所有页面的内容都获取下来不难,拓展一下即可。

# -*- coding:utf-8 -*-
"""Baidu Tieba thread scraper.

Fetches a Tieba thread page, strips the post HTML down to plain text and
writes each post ("floor") into a .txt file named after the thread title.
Ported from Python 2 (urllib2) to Python 3 (urllib.request).
"""
import re
import urllib.error
import urllib.request


class Tool:
    """Compiled regexes that turn Tieba post HTML into plain text."""

    # Strip <img ...> tags and 7-space runs (Tieba uses them as padding).
    removeImg = re.compile('<img.*?>| {7}|')
    # Drop hyperlink tags but keep the link text.
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that imply a line break.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become tabs.
    replaceTD = re.compile('<td>')
    # Paragraph openings become a newline plus a two-space-style indent.
    replacePara = re.compile('<p.*?>')
    # Explicit <br> breaks become newlines.
    replaceBR = re.compile('<br><br>|<br>')
    # Any remaining tag is removed outright.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with Tieba markup converted to plain text."""
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n    ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # Trim the leading/trailing whitespace the substitutions left behind.
        return x.strip()


class BDTB:
    """Crawler for a single Baidu Tieba thread."""

    def __init__(self, baseUrl, seeLZ):
        """baseUrl: thread URL; seeLZ: 1 = original poster only, 0 = everyone."""
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        # Fallback output file name when the title cannot be extracted.
        self.defaultTitle = u"百度贴吧"

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread; return its HTML or None on error."""
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            response = urllib.request.urlopen(url)
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因", e.reason)
            return None

    def getTitle(self):
        """Return the thread title string, or None if it cannot be extracted."""
        page = self.getPage(1)
        if page is None:
            return None
        pattern = re.compile(
            '<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',
            re.S)
        result = re.findall(pattern, page)
        # BUGFIX: the original reopened a fresh file per match inside a loop
        # (leaking handles); the thread has one title, so take the first match.
        return result[0] if result else None

    def getPageNum(self):
        """Return the total number of pages as a string, or None on failure."""
        page = self.getPage(1)
        if page is None:
            return None
        pattern = re.compile(
            '<li class="l_reply_num.*?</span>.*?<span class="red">(.*?)</span>',
            re.S)
        result = re.findall(pattern, page)
        return result[0] if result else None

    def getContent(self, pageNum=1):
        """Append every post ("floor") of page *pageNum* to '<title>.txt'.

        pageNum defaults to 1, matching the original behaviour; pass other
        values to collect the remaining pages of the thread.
        """
        page = self.getPage(pageNum)
        if page is None:
            return
        pattern = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        # BUGFIX: fall back to defaultTitle instead of crashing on None + ".txt".
        title = self.getTitle() or self.defaultTitle
        # 'a' = append mode, so successive pages accumulate in the same file;
        # 'with' guarantees the handle is closed even if a write fails.
        with open(title + ".txt", "a", encoding='utf-8') as f:
            for floor, item in enumerate(items, start=1):
                f.write(str(floor) + '楼---------------------------------------\n')
                f.write(self.tool.replace(item) + '\n\n')
                print('正在写入第' + str(floor) + '楼...')


if __name__ == "__main__":
    # Guarded so importing this module does not trigger network I/O.
    baseURL = 'http://tieba.baidu.com/p/4075653034'
    # Second argument: 0 = show all posts, 1 = original poster only.
    bdtb = BDTB(baseURL, 0)
    print('该贴吧总共有' + str(bdtb.getPageNum()) + '页')
    bdtb.getContent()



0 0
原创粉丝点击