Learning Python crawlers: scraping and saving images from Baidu Tieba

   I had been planning to learn programming for a long time and finally decided to start with Python. Recently I have been studying it and picked up some crawler knowledge from Cui Qingcai's blog (http://cuiqingcai.com/). Working from his two tutorials on scraping Tieba posts and Taobao MM pages, and being fairly green at programming, it took me two evenings of modifying and polishing the code before it ran through cleanly. Through this little program I learned some of the basics of Python crawling. There are countless similar write-ups online, but I decided to post my first program anyway; please go easy on it. The program keeps all of the debug print statements I added, which makes it easier to follow what it does as it runs.
# -*- coding:utf-8 -*-
'''
Time: 2015-7-26 20:54:01
Task: scrape and save the images of a given Tieba post, naming each file
      page number + the image's index within that page
References: http://cuiqingcai.com/993.html  http://cuiqingcai.com/1001.html
python version: 3.4.3
@author YingJue
'''
import urllib.request
import urllib.error
import re
import os
# Python 3: the built-in input() replaces raw_input, so the fragile
# "from pip._vendor.distlib.compat import raw_input" import is not needed.

class Tool:
    # strip <img> tags, runs of 1-7 spaces, and non-breaking spaces
    removeImg = re.compile('<img.*?>| {1,7}|\xa0')
    # remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD = re.compile('<td>')
    # turn single or double <br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # drop every remaining tag
    removeExtraTag = re.compile('<.*?>')
    # collapse runs of blank lines
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes leading and trailing whitespace
        return x.strip()

class tieba:
    def __init__(self, baseUrl2, seeLZ1):
        # init: base URL and the "only show the original poster" flag
        self.baseUrl = baseUrl2
        self.seeLZ = '?see_lz=' + str(seeLZ1)
        print('kanlouzhu' + self.seeLZ)
        self.file = None
        self.floor = 1
        # Tool is carried over from the reference tutorials; replace() is
        # not actually called anywhere in this script.
        self.tool = Tool()

    def getPage(self, pageNum):
        try:
            Url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            print('url', Url)
            request = urllib.request.Request(Url)
            response = urllib.request.urlopen(request)
            print('getpage')
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('Connection failed, reason:', e.reason)
                return None

    # get the post title
    def getTitle(self, page):
        pattern = re.compile('<h1 class="core_title_txt.*?</span>(.*?)</h1>', re.S)
        result = re.search(pattern, page)
        if result:
            print('gettitle')
            return result.group(1).strip()
        else:
            return None

    # get the number of pages of the post
    def getPageNum(self, page):
        pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        print('duyeshu')
        result = re.search(pattern, page)
        if result:
            print('getpagenum')
            return result.group(1).strip()
        else:
            return None

    # get all images
    def getImg(self, page):
        # If the two commented lines below are kept, only the images in the
        # first post of each page are collected; commented out, the whole
        # page is searched.
        #pattern = re.compile('<div id="post_content_.*?>(.*?)></div>', re.S)
        #content = re.search(pattern, page)
        # regular expression; refine it as needed
        patternImg = re.compile('<img class="BDE_Image.*? src="(.*?)" pic_ext.*?>', re.S)
        #patternImg = re.compile('<img class="BDE_Image" src="(.*?)" width="560.*?>', re.S)
        imgs = re.findall(patternImg, page)
        print('getimgs')
        return imgs

    # save several images
    def saveImgs(self, imgs, floor, lujing):
        number = 1
        print('Found', len(imgs), 'images on page ' + str(floor))
        for imageURL in imgs:
            splitpath = imageURL.split('.')
            fTail = splitpath.pop()
            if len(fTail) > 3:
                fTail = 'jpg'
            filename = floor + str(number) + '.' + fTail
            number += 1
            print('saveimgs')
            self.saveImg(imageURL, filename, lujing)

    def saveImg(self, imageURL, filename, lujing):
        u = urllib.request.urlopen(imageURL)
        data = u.read()
        # Windows-style path join; everything is written under d:\pa\<title>
        f = open(str(lujing) + '\\' + filename, 'wb')
        f.write(data)
        print('Quietly saving an image as', filename)
        f.close()

    def mkdir(self, path):
        path = path.strip()
        isExists = os.path.exists(path)
        print('mkdir')
        if not isExists:
            print('Created a folder named', path)
            os.makedirs(path)
            print('path 1111111111')
            return path
        else:
            print('A folder named', path, 'already exists')
            print('path 2', path)
            return path

    def start(self):
        indexPage = self.getPage(1)
        #print('1')
        pageNum = self.getPageNum(indexPage)
        #print('2')
        title = self.getTitle(indexPage)
        print('3')
        print('4')
        path = self.mkdir('d:\\pa\\' + title)
        print('path', path)
        print('7')
        if pageNum is None:
            print("The URL is no longer valid, please try again")
            return
        try:
            print("This post has " + str(pageNum) + " pages")
            for i in range(1, int(pageNum) + 1):
                print("Writing the data of page " + str(i))
                page = self.getPage(i)
                images = self.getImg(page)
                self.saveImgs(images, str(i), path)
        # a write error occurred (Python 3: IOError has no .message, print e itself)
        except IOError as e:
            print("Write error, reason:", e)
        finally:
            print("Write task finished")

#print("Please enter the post ID")
baseURL = 'http://tieba.baidu.com/p/' + str(input('http://tieba.baidu.com/p/'))
seeLZ = input("Fetch only the original poster's posts? Enter 1 for yes, 0 for no\n")
print(seeLZ)
bdtb = tieba(baseURL, seeLZ)
print('abccc', baseURL, seeLZ)
bdtb.start()
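To make the scraping step easier to follow, here is a minimal sketch that runs the same BDE_Image regular expression used in tieba.getImg against a small hand-written HTML fragment. The fragment and its image URLs are invented for illustration; real Tieba pages are far larger and their markup may have changed since this was written.

import re

# Hypothetical markup imitating what the script expects on a post page.
sample_page = '''
<div id="post_content_001">
    <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/pic/item/fake1.jpg" pic_ext="jpeg" width="560">
    <img class="BDE_Image" src="http://imgsrc.baidu.com/forum/pic/item/fake2.png" pic_ext="png" width="560">
</div>
'''

# Same pattern as tieba.getImg: capture the src attribute of every
# <img class="BDE_Image" ...> tag that carries a pic_ext attribute.
patternImg = re.compile('<img class="BDE_Image.*? src="(.*?)" pic_ext.*?>', re.S)
print(re.findall(patternImg, sample_page))
# ['http://imgsrc.baidu.com/forum/pic/item/fake1.jpg',
#  'http://imgsrc.baidu.com/forum/pic/item/fake2.png']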
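As for running it: the script prompts for the part of the post URL after http://tieba.baidu.com/p/ and for the see_lz flag, then downloads into d:\pa\<post title>. A hypothetical session (the post ID below is made up for illustration) looks like this:

http://tieba.baidu.com/p/1234567890
Fetch only the original poster's posts? Enter 1 for yes, 0 for no
1

With those inputs, page 1 is fetched from http://tieba.baidu.com/p/1234567890?see_lz=1&pn=1, and the images of each page are saved as 11.jpg, 12.jpg, ..., i.e. the page number followed by the image's index within that page, as described in the docstring.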

