贴吧壁纸爬虫1.2

来源:互联网 发布:在线拍卖系统源码 编辑:程序博客网 时间:2024/04/30 05:31

如果您不幸看到这段程序,我想说的是,此程序有15个线程在工作,请谨慎使用,避免拖垮网站服务器,谢谢!

# -*- coding:utf-8 -*-
#------------------------------------------
#   Program:   Tieba wallpaper crawler
#   Version:   1.2
#   Author:    执剑天涯
#   Date:      2016-08-22
#   Python:    written for 2.7.7; the urlopen import below falls back to
#              urllib.request so the module also loads on Python 3
#   Operation: none
#   Function:  crawl 75 pages of wallpapers from one Tieba thread
#   Design:    object-oriented
#   Threads:   15 worker threads
#------------------------------------------
import os
import re
import threading
import time
from contextlib import closing

try:
    from urllib2 import urlopen           # Python 2
except ImportError:
    from urllib.request import urlopen    # Python 3

print(time.ctime(time.time()))
print(u'正在保存图片...')

# Forum thread to crawl; the 1-based page number is appended to this prefix.
THREAD_URL = 'http://tieba.baidu.com/p/2460150866?pn='
# Directory that receives the downloaded images.
SAVE_DIR = 'D:\\tiebaPics'
THREAD_COUNT = 15
PAGES_PER_THREAD = 5
# Compiled once instead of once per page; captures the src of each post image.
IMG_PATTERN = re.compile('<img pic_type="0" class="BDE_Image" src="(.*?)"', re.S)

# Next free image number, shared by all workers and guarded by `lock`.
num = 0
# Module-level so that myThread also works when this file is imported
# rather than run as a script.
lock = threading.Lock()


def _ensureDir(path):
    """Create directory *path* if needed, tolerating concurrent creation.

    Raises:
        OSError: if *path* cannot be created and is not already a directory.
    """
    try:
        os.makedirs(path)
    except OSError:
        # Another worker may have created it between our check and the call.
        if not os.path.isdir(path):
            raise


def _saveImage(data):
    """Write *data* to the next free ``<n>.jpg`` in SAVE_DIR; return its path."""
    global num
    # Hold the lock only while reserving a number, so disk writes from
    # different workers can overlap and a failed write cannot leave the
    # lock held.
    with lock:
        index = num
        num += 1
    fileName = os.path.join(SAVE_DIR, '%s.jpg' % index)
    with open(fileName, 'wb') as imgFile:
        imgFile.write(data)
    return fileName


class myThread(threading.Thread):
    """Worker thread that downloads every post image on a range of pages.

    Args:
        threadNum: Number of this worker; stored but not otherwise used.
        pageStart: First page index handled (inclusive).
        pageEnd: Page index at which to stop (exclusive).

    Page index ``i`` is requested as forum page ``pn = i + 1``.
    """

    def __init__(self, threadNum, pageStart, pageEnd):
        threading.Thread.__init__(self)
        self.threadNum = threadNum
        self.pageStart = pageStart
        self.pageEnd = pageEnd

    def pageUrls(self):
        """Return the URLs of the pages this worker is responsible for."""
        return [THREAD_URL + str(pageIndex + 1)
                for pageIndex in range(self.pageStart, self.pageEnd)]

    def run(self):
        """Fetch each assigned page and save every image found on it.

        A page or image whose download fails with an I/O error is reported
        and skipped so that one bad URL does not end the whole worker.
        """
        _ensureDir(SAVE_DIR)
        for url in self.pageUrls():
            try:
                with closing(urlopen(url)) as response:
                    page = response.read().decode('utf-8')
            except IOError as err:
                print('skip page %s: %s' % (url, err))
                continue
            for imgUrl in IMG_PATTERN.findall(page):
                try:
                    with closing(urlopen(imgUrl)) as response:
                        data = response.read()
                except IOError as err:
                    print('skip image %s: %s' % (imgUrl, err))
                    continue
                _saveImage(data)


def makeThreads(threadCount=THREAD_COUNT, pagesPerThread=PAGES_PER_THREAD):
    """Build the workers, giving each a disjoint, consecutive page range.

    Worker ``i`` receives page indexes ``[i * pagesPerThread,
    (i + 1) * pagesPerThread)``, so together the workers cover forum pages
    ``1 .. threadCount * pagesPerThread`` exactly once.
    """
    return [myThread(i, i * pagesPerThread, (i + 1) * pagesPerThread)
            for i in range(threadCount)]


def main():
    """Run all workers to completion, then print the finish time."""
    threads = makeThreads()
    for t in threads:
        t.start()
    # Wait for every worker, not just the last one started.
    for t in threads:
        t.join()
    print('That is ok!')
    print(time.ctime(time.time()))


if __name__ == '__main__':
    main()


0 0
原创粉丝点击