Python的爬虫

来源：互联网　发布：怎么进入淘宝网开店　编辑：程序博客网　时间：2024/04/29 06:09

下载一个网页的图片：

# -*- coding: utf-8 -*-
"""Download every ``.jpg`` picture referenced by a Baidu Tieba thread page.

The script runs unchanged on Python 2 and Python 3.
"""
import os
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

# Address of the page whose pictures are downloaded when run as a script.
PAGE_URL = "http://tieba.baidu.com/p/2460150866"

# Directory the pictures are written to.
SAVE_DIR = "D://picture"

# Pattern for picture addresses: a quoted ``src`` value ending in ``.jpg``
# that is immediately followed by a ``pic_ext`` attribute.  ``[^"]`` keeps
# the match inside one attribute value; a bare ``.+?`` could run across the
# closing quote and swallow an earlier, unrelated ``src`` attribute.
IMG_RE = re.compile(r'src="([^"]+?\.jpg)" pic_ext')


def getHtml(url):
    """Fetch *url* and return the response body exactly as received.

    The connection is closed as soon as the body has been read.
    """
    with closing(urlopen(url)) as page:
        return page.read()


def findImgUrls(html):
    """Return the picture addresses found in *html*, in document order.

    *html* may be text or raw bytes.  Bytes are decoded as UTF-8 with
    undecodable sequences replaced, which leaves ASCII addresses intact.
    """
    if isinstance(html, bytes):
        html = html.decode("utf-8", "replace")
    return IMG_RE.findall(html)


def getImg(html, saveDir=SAVE_DIR):
    """Download every picture referenced in *html* into *saveDir*.

    Pictures are saved as ``0.jpg``, ``1.jpg``, ... in the order they appear
    in the page.  *saveDir* is created on demand.

    Returns the list of file paths written; the list is empty when the page
    references no matching picture.
    """
    imgUrls = findImgUrls(html)
    if not imgUrls:
        return []

    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)

    saved = []
    for index, imgUrl in enumerate(imgUrls):
        target = "%s/%s.jpg" % (saveDir, index)
        # The actual download: urlretrieve returns (local path, headers).
        result = urlretrieve(imgUrl, target)
        print(result)
        saved.append(target)
    return saved


if __name__ == "__main__":
    print(getImg(getHtml(PAGE_URL)))

# -*- coding: utf-8 -*-
"""Crawl the wallpaper galleries linked from a desk.zol.com.cn index page.

The index page is scanned for gallery links; every gallery is then walked
page by page through its "next" link and the full-size image of each page is
saved under ``localSavePath``.

The script runs unchanged on Python 2 and Python 3.
"""
import os
import re
import time
from contextlib import closing

try:  # Python 3
    from html.parser import HTMLParser
    from urllib.error import URLError
    from urllib.request import Request, urlopen, urlretrieve
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    from urllib import urlretrieve
    from urllib2 import Request, URLError, urlopen

host = "http://desk.zol.com.cn"
# First page of the gallery currently being walked; following "next" links
# stops once a link points back at this page.
startImageUrl = ''
localSavePath = 'D:\\picture\\'
ISOTIMEFORMAT = '%Y%m%d%H%M%S'

# An address is treated as an image when it contains ".jpg".
IMAGE_NAME_RE = re.compile(r'[0-9]*\.jpg')


def toText(data):
    """Return *data* in the form ``HTMLParser.feed`` accepts.

    Python 3 ``bytes`` are decoded as UTF-8 with undecodable sequences
    replaced; the parser only consumes tag names and attribute values, and
    ASCII values survive this decoding unchanged.  Any other input
    (including a Python 2 byte string) is returned untouched.
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('utf-8', 'replace')
    return data


def uniqueImagePath(stamp, saveDir=None):
    """Return a ``.jpg`` path inside *saveDir* that does not exist yet.

    The path is ``saveDir + stamp + '.jpg'``.  When that file already exists
    (two images fetched within the same second share a timestamp) a numeric
    suffix ``_1``, ``_2``, ... is appended so that no earlier download is
    overwritten.  *saveDir* defaults to ``localSavePath`` and is expected to
    end with a path separator.
    """
    if saveDir is None:
        saveDir = localSavePath
    candidate = saveDir + stamp + '.jpg'
    serial = 0
    while os.path.exists(candidate):
        serial += 1
        candidate = '%s%s_%d.jpg' % (saveDir, stamp, serial)
    return candidate


def downloadImage(url):
    """Save the image at *url* under a timestamp-based name.

    Returns the path written, or ``None`` when *url* does not look like a
    ``.jpg`` address (in which case nothing is downloaded).
    """
    if not IMAGE_NAME_RE.search(url):
        print("NO match")
        return None

    print("Downloading image begin %s" % url)
    if not os.path.isdir(localSavePath):
        os.makedirs(localSavePath)
    filename = uniqueImagePath(time.strftime(ISOTIMEFORMAT))
    urlretrieve(url, filename)
    return filename


def getImageUrlByHtmlUrl(htmlUrl):
    """Fetch one gallery page and run it through a gallery-mode parser.

    The parser downloads the page's full-size image and follows the "next"
    link by calling this function again, so one call walks the rest of the
    gallery.  A ``URLError`` raised while fetching or processing the page is
    reported and ends the walk from this page onwards.
    """
    parser = MyHtmlParser(False)
    try:
        # Read and close the response before parsing: parsing recurses into
        # further pages and would otherwise keep every connection open.
        with closing(urlopen(Request(htmlUrl))) as response:
            content = response.read()
        parser.feed(toText(content))
    except URLError as e:
        print(getattr(e, 'reason', e))


class MyHtmlParser(HTMLParser):
    """Parser for either the index page or a single gallery page.

    With ``isIndex`` true, every gallery link found is crawled.  Otherwise
    the full-size image of the page is downloaded and the "next" link is
    followed.  Tags are recognised by the position of their attributes,
    which mirrors the markup the site served when this was written.
    """

    def __init__(self, isIndex):
        HTMLParser.__init__(self)
        self.isIndex = isIndex

    def handle_starttag(self, tag, attrs):
        # Declared before any use: assigning the name ahead of its ``global``
        # statement is a SyntaxError on Python 3.  Without the declaration
        # the assignment below would only create a local variable and the
        # end-of-gallery check would compare against a stale value.
        global startImageUrl

        if self.isIndex:
            isGalleryLink = (tag == 'a' and len(attrs) == 4
                             and attrs[0] == ('class', 'pic'))
            if isGalleryLink and attrs[1][1]:
                newUrl = host + attrs[1][1]
                print("Find a image site:  %s" % newUrl)
                startImageUrl = newUrl
                getImageUrlByHtmlUrl(newUrl)
            return

        # Gallery page: the length check keeps a bare <img> from raising
        # IndexError.
        if tag == 'img' and len(attrs) >= 2 and attrs[0] == ('id', 'bigImg'):
            imgUrl = attrs[1][1]
            if imgUrl:
                print(" one image :  %s" % imgUrl)
                downloadImage(imgUrl)

        if tag == 'a' and len(attrs) == 4 and attrs[1] == ('class', 'next'):
            if attrs[2][1]:
                nextUrl = host + attrs[2][1]
                print("Find a next image Link %s" % nextUrl)
                # A "next" link pointing at the first page means the gallery
                # has wrapped around.
                if nextUrl != startImageUrl:
                    getImageUrlByHtmlUrl(nextUrl)


if __name__ == "__main__":
    indexUrl = "http://desk.zol.com.cn/meinv/"
    with closing(urlopen(indexUrl)) as response:
        page = response.read()
    parseIndex = MyHtmlParser(True)
    parseIndex.feed(toText(page))

API: http://blog.csdn.net/tianxicool/article/details/5942523

0 0