Python爬取网页图片02

来源:互联网 发布:数据库系统实现英文版 编辑:程序博客网 时间:2024/06/03 23:50
# coding=utf-8
"""Download the .jpg/.png images referenced by <img> tags of a web page.

Run as a script to fetch one page and save every matching image into
``baseFiledirs``. The functions can also be imported and used on their own;
importing this module performs no network access.
"""
import re
import time
import os
import _osx_support  # NOTE(review): not used below; kept from the original import list
import urllib.request

# Directory the images are written to. A raw string keeps the backslashes
# literal instead of relying on unrecognised escape sequences such as "\i".
baseFiledirs = r'D:\img\images/'

# Matches the src attribute of an <img> tag when the value ends in .jpg/.png.
# - [^>]*? keeps the search inside a single tag.
# - \s before "src" rejects look-alike attributes such as data-src.
# - [^"\'>]+? keeps the captured value inside its own quotes.
_IMG_SRC_RE = re.compile(
    r'<img\b[^>]*?\ssrc\s*=\s*["\']([^"\'>]+?\.(?:jpg|png))["\']'
)


def getHtml(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset announced in the Content-Type
    header, defaulting to UTF-8. If that charset is unknown or the bytes do
    not decode cleanly, UTF-8 with replacement characters is used so a single
    bad byte does not abort the whole run.

    Raises whatever ``urllib.request.urlopen`` raises for an unreachable or
    invalid URL.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        charset = page.headers.get_content_charset() or 'UTF-8'
    try:
        return raw.decode(charset)
    except (LookupError, UnicodeDecodeError):
        return raw.decode('UTF-8', errors='replace')


def findImgUrls(html):
    """Return the .jpg/.png ``src`` values of all <img> tags in *html*.

    URLs are returned exactly as written in the page (they may be
    protocol-relative, e.g. ``//host/a.jpg``), in document order.
    """
    return _IMG_SRC_RE.findall(html)


def saveImg(imgurl, dest_dir):
    """Download *imgurl* into *dest_dir* and return the local file path.

    The file is named after the last path segment of the URL. *dest_dir* is
    created if it does not exist yet.
    """
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, imgurl.split('/')[-1])
    urllib.request.urlretrieve(imgurl, target)
    return target


def getImg(html, dest_dir=None):
    """Download every .jpg/.png image referenced in *html*.

    Args:
        html: Page source to scan for <img> tags.
        dest_dir: Directory to save into; defaults to ``baseFiledirs``
            (looked up at call time).

    Returns:
        The string ``"success"`` once all URLs have been attempted. A failed
        download is reported on stdout and does not stop the remaining ones.
    """
    if dest_dir is None:
        dest_dir = baseFiledirs

    imglist = findImgUrls(html)
    print(imglist)

    for imgurl in imglist:
        print(imgurl)
        # Protocol-relative URLs need an explicit scheme before urllib can
        # fetch them.
        if imgurl.startswith("//"):
            imgurl = "http:" + imgurl
        try:
            # Save every file into the one target directory.
            saveImg(imgurl, dest_dir)
        except Exception as exc:
            # Deliberately best-effort: report and move on to the next image.
            # Exception (not BaseException) so Ctrl-C still stops the script.
            print("errorurl:", imgurl, exc)
    return "success"


if __name__ == "__main__":
    html = getHtml("http://news.baidu.com/")
    print(getImg(html))
原创粉丝点击