Python 爬虫：获取网页图片

来源：互联网 | 发布：jqplot 动态数据 | 编辑：程序博客网 | 时间：2024/05/17 23:10

下面的脚本用于获取某一个网页页面上的图片（以 36 氪为例）：

# coding=utf-8
# Download every image linked from a single web page.
# Written so that it runs unchanged on both Python 2 and Python 3.
import contextlib
import os
import re

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

weburl = "http://36kr.com/"  # page to crawl
tardir = "F:\\0000\\kk"      # directory the images are saved to

# Absolute http:// or https:// URL that ends in a known image extension.
# - "https?" accepts plain http as well (the old "http." required one extra
#   character and therefore only ever matched https).
# - The dot before the extension is escaped so ".../xjpg" is not an image.
# - Quotes and angle brackets end the URL so a match cannot run from one
#   HTML attribute into the next.
IMG_URL_RE = re.compile(
    r'''https?://[^\s"'<>]*?\.(?:jpg|jpeg|png|gif|bmp)''',
    re.IGNORECASE,
)


def _consoleText(text):
    """Return *text* in a form that can be concatenated with URLs and printed.

    On Python 2 the literals in this file are UTF-8 byte strings; they are
    transcoded to GBK exactly as the original script did (presumably for a
    GBK Windows console). On Python 3 the text is returned unchanged.
    """
    if isinstance(text, bytes):
        return text.decode('UTF-8').encode('GBK')
    return text


def getHtml(url):
    """Fetch *url* and return the response body exactly as read."""
    # closing() guarantees the connection is released even if read() raises;
    # the Python 2 response object is not a context manager by itself.
    with contextlib.closing(urlopen(url)) as page:
        return page.read()


def destDir(path):
    """Create *path* if it does not exist and return it with a trailing separator."""
    if not os.path.isdir(path):
        os.makedirs(path)
    # join(path, '') appends the separator only when it is missing, which
    # replaces the hand-written, backslash-only check.
    return os.path.join(path, '')


def getSuffix(fileurl):
    """Return the text after the last '.' in *fileurl* (the file extension)."""
    return fileurl.split('.')[-1]


def findImgUrls(html):
    """Return the distinct image URLs found in *html*, in order of appearance.

    *html* may be text or, on Python 3, the raw bytes returned by getHtml().
    """
    if isinstance(html, bytes) and not isinstance(html, str):
        # Python 3 only: decode leniently, the URLs themselves are ASCII.
        html = html.decode('utf-8', 'replace')
    seen = set()
    urls = []
    for url in IMG_URL_RE.findall(html):
        if url not in seen:  # download each image only once
            seen.add(url)
            urls.append(url)
    return urls


def getImg(html):
    """Download every image referenced in *html* into ``tardir``.

    Files are named ``<index>.<extension>`` where index is the 1-based
    position of the URL in the page. Returns the number of images saved.
    """
    destPath = destDir(tardir)
    saved = 0
    for index, imgurl in enumerate(findImgUrls(html), 1):
        target = destPath + '%s.' % index + getSuffix(imgurl)
        try:
            urlretrieve(imgurl, target)
        except (IOError, OSError):
            # One unreachable image must not abort the remaining downloads.
            print(_consoleText("失败 ") + imgurl)
            continue
        print(_consoleText("完成 ") + imgurl)
        saved += 1
    return saved


def main():
    """Crawl ``weburl`` and save its images."""
    html = getHtml(weburl)
    getImg(html)
    if os.name == 'nt':
        # "pause" is a cmd.exe built-in; it keeps the console window open.
        os.system("pause")


if __name__ == "__main__":
    main()

现在进行升级，下载某个网站各个页面上的图片（以漏洞盒子为例）：

打开网址 https://www.vulbox.com/board ，点击几页数据，可以发现规律：网址最后一个数字为页码变量：

https://www.vulbox.com/board/internet/page/页数，按照这个规律，可以循环读取每个页面！

# coding=utf-8
# Download the images from the first pages of a paginated listing.
# Written so that it runs unchanged on both Python 2 and Python 3.
import contextlib
import os
import re

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

weburl = "https://www.vulbox.com/board"  # listing root; pages live under /internet/page/<n>
tardir = "F:\\0000\\kk"                  # directory the images are saved to

# Absolute http:// or https:// URL that ends in a known image extension.
# - "https?" accepts plain http as well (the old "http." required one extra
#   character and therefore only ever matched https).
# - The dot before the extension is escaped so ".../xjpg" is not an image.
# - Quotes and angle brackets end the URL so a match cannot run from one
#   HTML attribute into the next.
IMG_URL_RE = re.compile(
    r'''https?://[^\s"'<>]*?\.(?:jpg|jpeg|png|gif|bmp)''',
    re.IGNORECASE,
)


def _consoleText(text):
    """Return *text* in a form that can be concatenated with URLs and printed.

    On Python 2 the literals in this file are UTF-8 byte strings; they are
    transcoded to GBK exactly as the original script did (presumably for a
    GBK Windows console). On Python 3 the text is returned unchanged.
    """
    if isinstance(text, bytes):
        return text.decode('UTF-8').encode('GBK')
    return text


def getHtml(url):
    """Fetch *url* and return the response body exactly as read."""
    # closing() guarantees the connection is released even if read() raises;
    # the Python 2 response object is not a context manager by itself.
    with contextlib.closing(urlopen(url)) as page:
        return page.read()


def destDir(path):
    """Create *path* if it does not exist and return it with a trailing separator."""
    if not os.path.isdir(path):
        os.makedirs(path)
    # join(path, '') appends the separator only when it is missing, which
    # replaces the hand-written, backslash-only check.
    return os.path.join(path, '')


def getSuffix(fileurl):
    """Return the text after the last '.' in *fileurl* (the file extension)."""
    return fileurl.split('.')[-1]


def findImgUrls(html):
    """Return the distinct image URLs found in *html*, in order of appearance.

    *html* may be text or, on Python 3, the raw bytes returned by getHtml().
    """
    if isinstance(html, bytes) and not isinstance(html, str):
        # Python 3 only: decode leniently, the URLs themselves are ASCII.
        html = html.decode('utf-8', 'replace')
    seen = set()
    urls = []
    for url in IMG_URL_RE.findall(html):
        if url not in seen:  # download each image only once
            seen.add(url)
            urls.append(url)
    return urls


def getImg(html, n):
    """Download every image referenced in *html* into ``tardir``.

    *n* is the page number; files are named ``<n>_<index>.<extension>`` so
    images from different pages do not overwrite each other. Returns the
    number of images saved.
    """
    destPath = destDir(tardir)
    saved = 0
    for index, imgurl in enumerate(findImgUrls(html), 1):
        target = destPath + '%s_' % n + '%s.' % index + getSuffix(imgurl)
        try:
            urlretrieve(imgurl, target)
        except (IOError, OSError):
            # One unreachable image must not abort the remaining downloads.
            print(_consoleText("失败 ") + imgurl)
            continue
        print(_consoleText("完成 ") + imgurl)
        saved += 1
    return saved


def main():
    """Crawl the first five listing pages and save their images."""
    # range() drives the page number by itself; the original's extra
    # "n = 1" / "n = n + 1" had no effect and were dropped.
    for n in range(1, 6):  # first 5 pages
        pageurl = weburl + "/internet/page/" + str(n)
        html = getHtml(pageurl)
        getImg(html, n)
        print(_consoleText("【完成页面】 ") + pageurl)
    if os.name == 'nt':
        # "pause" is a cmd.exe built-in; it keeps the console window open.
        os.system("pause")


if __name__ == "__main__":
    main()

1 0