Scraping and downloading images from a Russian plant website with Python


Site link: http://www.plantarium.ru/page/samples/taxon/41302.html

The images on this site have to be reached level by level through the taxonomy, and the pages load slowly and fail easily, so I decided to download the images locally to make them easier to look up. Hence this little crawler.
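To make the level-by-level traversal concrete, here is a minimal sketch of a single step (a simplification of the full script below; it uses plain requests and a link pattern matching the one the script uses, and assumes nothing beyond the entry URL above). It fetches one taxon page and prints its child taxon links; the full script simply repeats this step at every rank of the hierarchy.

# -*- coding: utf-8 -*-
# Minimal sketch: one traversal step of the taxon hierarchy.
import re
import requests

base = 'http://www.plantarium.ru'
html = requests.get(base + '/page/samples/taxon/41302.html', timeout=20).text
# Each taxon page links to its children via /page/samples/taxon/... URLs.
for link in re.findall(r'href="(/page/samples/taxon/.+?\.html)', html):
    print(base + link)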


# -*- coding: utf-8 -*-
import re, os, requests, urllib2, chardet, time, sys   # requests and chardet must be installed separately

# Preserve the standard streams across reload(sys) so print keeps working
stdi, stdo, stde = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdi, stdo, stde
sys.setdefaultencoding('utf-8')

# Fetch a page and return its source only (no retries)
def only_content(url):
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    request = requests.get(url, timeout=20, headers=headers)
    content = request.text
    return content

# Fetch a page and return the parts matching the regex, retrying up to 10 times
def get_content(url, reg):
    i = 0
    p = True
    want = []   # initialize so the function cannot return an unbound name if every attempt fails
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p and i <= 10:
        try:
            request = requests.get(url, timeout=20, headers=headers)
            content = request.text
            want = reg.findall(content)
            if want == []:
                i += 1
                print 'get none, I will try again'
            #   time.sleep(1)
            else:
                print 'get success!'
                p = False
        except:
            i += 1
            print 'get wrong, please wait 2 seconds!'
            time.sleep(2)
    return want

# Fetch a page with explicit transcoding, in case the odd page is not UTF-8
def for_change(url, reg):
    p = True
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    while p:
        try:
            request = urllib2.Request(url, headers=headers)
            req = urllib2.urlopen(request, timeout=20)
            res = req.read()
            enc = chardet.detect(res)['encoding']
            print u'This page is encoded as ' + enc
            content = res.decode(enc).encode('utf-8')
            want = reg.findall(content)
            print 'get success!'
            p = False
        except:
            print 'get wrong, please wait 10 seconds!'
            time.sleep(10)
    return want

# Create a folder if it does not already exist
def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)

# Save one image, retrying up to 10 times
def download_image(imageurl, imagename):
    i = 0
    p = True
    while p and i <= 10:
        try:
            data = requests.get(imageurl, timeout=20).content
            with open(imagename, 'wb') as f:
                f.write(data)
            p = False
        except:
            i += 1
            print 'save picture wrong, please wait 2 seconds'
            time.sleep(2)

# Main program
if __name__ == '__main__':
    path = 'D:\\Russian_pictures\\'
    create_folder(path)
    n = 0        # image counter
    order = []   # order-level URLs
    family = []  # family-level URLs
    genus = []   # genus-level URLs

    # Extract the top-level groups (monocots and dicots)
    url = "http://www.plantarium.ru"
    url1 = url + '/page/samples/taxon/41302.html'
    a1 = re.compile(r'href="(/page/samples/taxon.+?\.html)', re.I)  # unnamed group: capture only the link itself
    u1 = get_content(url1, a1)
    print u1

    # Extract orders
    for u11 in u1:
        url2 = url + u11
        a2 = re.compile(r'href="(/page/samples/taxon.+?\.html)', re.I)
        u2 = get_content(url2, a2)
        u2.pop(0)  # drop the first link (the parent page itself)
        order.extend(u2)
    print 'It has ' + str(len(order)) + ' orders'

    # Extract families
    for u22 in order:
        url3 = url + u22
        a3 = re.compile(r'href="(/page/samples/taxon.+?\.html)', re.I)
        u3 = get_content(url3, a3)
        u3.pop(0)  # drop the two leading ancestor links
        u3.pop(0)
        family.extend(u3)
    print 'It has ' + str(len(family)) + ' families'

    # Extract genera
    for u33 in family:
        url4 = url + u33
        a4 = re.compile(r'href="(/page/samples/taxon.+?\.html)', re.I)
        u4 = get_content(url4, a4)
        u4.pop(0)  # drop the three leading ancestor links
        u4.pop(0)
        u4.pop(0)
        genus.extend(u4)
    print 'It has ' + str(len(genus)) + ' genera'

    # Download species images, pulled directly from each genus page
    for u44 in genus:
        url5 = url + u44
        print url5
        a5 = re.compile(r'href="(/page/view/item/.+?\.html)', re.I)
        b5 = re.compile(r'this,event.+?">(.+?)</a>', re.I)
        u5 = get_content(url5, a5)
        n5 = get_content(url5, b5)  # breadcrumb names, used to build this genus's folder path
        pat = path
        for pa in n5:
            pat = pat + pa + '\\'
            create_folder(pat)
        u5 = set(u5)  # deduplicate the species links of this genus

        # Work out how many result pages each species has
        for u55 in u5:
            pp = True
            num = 0  # give up on this page after too many errors
            url6 = url + u55
            # The Russian text here would not match with a regex, for a reason
            # I could not figure out (if anyone knows why, please advise!)
            '''
            a6 = re.compile(r'из (.+?) найденных изображений')
            page = int(get_content(url6, a6)[0]) / 30 + 1
            '''
            # Use split() instead
            while pp and num <= 10:
                try:
                    number = only_content(url6).split('найденных изображений')[0].split('Показаны')[1].split('из ')[1]
                    print number
                    page = int(number) / 30 + 1   # 30 thumbnails per result page
                    pp = False
                    for i in range(0, page):
                        url7 = url6.replace('view/item', 'view/part/' + str(i) + '/item')
                        a7 = re.compile(r'href="(/page/image/id/.+?\.html)', re.I)
                        u7 = get_content(url7, a7)
                        # Download every image on this result page
                        for u77 in u7:
                            n += 1
                            url_every = url + u77
                            name_a = re.compile(r'<title>.+?([a-zA-Z]+ +[a-zA-Z]*).+?</title>', re.I)
                            image_a = re.compile(r'src="(.+?\.jpg)" width=', re.I)
                            name = get_content(url_every, name_a)[0].strip() + '-' + str(n) + '.jpg'
                            print name
                            image_name = pat + name
                            image_url = url + get_content(url_every, image_a)[0]
                            download_image(image_url, image_name)
                            print str(n) + ' now'
                except:
                    num += 1
                    print 'page is not get, please wait 2 seconds'
                    time.sleep(2)
    print 'all ' + str(n) + ' download over'
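A plausible explanation for the regex failure noted in the code (an assumption, not verified against the live site): under Python 2, re.compile(r'из ...') compiles a UTF-8 byte-string pattern, while requests' .text is a unicode string, so the multi-byte Cyrillic bytes in the pattern never line up with the unicode code points in the page. Compiling the pattern as a unicode literal should make it match:

# Hypothetical fix for the commented-out Russian regex (Python 2):
# compile the pattern as a unicode literal so Cyrillic code points are
# compared against requests' unicode .text rather than UTF-8 bytes.
a6 = re.compile(ur'из (.+?) найденных изображений')
number = a6.findall(only_content(url6))[0]
page = int(number) / 30 + 1   # 30 thumbnails per result page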

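Since the script targets Python 2 (print statements, urllib2, reload(sys)), here is a minimal Python 3 sketch of the core retry-and-save helper for readers on a modern interpreter. It is an illustrative port, not part of the original script; the function and parameter names are mine, and the retry count and timeout mirror the original.

# Python 3 sketch of the retrying downloader used above (illustrative port).
import time
import requests

def download_image(image_url, image_name, retries=10):
    # Mirror the original: up to `retries` attempts, 2-second pause after a failure.
    for attempt in range(retries):
        try:
            data = requests.get(image_url, timeout=20).content
            with open(image_name, 'wb') as f:
                f.write(data)
            return True
        except requests.RequestException:
            print('save picture wrong, retrying in 2 seconds')
            time.sleep(2)
    return False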
