Scraping High-Resolution Images with Python


I left this unfinished for quite a while; time to fill in the gap.
Environment: Windows 10, Python.
I have previously crawled a couple of image sites, https://images.pexels.com and https://unsplash.com. Based on material I found online plus my own reading, here is the code.

import requests
import re
import os
import time

def get_url(url):
    kw = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    try:
        r = requests.get(url, headers=kw)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        print('Request failed!')

def get_photourl(photo_url):
    kw = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
    try:
        r = requests.get(photo_url, headers=kw)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        return 'wrong'

def get_photos(url, new_fpath):
    result = get_url(url)
    # The real download links start with static, not images
    pattern = re.compile(r'src="https://images.pexels.com/photos/(\d+)/(.*?)\?h=350&auto=compress&cs=tinysrgb"', re.S)
    items = re.findall(pattern, result.text)
    for item in items:
        try:
            # Rewrite the images host to static to get the downloadable file
            photo_url = 'https://static.pexels.com/photos/' + str(item[0]) + '/' + str(item[1])
            print(photo_url)
            save(photo_url, item, new_fpath)
            time.sleep(1)
        except:
            continue

def makedir(new_fpath, i, key):
    if not os.path.exists(new_fpath):
        os.makedirs(new_fpath)
        os.chdir(new_fpath)
        print('Created folder ' + key + '_page' + str(i + 1))
    else:
        print('Folder already exists!')

def save(photo_url, item, new_fpath):
    Final_fpath = new_fpath + '/' + str(item[0]) + str(item[1])
    print('Downloading image...')
    result = get_photourl(photo_url)
    if result != 'wrong':
        print('Fetched successfully!')
    else:
        print('Fetch failed')
    if not os.path.exists(Final_fpath):
        try:
            with open(Final_fpath, 'wb') as f:
                f.write(result.content)
        except:
            print('Save failed!')
    else:
        print('Image already exists')

def main():
    key = input('Enter a search keyword (in English): ')
    url = 'https://www.pexels.com/search/' + key + '/'
    num = int(input('Enter how many pages to download: '))  # downloading starts from page 1 by default
    fpath = '*****'
    for i in range(num):
        new_fpath = fpath + '/Photo2.0/' + key + '_page' + str(i + 1)
        makedir(new_fpath, i, key)
        if i >= 1:
            new_url = url + '?page=' + str(i + 1)
            print(new_url)
            get_photos(new_url, new_fpath)
        else:
            get_photos(url, new_fpath)
        time.sleep(3)

main()
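
A side note on save(): result.content pulls the whole image into memory before it is written out, so large photos cost a lot of RAM. A streamed download keeps memory use flat. The sketch below is my own addition, not part of the original script; it only relies on requests' documented stream=True and iter_content(), and the helper name and chunk size are arbitrary choices:

import os
import requests

def download_stream(photo_url, final_fpath, headers=None):
    # Hypothetical helper: stream the image to disk in 64 KB chunks
    # instead of holding the whole file in memory.
    if os.path.exists(final_fpath):
        return
    with requests.get(photo_url, headers=headers, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(final_fpath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)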

Crawling https://unsplash.com requires Selenium to simulate scrolling down the page. Install it with pip, then download chromedriver and place it in Chrome's installation directory (Chrome\Application).
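
If you would rather not copy chromedriver into the Chrome folder, Selenium can also be told where the driver lives. A minimal sketch, assuming the Selenium 3 executable_path keyword and a placeholder driver path of my own choosing:

# pip install selenium
from selenium import webdriver

# Placeholder path: point this at wherever you saved chromedriver
driver = webdriver.Chrome(executable_path=r'C:\tools\chromedriver.exe')
driver.get('https://unsplash.com')
driver.quit()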

from selenium import webdriver  # drives the browser to simulate scrolling
import urllib.request           # urlretrieve() downloads and saves the images
import re
import time

class Unsplash:
    def __init__(self):
        self.url = 'https://unsplash.com/search/photos/label'  # page to request
        self.save_path = "****"                                # local save path for images
        self.driver = webdriver.Chrome()
        # self.driver = webdriver.PhantomJS()  # headless alternative

    # Scroll to the bottom of the page `times` times and return the page source
    def do_scroll(self, times):
        driver = self.driver
        driver.get(self.url)
        # Simulate scrolling so more images are lazily loaded
        for i in range(times):
            print('Scrolling, pass ' + str(i + 1))
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print('Waiting for page load, pass ' + str(i + 1))
            time.sleep(40)
        # Return the raw HTML (an lxml etree could also be built here for XPath lookups)
        html = driver.page_source
        return html

    # Download one image and save it locally
    def save_img(self, src, img_name):
        urllib.request.urlretrieve(src, filename=self.save_path + img_name)

    # Extract the image URLs from the page source and download them
    def get_pic(self, html):
        pattern = re.compile(r'img src="https://images.unsplash.com/photo(.*?)"', re.S)
        items = re.findall(pattern, html)
        count = 1
        for url in items:
            # Rebuild the full image address from the captured suffix
            url = 'https://images.unsplash.com/photo' + url
            count += 1
            img_name = str(count) + '.jpg'
            self.save_img(url, img_name)

    def main(self):
        # Scroll 20 times, then download everything found in the source
        html = self.do_scroll(20)
        print('Starting image download')
        self.get_pic(html)

img = Unsplash()
img.main()
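
The fixed time.sleep(40) after every scroll is safe but very slow. A common alternative is to poll document.body.scrollHeight and stop once the page stops growing. This is a rough sketch of my own, not the original author's code; the 5-second pause and the max_scrolls cap are arbitrary:

import time

def do_scroll_until_stable(driver, url, max_scrolls=20, pause=5):
    # Hypothetical variant of do_scroll(): keep scrolling until the page
    # height stops increasing or max_scrolls is reached.
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    return driver.page_source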