Crawler Learning Diary 2: Scraping Related Images (with Code)
1. Environment
Windows (64-bit), Python 2.7
2. Dependencies
Requests, BeautifulSoup
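Both can be installed with pip, e.g. `pip install requests beautifulsoup4` (beautifulsoup4 is the package that provides the `bs4` module imported below).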
3. Steps
- Create an ImageCri folder on the F: drive (a sketch that automates this step follows the list).
- Run the program.
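A minimal sketch for the first step, using the same `F:\ImageCri` path that the code in section 4 writes to (the `SAVE_ROOT` constant is my own name for it, not part of the original script):

```python
# coding: utf-8
# Create the output folder from step 1 if it does not exist yet (Python 2.7).
import os

SAVE_ROOT = 'F:\\ImageCri'  # same path the script in section 4 writes to

if not os.path.isdir(SAVE_ROOT):
    os.makedirs(SAVE_ROOT)
```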
4. Code
```python
# coding: utf-8
import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import os
import requests
import time
import random


def get_url_onepage(url):
    # Open the URL
    reponse_url1 = urllib2.urlopen(url)
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(reponse_url1, 'html.parser')
    # Regular expression matching the article links we want
    pattern = re.compile(r'/arthtml/(\d)+(.)html')
    all_link_inonepage = []
    # Collect the matching URLs
    for link in soup.find_all('a'):
        link_match = link.get('href')
        if link_match is None:  # some <a> tags carry no href
            continue
        match = pattern.match(link_match)
        if match:
            a = 'http://www.99v3.com' + match.group()
            all_link_inonepage.append(str(a))
    return all_link_inonepage


def get_head_url(start_page, end_page):
    if start_page == 1:
        return None
    # Build the list-page URLs to visit
    head_list = ['http://www.99v3.com/arttypehtml/2-{}.html'.format(str(i))
                 for i in range(start_page, end_page)]
    return head_list


# Fetch the images on one article page
def cripp_image(url):
    print '*' * 20
    print url
    print '*' * 20
    rest_time = random.random() * 10
    time.sleep(rest_time)
    # Send a Mozilla User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    req = urllib2.Request(url=url, headers=headers)
    reponse_url1 = urllib2.urlopen(req)
    soup_image = BeautifulSoup(reponse_url1, 'html.parser')
    title = soup_image.title.string.split('[')[0]
    print title
    os.mkdir('F:\\ImageCri\\' + title)
    image_num = 0
    for link in soup_image.find_all('img'):
        link_match = link.get('src')
        print link_match
        image_name = 'F:\\ImageCri\\' + title + '\\' + '{}.jpg'.format(str(image_num))
        image_num = image_num + 1
        rest_time = random.random() * 10
        time.sleep(rest_time)
        urllib.urlretrieve(link_match, image_name)


# Collect every article URL from the list pages
def get_all_url_2(start, end):
    url = ['http://www.b9f7.com/AAtupian/AAAtb/zipai/index-{}.html'.format(str(x))
           for x in range(start, end)]
    all_link = []
    for i in url:
        # Send a browser header
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        req = urllib2.Request(url=i, headers=headers)
        time.sleep(random.random() * 4)
        reponse_url1 = urllib2.urlopen(req)
        pattern = re.compile(r'/AAtupian/AAAwz/(\d)+(\w)+(.)html')
        soup_image = BeautifulSoup(reponse_url1, 'html.parser')
        for link in soup_image.find_all('a'):
            link_match = link.get('href')
            if link_match is not None:
                match = pattern.match(link_match)
                if match:
                    a = 'http://www.b9f7.com' + match.group()
                    all_link.append(a)
    return all_link


# Download the images on one article page
def get_img_2(url):
    str_url = str(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    req = urllib2.Request(url=str_url, headers=headers)
    time.sleep(random.random() * 4)
    reponse_url1 = urllib2.urlopen(req)
    soup_image = BeautifulSoup(reponse_url1, 'html.parser')
    title = unicode(soup_image.title.string.split('>')[1].split('<')[0])
    # Path where the images are saved; change it if you like
    os.mkdir('F:\\ImageCri\\' + title)
    image_num = 0
    for link in soup_image.find_all('img'):
        link_match = link.get('src')
        print link_match
        image_num = image_num + 1  # increment once (the original incremented twice and skipped numbers)
        image_name = 'F:\\ImageCri\\' + title + '\\' + '{}.jpg'.format(str(image_num))
        rest_time = random.random() * 4
        time.sleep(rest_time)
        content = requests.get(link_match, timeout=180).content
        with open(image_name, "wb") as f:
            f.write(content)


# RUN enjoy :)
if __name__ == "__main__":
    itera = 0
    for one_html in get_all_url_2(6, 10):
        itera = itera + 1
        print 'page {} '.format(itera) + '-' * 50
        try:
            print one_html
            get_img_2(one_html)
        except Exception:  # skip pages that fail and move on
            continue
```
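One caveat: the main loop's `except Exception: continue` silently drops any page that fails. A minimal retry sketch (the `fetch_with_retry` helper is hypothetical, not part of the original post) that retries transient network errors instead of discarding the page:

```python
# coding: utf-8
# Hypothetical retry helper (not in the original post): retry transient
# network failures with exponential backoff instead of dropping the page.
import time
import requests

def fetch_with_retry(url, retries=3, timeout=30):
    """Return the response body, or None after `retries` failed attempts."""
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()  # raise on HTTP 4xx/5xx
            return resp.content
        except requests.RequestException as e:
            print 'attempt %d/%d for %s failed: %s' % (attempt, retries, url, e)
            time.sleep(2 ** attempt)  # back off: 2s, 4s, 8s...
    return None
```

With this in place, the `requests.get(link_match, timeout=180).content` call in `get_img_2` could become `content = fetch_with_retry(link_match)`, followed by a `None` check before writing the file.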
An alternate entry point is kept commented out at the bottom of the script; it crawls http://www.99v3.com through `get_head_url`, `get_url_onepage`, and `cripp_image` instead:

```python
# Alternate entry point, for www.99v3.com:
# head_in_each_page = get_head_url(6, 10)
# all_link = []
# for i in head_in_each_page:
#     all_link.append(get_url_onepage(i))
# for i in range(len(all_link)):
#     for j in all_link[i]:
#         cripp_image(j)
```

5. Results