爬虫初探
来源:互联网 发布:网络语凸凸凸什么意思 编辑:程序博客网 时间:2024/05/23 23:47
对煎蛋网的图片进行批量下载,但此脚本下载会漏图(与该网站的限制有关)。输出信息时可能存在文字编码问题。
#!usr/bin/env python# -*- coding:utf-8 -*-# URl:http://jandan.net/ooxx/page-+str(1-3000)+#commentsimport multiprocessingimport osimport randomimport urllibimport urllib2from multiprocessing import Pool, Queue, cpu_countimport BeautifulSoupimport reimport requests# r=requests.get("//ww3.sinaimg.cn/bmiddle/7c8e8afbjw1dh9yimwp4xj.jpg")# urllib.urlopen("//ww3.sinaimg.cn/bmiddle/7c8e8afbjw1dh9yimwp4xj.jpg")# 保存一张图片import timeimport sockettimeout = 10socket.setdefaulttimeout(timeout)#获得URL并保存图片'''def saveImg(imageURL, fileName): try: u = urllib2.urlopen(imageURL, timeout=10) data = u.read() f = open(fileName, 'wb') f.write(data) f.close() except: print u"图片地址有问题"'''def saveImg(imageURL,pageIndex,fileName): try: # 选择随机的User-Agent,以做辨别 Referer的作用 user_agent = [ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'Opera/9.25 (Windows NT 5.1; U; en)', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12', 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9', "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ", ] agent = {"User-Agent": random.choice(user_agent)} r = urllib2.build_opener() r.addheaders = [("User-agent", agent), ("Accept", "*/*"), ('Referer', 'http://jandan.net/ooxx/page-' + str(pageIndex))] u = r.open(imageURL) # u = urllib2.urlopen(r)#, timeout=20) data = u.read() f = open(fileName, 'wb') f.write(data) f.close() except urllib2.HTTPError, e: print u"图片地址有问题,httpcode:%s" % e.code# 获取图片存放URLdef getAllImgs(pageIndex): user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent': user_agent} url = 
"http://jandan.net/ooxx/page-" + str(pageIndex) + "#comments" # print url request = urllib2.Request(url, headers=headers) response = urllib2.urlopen(request) r = response.read() #print r # 调用BeautifulSoup库 soup = BeautifulSoup.BeautifulSoup(r).prettify() # print soup # 格式化 # s=soup.find(r"<span.*?righttext.*?<img src=(.*?)/>") pattern = re.compile('<div class="text">.*?<p.*?<img src="(.*?)"', re.S) images = re.findall(pattern, soup) return images #返回list # for item in images: # print itemgetAllImgs(2000)# q=getAllImgs(1)# print q# for pageIndex in range(1,1000):# q = getAllImg(1)# print q# 保存多张写真图片def saveImgs(images, pageIndex=1): number = 1 for imageURL in images: print imageURL splitPath = imageURL.split(".") fTail = splitPath.pop() fileName = str(pageIndex) + "/" + str(number) + "." + fTail if str(imageURL).startswith("h"): saveImg(imageURL, pageIndex,fileName) else: imageURL1 = "http:" + imageURL saveImg(imageURL1,pageIndex, fileName) number += 1#以页数命名创建目录def mkdir(pageIndex=1): path = str(pageIndex) # .strip() # 判断路径是否存在 # 存在 True # 不存在 False isExists = os.path.exists(path) # 判断结果 if not isExists: # 如果不存在则创建目录 print u"偷偷新建了名字叫做", path, u'的文件夹' # 创建目录操作函数 os.makedirs(path) return True else: # 如果目录存在则不创建,并提示目录已存在 print u"名为", path, '的文件夹已经创建成功' return False# 保存一页的图片def savePageInfo(pageIndex): # 获取一页图片的URL time.sleep(2) images = getAllImgs(pageIndex) # 创建保存目录,以页数命名 mkdir(pageIndex) # 保存所有图像 saveImgs(images, pageIndex)def savePagesInfo(start, end): for i in range(start, end + 1): savePageInfo(i)#利用进程池进行下载def pool(start, end): if __name__ == "__main__": #startTime = time.time() p = Pool(cpu_count()) #cpu_count() list = [] numlist = [] startTime=time.time() ''' for i in xrange(481,490): p = multiprocessing.Process(target=savePageInfo, args=(i,)) numlist.append(p) p.start() p.join() print "process end." time.sleep(5) endTime = time.time() print u"used time is ", endTime - startTime print u"太阳出来爬山破咯喂!!!!!!!!!!!!!!!" 
''' for i in xrange(start, end + 1): list.append(i) result = p.map_async(savePageInfo, list) print "Waiting for all subprocess done..." flag = True while (flag): time.sleep(5) proces_count = len(p._cache) if proces_count != 0: print "%s processes running" % len(p._cache) print "开始下载图片的process %s" % os.getpid() else: print u"all processes are finished!!" flag = False p.close() p.join() endTime = time.time() print "xiazaihaole" print u"used time is ", endTime - startTimepool(1,50)
0 0
- 爬虫初探
- Python爬虫:初探多线程爬虫
- 写网络爬虫初探
- 网络爬虫初探
- 爬虫--scrapy 初探
- Python爬虫初探
- 初探python爬虫
- 豆瓣电影爬虫初探
- Python爬虫讲座初探
- Python 爬虫初探
- 网络爬虫初探
- python爬虫初探
- scrapy(一)之初探爬虫
- WebMagic Java爬虫框架初探
- 【笔记】Python爬虫之初探
- 微信群分享:Python网络爬虫初探
- scrapy初探:写一个简单的爬虫
- 爬虫初探(一),获取一个页面
- [python]WindowsError的错误代码详解
- 线程池的简单使用
- 在myeclispse里部署自己的项目到tomcat中
- Altium Designer 2013使用经验
- 剖析Vue原理&实现双向绑定MVVM
- 爬虫初探
- ubuntu samba 配置成不需要用户密码访问
- python getpass模块不起效果
- 绘制一个正五角星的局部上色
- 3-Hive SQL
- app端维持用户登录状态
- 复习1
- java7和java8的垃圾回收
- 大写的尴尬——我是不是看了假的足球赛