python 爬虫图片
来源:互联网 发布:黑马程序员框架视频 编辑:程序博客网 时间:2024/06/05 23:53
#!/usr/bin/env python
# encoding: utf-8
"""
@author: caopeng
@license: (C) Copyright 2013-2017, Node Supply Chain Manager Corporation Limited.
@contact: deamoncao100@gmail.com
@software: garner
@file: movie1.py
@time: 2017/9/16 0016 14:49
@desc: Threaded image crawler. One producer thread (getUrl) reads listing
       pages and queues profile links; a pool of worker threads (getImg)
       downloads every image found on each profile's aiShow page.
"""
import os
import queue
import re
import threading
import time
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# Socket timeout (seconds) applied to every HTTP request so that a stalled
# connection cannot block a thread forever.
REQUEST_TIMEOUT = 30

# Extracts the numeric user id from a profile link such as ".../12345.htm".
_USER_ID_RE = re.compile(r"/(\d+)\.htm")

# Characters that must not end up in a directory name built from a page title.
_UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch(url, headers=None):
    """Return the raw response body of *url* as bytes, closing the connection."""
    request = urllib.request.Request(url, headers=headers or {})
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        return response.read()


def getUrl(name, hostUrls, girlsUrls, flag):
    """Producer: turn listing-page URLs into profile URLs.

    Takes listing-page URLs from ``hostUrls``, fetches each page (decoded as
    GBK), and puts ``"https:" + href`` on ``girlsUrls`` for every
    ``<a class="lady-avatar">`` element found.

    Args:
        name: Label of the thread (kept for interface compatibility).
        hostUrls: ``queue.Queue`` of listing-page URLs to read.
        girlsUrls: ``queue.Queue`` receiving the profile URLs.
        flag: ``threading.Event``; the loop stops once it is set.

    Returns when ``flag`` is set or when ``hostUrls`` stays empty for two
    seconds. ``hostUrls.task_done()`` is called exactly once per item taken,
    even when the page cannot be fetched or parsed, so ``hostUrls.join()``
    never blocks on a failed page.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    while not flag.is_set():
        try:
            hostUrl = hostUrls.get(timeout=2)
        except queue.Empty:
            print("queue empty")
            return
        try:
            data = _fetch(hostUrl, headers).decode('gbk', errors='replace')
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(data, 'html.parser')
            for tag_href in soup.find_all("a", attrs={"class": "lady-avatar"}):
                href = tag_href.get('href')
                if not href:
                    continue
                girlsUrls.put("https:" + href)
                print("录入:https:" + href)
        except Exception as exc:
            # Thread boundary: report the failure and carry on with the next
            # page instead of dying with the queue item still unaccounted for.
            print("getUrl failed for " + str(hostUrl) + ": " + repr(exc))
        finally:
            hostUrls.task_done()
        print("getUrl is working")


def _image_url(link):
    """Return an absolute URL for an ``<img src>`` value.

    Links that already carry an http(s) scheme are used as-is; anything else
    is treated as protocol-relative and prefixed with "http:".
    """
    link = str(link)
    if link.startswith(('http://', 'https://')):
        return link
    return "http:" + link


def _download_profile(name, profile_url, headers, base_dir):
    """Download all images of one profile into ``base_dir/<page title>/``."""
    match = _USER_ID_RE.search(profile_url)
    if match is None:
        print(str(name) + " no user id in:" + str(profile_url))
        return
    user_id = match.group(1)
    girlUrl = "https://mm.taobao.com/self/aiShow.htm?userId=" + user_id

    soup = BeautifulSoup(_fetch(girlUrl, headers), 'html.parser')
    tag_div = soup.find('div', attrs={"class": "mm-aixiu-content"})
    imgs = tag_div.find_all("img") if tag_div is not None else []
    if not imgs:
        return

    # The page title names the output directory. It comes from the remote
    # page, so strip anything that could act as a path separator, and fall
    # back to the user id when no usable title is left.
    title = soup.title.get_text().strip() if soup.title is not None else ''
    folder = _UNSAFE_NAME_RE.sub('_', title)
    if not folder or folder in ('.', '..'):
        folder = user_id
    path = base_dir + '/' + folder
    # exist_ok avoids a race when two threads create the same directory.
    os.makedirs(path, exist_ok=True)

    # Files are numbered by the image's position on the page; images without
    # a src still consume a number.
    for n, img in enumerate(imgs, start=1):
        link = img.get('src')
        if not link:
            continue
        s = _image_url(link)
        # Take the extension from the URL path only, ignoring any query string.
        ext = os.path.splitext(urllib.parse.urlparse(s).path)[1]
        pathfile = path + '/' + str(n) + ext
        try:
            imgData = _fetch(s)
            with open(pathfile, 'wb') as f:
                f.write(imgData)
        except Exception:
            # Best effort: one broken image must not abort the others.
            print(str(name) + " thread write false:" + s)
        else:
            print("thread " + name + " write:" + pathfile)


def getImg(name, girlsUrls, flag, base_dir=None):
    """Worker: download the images of every profile URL in ``girlsUrls``.

    Args:
        name: Label of the thread, used in progress messages.
        girlsUrls: ``queue.Queue`` of profile URLs produced by ``getUrl``.
        flag: ``threading.Event``; the loop stops once it is set.
        base_dir: Directory under which one sub-directory per profile is
            created. Defaults to the current working directory.

    Returns when ``flag`` is set or when ``girlsUrls`` stays empty for five
    seconds. ``girlsUrls.task_done()`` is called exactly once per item taken,
    whether the profile succeeded, had no images, or failed, so
    ``girlsUrls.join()`` cannot dead-lock and a profile without images no
    longer terminates the worker.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 '
                  'Edge/12.10240')
    headers = {'User-Agent': user_agent}
    if base_dir is None:
        base_dir = os.getcwd()
    while not flag.is_set():
        try:
            ur = girlsUrls.get(timeout=5)
        except queue.Empty:
            print(name + " imgqueue empty")
            return
        try:
            _download_profile(name, ur, headers, base_dir)
        except Exception as exc:
            # Thread boundary: report and continue with the next profile.
            print(str(name) + " thread failed on " + str(ur) + ": " + repr(exc))
        finally:
            girlsUrls.task_done()


def main():
    """Crawl listing pages 1-2 with one producer and eight download workers."""
    start = time.time()
    hostUrls = queue.Queue()
    girlsUrls = queue.Queue()
    cdir = os.getcwd()
    url = 'https://mm.taobao.com/json/request_top_list.htm?page='
    flag_girl = threading.Event()
    flag_img = threading.Event()
    for i in range(1, 3):
        hostUrls.put(url + str(i))

    threads_girl = threading.Thread(
        target=getUrl, args=(str(1), hostUrls, girlsUrls, flag_girl))
    threads_img = [
        threading.Thread(target=getImg,
                         args=(str(i + 1), girlsUrls, flag_img, cdir))
        for i in range(8)
    ]

    threads_girl.start()
    # Hold the workers back until there is something to do, but stop waiting
    # if the producer has already finished without finding any link.
    while girlsUrls.empty() and threads_girl.is_alive():
        print("wait..")
        time.sleep(0.1)
    for t in threads_img:
        t.start()

    hostUrls.join()
    flag_girl.set()
    threads_girl.join()
    girlsUrls.join()
    flag_img.set()
    for t in threads_img:
        t.join()

    end = time.time()
    print("run time:" + str(end - start))


if __name__ == '__main__':
    main()
阅读全文
0 0
- python多线程图片爬虫
- python爬虫抓取图片
- python实现图片爬虫
- python多线程图片爬虫
- python图片小爬虫
- python 爬虫 爬下图片
- Python网页图片爬虫
- Python爬虫,抓图片
- [Python爬虫]爬取贴吧图片
- Python 图片爬虫
- python 图片小爬虫
- python爬虫之图片
- Python爬虫网页图片
- Python简单图片爬虫
- python 爬虫下载图片
- python图片爬虫
- python爬虫网站图片
- python 爬虫图片
- xib关联不到文件里
- jenkins服务器的搭建和配置
- offer40--和为s的连续正数序列
- conflicts with existing, non-compatible bean definition of same name and class
- Java常见集合框架(四):List之AbstractSequentialList、LinkedList
- python 爬虫图片
- Android系统启动流程(一)解析init进程启动过程
- [转]JavaScript 数字与字符串 比较大小
- Axure学习之百度登录界面原型图
- UART串口通信浅谈之(三)--字符与数据的转换
- java基础_设计模式_设计基础(小鸭子游戏)
- 客户端js(BOM&DOM)
- Android ADB安装和卸载或删除Android设备中的APP
- java项目前后台怎么交互详解