python数据挖掘
来源:互联网 发布:淘宝网首页官网床垫 编辑:程序博客网 时间:2024/06/05 17:25
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Bulk-download Baidu image-search results for a list of keywords.

Reads keywords from ``secondname.txt`` (one per line), skips any already
recorded in ``names.txt``, then pages through Baidu's image-search "flip"
endpoint and downloads every image URL found into a per-keyword folder
next to the script.
"""
import os
import re
import sys
import threading
import urllib.error
import urllib.parse
import urllib.request
from multiprocessing.dummy import Pool  # thread pool: the work is I/O-bound

# Sequence number used to name downloaded files (1.jpg, 2.jpg, ...).
# Shared by the worker threads, so every update must hold _COUNT_LOCK:
# a bare ``+=`` on an int is not atomic across threads.
CONSTANT = 0
_COUNT_LOCK = threading.Lock()


def name_read():
    """Return keywords from secondname.txt that are absent from names.txt.

    Duplicates within secondname.txt are dropped while preserving order.
    """
    with open('secondname.txt', 'r', encoding='UTF-8') as f:
        wanted = f.read().split('\n')
    with open('names.txt', 'r', encoding='UTF-8') as f:
        already_done = set(f.read().split('\n'))
    fresh = []
    for name in wanted:
        if name not in already_done and name not in fresh:
            fresh.append(name)
    return fresh


def name_write(name):
    """Append *name* to names.txt, the record of processed keywords."""
    os.chdir(sys.path[0])  # names.txt lives next to the script
    with open('names.txt', 'a', encoding='UTF-8') as f:
        f.write(name + '\n')


def download(download_info):
    """Thread-pool worker: fetch one image URL and save it as <seq>.jpg.

    *download_info* is a ``(url, file_name)`` tuple; the supplied file name
    is ignored and replaced by the shared sequence counter.  Retries up to
    6 times, then gives up silently (best effort — a failed image is not
    worth aborting the whole scrape).
    """
    global CONSTANT
    url, _unused_name = download_info
    for _attempt in range(6):
        with _COUNT_LOCK:
            CONSTANT += 1
            file_name = str(CONSTANT) + ".jpg"
        try:
            with urllib.request.urlopen(url, timeout=2) as response, \
                    open(file_name, 'wb') as out_file:
                out_file.write(response.read())
            return
        except (urllib.error.URLError, OSError, ValueError):
            with _COUNT_LOCK:  # roll the counter back so numbering stays dense
                CONSTANT -= 1
            #print('Download failed: %s' % (url))


def mass_download(urls, nthread):
    """Fan the URL list out across *nthread* download worker threads."""
    #print('Downloading...')
    download_infos = [(url, os.path.basename(url)) for url in urls]
    with Pool(nthread) as pool:
        pool.map(download, download_infos)


def get_html(url_path):
    """Fetch *url_path* and return its body as text; None after 5 failures."""
    #print('Fetching html...')
    for _attempt in range(5):
        try:
            with urllib.request.urlopen(url_path) as response:
                # Decode rather than str(bytes): str() yields the "b'...'"
                # repr, whose \xNN escape sequences cut objURL matches short.
                return response.read().decode('utf-8', errors='replace')
        except (urllib.error.URLError, OSError, ValueError):
            pass
    #print('Fetching html failed...')
    return None


def get_image_urls(html_content):
    """Extract the objURL image links from a Baidu search-result page."""
    print('Parsing html...')
    exp = r'objURL":"([a-z.:/_A-Z0-9]*)"'
    image_urls = re.findall(exp, html_content)
    #print('%d images found in this page' % (len(image_urls)))
    return image_urls


num_image = 169  # stop after roughly this many images per keyword
nthread = 8      # number of concurrent download threads


def main():
    """Drive the scrape: one folder of up to *num_image* images per keyword."""
    global CONSTANT
    for name in name_read():
        #print('downloading %s' % name)
        name_write(name)
        # Percent-encode the keyword for the query string.  The original
        # repr()-based hack upper-cased any ASCII letters in the keyword.
        key_word = urllib.parse.quote(name)
        dest_folder = os.path.join(sys.path[0], name)
        if os.path.exists(dest_folder):
            print("文件夹已经存在")
            continue
        os.makedirs(dest_folder)
        os.chdir(dest_folder)  # download() writes into the current directory
        pn = 0
        cnt = 0
        CONSTANT = 0
        downloaded = set()  # de-duplicate URLs across result pages
        while cnt < num_image:
            print("Page %d:" % (pn + 1))
            image_urls = []
            try:
                url = ("http://images.baidu.com/search/flip?"
                       "tn=baiduimage&ie=utf-8&word=%s&pn=%d&gsm=0"
                       % (key_word, pn * 15))
                html_content = get_html(url)
                for candidate in get_image_urls(html_content or ''):
                    if candidate not in downloaded:
                        downloaded.add(candidate)
                        image_urls.append(candidate)
                mass_download(image_urls, nthread)
            except KeyboardInterrupt:
                sys.exit()
            except Exception:
                pass  # best effort: one bad page must not kill the run
            pn += 1
            cnt += len(image_urls)
            if pn > 30:  # hard cap on pages per keyword
                print("done")
                break


if __name__ == "__main__":
    main()
阅读全文
0 0
- Python 数据挖掘小结
- python数据挖掘
- python数据挖掘orange
- python 数据挖掘
- python数据挖掘
- Python数据挖掘-文本挖掘
- Python 数据挖掘推荐模块
- python数据挖掘领域工具包
- python数据挖掘领域工具包
- python数据挖掘领域工具包
- python数据挖掘领域工具包
- python数据挖掘领域工具包
- 数据挖掘 R VS Python
- python数据挖掘常用包
- 利用 Python 练习数据挖掘
- 利用 Python 学习数据挖掘
- python 数据挖掘基础 入门
- python数据挖掘领域工具包
- org.hibernate.StaleStateException
- 中国IT界的风投
- Keras入门课3 -- 使用CNN识别cifar10数据集
- Codeforces #441 Div1.D:(扩展Lucas)
- leetcode candy分糖果问题 两次遍历和一次遍历
- python数据挖掘
- Vuejs全家桶系列(三)--- 生命周期和钩子函数
- php debug下载 对应版本
- C语言趣味一百道 第16道 2017_12_18
- ubuntu16.04安装Hadoop及Spark
- 责任链模式的简单使用
- CSS书写规范
- 树莓派3B Linux下设置程序自启动2
- 蓝桥杯ALGO-6安慰奶牛