Python data mining

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# Bulk-download Baidu image-search results for a list of keywords,
# fetching several images at once with a thread pool.
import urllib.request
import re
import os
import sys
from multiprocessing.dummy import Pool  # a thread pool, despite the module name


def name_read():
    """Return the keywords in secondname.txt that are not yet in names.txt,
    with duplicates removed."""
    judge = []
    with open('secondname.txt', 'r', encoding='UTF-8') as f:
        input_name = f.read().split('\n')
    with open('names.txt', 'r', encoding='UTF-8') as f:
        names = f.read().split('\n')
    for name in input_name:
        if name not in names and name not in judge:
            judge.append(name)
    return judge


def name_write(name):
    """Append a processed keyword to names.txt in the script's directory."""
    os.chdir(sys.path[0])
    with open('names.txt', 'a', encoding='UTF-8') as f:
        f.write(name + '\n')


def download(download_info):
    """Fetch one image, retrying up to 6 times. Files are named from a
    shared global counter (see the note on thread safety below)."""
    global CONSTANT
    (url, file_name) = download_info
    for i in range(6):
        try:
            CONSTANT += 1
            file_name = str(CONSTANT) + ".jpg"  # counter name overrides the URL basename
            with urllib.request.urlopen(url, timeout=2) as response, \
                    open(file_name, 'wb') as out_file:
                out_file.write(response.read())
            return
        except Exception:
            CONSTANT -= 1  # roll the counter back so the numbering stays dense
    # print('Download failed: %s' % url)


def mass_download(urls, nthread):
    download_infos = [(url, os.path.basename(url)) for url in urls]
    with Pool(nthread) as pool:
        pool.map(download, download_infos)


def get_html(url_path):
    """Fetch a results page, retrying up to 5 times; returns None on failure."""
    for i in range(5):
        try:
            with urllib.request.urlopen(url_path) as url:
                return str(url.read())
        except Exception:
            pass


def get_image_urls(html_content):
    """Pull the objURL fields out of Baidu's flip-page markup."""
    print('Parsing html...')
    exp = 'objURL":"([a-z.:/_A-Z0-9]*)"'
    return re.findall(exp, html_content)


num_image = 169  # stop after roughly this many images per keyword
nthread = 8      # number of download threads

names = name_read()
# names = ["王力宏", "高清壁纸"]
for name in names:
    name_write(name)
    # Percent-encode the keyword's UTF-8 bytes for the query string.
    key_word = repr(name.encode('UTF-8')).replace('\\x', '%').upper()[2:-1]
    dest_folder = sys.path[0] + '/' + name
    if os.path.exists(dest_folder):
        print("文件夹已经存在")  # folder already exists; skip this keyword
        continue
    os.makedirs(dest_folder)
    os.chdir(dest_folder)
    pn = 0
    cnt = 0
    CONSTANT = 0
    downloaded = set()  # de-duplicate image URLs across pages
    while cnt < num_image:
        print("Page %d:" % (pn + 1))
        image_urls = []
        try:
            url = ("http://images.baidu.com/search/flip?tn=baiduimage"
                   "&ie=utf-8&word=%s&pn=%d&gsm=0" % (key_word, pn * 15))
            html_content = get_html(url)
            temp_urls = get_image_urls(html_content)
            for i in temp_urls:
                if i not in downloaded:
                    downloaded.add(i)
                    image_urls.append(i)
            mass_download(image_urls, nthread)
        except KeyboardInterrupt:
            exit()
        except Exception:
            pass
        pn += 1
        cnt += len(image_urls)
        if pn > 30:  # safety cap: never scan more than 30 pages per keyword
            print("done")
            break
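Two notes on the script above.

First, the key_word line hand-rolls percent-encoding: repr() renders the keyword's UTF-8 bytes as \xe7-style escapes, which the replace/upper/slice chain rewrites into %E7-style sequences. The standard library's urllib.parse.quote does the same job directly; the minimal sketch below checks the equivalence (the example keyword is illustrative, not from the original script). One caveat: .upper() also uppercases any plain ASCII letters in the keyword, which quote correctly leaves alone, so the two only agree for fully non-ASCII keywords.

    from urllib.parse import quote

    name = "王力宏"  # illustrative keyword
    hand_rolled = repr(name.encode('UTF-8')).replace('\\x', '%').upper()[2:-1]
    print(hand_rolled)   # %E7%8E%8B%E5%8A%9B%E5%AE%8F
    print(quote(name))   # %E7%8E%8B%E5%8A%9B%E5%AE%8F
    assert hand_rolled == quote(name)

Second, multiprocessing.dummy.Pool is a thread pool, so the CONSTANT += 1 / CONSTANT -= 1 pair in download() races: two threads can read the same counter value, produce the same file name, and overwrite each other's images. A minimal sketch of one fix, guarding the counter with a lock (counter_lock and next_file_name are hypothetical names, not part of the original script):

    import threading

    counter_lock = threading.Lock()
    CONSTANT = 0

    def next_file_name():
        """Hand out unique file names even when called from several threads."""
        global CONSTANT
        with counter_lock:  # serialize the read-modify-write on the counter
            CONSTANT += 1
            return str(CONSTANT) + ".jpg"

download() would then call next_file_name() once per attempt instead of touching CONSTANT directly, at the cost of gaps in the numbering when a download fails.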