项目进展:淘宝店铺抓取

来源:互联网 发布:软件paris 编辑:程序博客网 时间:2024/04/28 23:35

1.概要:

本项目旨在发现淘宝中可能存在的侵犯明星肖像权的行为,目标是获取店铺首页的图片。

淘宝店铺首页:https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile=&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306

为了缩减数据量,只处理大类别

example(女装):https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.S9RdIQ&q=%E5%A5%B3%E8%A3%85&tracelog=shopsearchnoqcat&sort=sale-desc

按照销量排行,取了前120个店铺

通过selenium+phantomjs获取页面源码(page_source),

通过re模块获取了图片的链接地址

2.代码

"""Crawl Taobao shop home pages and list the image URLs found on them.

Flow: read the hot-category links from the shop-search landing page, list the
top shops of each category sorted by sales, then open every shop's home page
in PhantomJS and extract the alicdn ``.jpg`` links from the page source.
"""
# The star import is unused by this script; it is kept from the original
# listing and placed first so the explicit imports below always win.
from tkinter import *  # noqa: F401,F403

import os
import re
import threading
import urllib
import urllib.request

from bs4 import BeautifulSoup as bs  # noqa: F401  (unused; kept from the original listing)
from selenium import webdriver

# Shop-search landing page that carries the "hot category" links.
SEARCH_INDEX_URL = (
    "https://shopsearch.taobao.com/search?app=shopsearch&q=&imgfile="
    "&commend=all&ssid=s5-e&search_type=shop&sourceId=tb.index"
    "&spm=a21bo.2017.201856-taobao-item.2&ie=utf8"
    "&initiative_id=tbindexz_20170306"
)
HOT_CATEGORY_COUNT = 12   # number of <li> category links read from the page
PAGES_PER_CATEGORY = 6    # 6 result pages ...
SHOPS_PER_PAGE = 20       # ... with an offset step of 20 -> top 120 shops

# Downloading is off by default, matching the original listing where the
# urlretrieve call was commented out; only the "download ..." line is printed.
DOWNLOAD_IMAGES = False
DOWNLOAD_DIR = r'C:\Users\Administrator\Pictures\test'

# Dots are escaped so they match literally instead of "any character".
_SHOP_URL_RE = re.compile(r'//shop\d+\.taobao\.com', re.I)
# One pattern per CDN host, applied in this order.  The scheme is an optional
# *group*; the previous "[https:]?" was a character class that captured a stray
# ":" and produced unusable "://host/..." links.
_IMG_URL_RES = tuple(
    re.compile(r'(?:https?:)?//%s\.alicdn\.com/[^\s"\'<>]*?\.jpg' % host, re.I)
    for host in ('gdp', 'img')
)

# Shared PhantomJS session used for every page load in this script.
driver = webdriver.PhantomJS()
# Absolute shop URLs collected so far (filled by get_shop_url).
store_list = []
# Number of shop pages processed; used as the prefix of image names.
total_count = 0
# Serialises access to the shared driver and to total_count.
mutex = threading.Lock()


def get_item_href():
    """Return the hot-category search URLs, each sorted by sales (descending).

    Loads the shop-search landing page in the shared driver, reads the first
    HOT_CATEGORY_COUNT category links and appends ``&sort=sale-desc`` to each.
    Every resulting URL is also printed.
    """
    # Reuse the shared session; a second PhantomJS started here was never
    # quit and leaked a browser process.
    driver.get(SEARCH_INDEX_URL)
    href_list = []
    for position in range(1, HOT_CATEGORY_COUNT + 1):
        link = driver.find_element_by_xpath(
            '//*[@id="shopsearchindex-hotcat"]/div/div/ul/li[%s]/a' % position
        )
        href = link.get_attribute('href') + '&sort=sale-desc'
        print(href)
        href_list.append(href)
    return href_list


def get_shop_url(store_list, start_url):
    """Append the shops listed under *start_url* to *store_list* in place.

    Walks PAGES_PER_CATEGORY result pages (offset parameter ``s`` in steps of
    SHOPS_PER_PAGE), extracts every ``//shopNNN.taobao.com`` link from the
    page source and appends its absolute form if it is not already present.
    """
    known = set(store_list)  # O(1) duplicate check instead of scanning the list
    for page_index in range(PAGES_PER_CATEGORY):
        driver.get(start_url + '&s=%s' % (page_index * SHOPS_PER_PAGE))
        for match in _SHOP_URL_RE.findall(driver.page_source):
            shop_url = get_total_url(match)
            if shop_url not in known:
                known.add(shop_url)
                store_list.append(shop_url)


def _find_img_urls(page_source):
    """Return all alicdn .jpg links in *page_source*: gdp host first, then img."""
    found = []
    for pattern in _IMG_URL_RES:
        found.extend(pattern.findall(page_source))
    return found


def get_img_url(shop_url):
    """Open *shop_url* and report (optionally download) every image on it.

    Images are named ``<shop index>_<image index>.jpeg``.  The global
    ``total_count`` is incremented once per successfully processed shop.
    """
    global total_count
    # "with" releases the lock even if the page load raises; a bare
    # acquire()/release() pair would leave it held and block later workers.
    with mutex:
        # Very large viewport, presumably so lazily loaded images are
        # rendered into the page source - TODO confirm.
        driver.set_window_size(25600, 14400)
        driver.get(shop_url)
        img_urls = _find_img_urls(driver.page_source)
        for count, raw_url in enumerate(img_urls):
            img_url = get_total_url(raw_url)
            store_name = "%s_%s" % (total_count, count)
            if DOWNLOAD_IMAGES:
                target = os.path.join(DOWNLOAD_DIR, store_name + ".jpeg")
                try:
                    urllib.request.urlretrieve(img_url, target)
                except (OSError, ValueError) as e:
                    # Best effort: report the failure and move on.
                    print(e)
                    continue
            print("download %s.jpeg" % store_name)
        total_count += 1


def get_total_url(url):
    """Return *url* with an ``https:`` scheme prepended when it has none.

    ``//host/path`` becomes ``https://host/path``; a value starting with a
    single ``/`` gets ``https:/`` prepended; anything else is returned as is.
    """
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return 'https:/' + url
    return url


def print_url(store_list):
    """Print every URL in *store_list* on one line, comma separated."""
    for shop_url in store_list:
        print(shop_url, end=',')


def main():
    """Crawl every hot category and process each newly discovered shop once."""
    for start_url in get_item_href():
        # Only visit shops added for this category; iterating the whole
        # accumulated list re-crawled every earlier category's shops.
        first_new = len(store_list)
        get_shop_url(store_list, start_url)
        for shop_url in store_list[first_new:]:
            print(shop_url)
            worker = threading.Thread(target=get_img_url, args=(shop_url,))
            worker.start()
            # Joined immediately: the single shared driver can only load one
            # page at a time, so shops are processed sequentially.
            worker.join()


if __name__ == "__main__":
    try:
        main()
    finally:
        # Shut down the PhantomJS process even if the crawl fails.
        driver.quit()

原创粉丝点击