# Drive Firefox via Selenium to simulate a Tmall search and walk the result
# pages (slow, but avoids anti-scraping measures and CAPTCHAs).
#
# Source: reposted from the internet (programming blog), 2024/05/17 22:49
# -*- coding: utf-8 -*-
import hashlib
import random
import urllib
from time import ctime, sleep

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys

class ScrapyPages:
    """Crawl Tmall search-result pages with a Firefox webdriver.

    Starting from one search URL, it repeatedly opens a page, collects the
    product-detail links on it, follows the "next page" link, and stops when
    no further page link is found.
    """

    # Class-level defaults; __init__ shadows them with per-instance lists so
    # two crawlers never share state through these mutable class attributes.
    productLinks = []   # product-detail links accumulated across all pages
    nextLinks = []      # frontier: search-result pages still to visit

    def __init__(self):
        self.productLinks = []
        self.nextLinks = []

    def openPage(self, url):
        """Open *url* in a fresh Firefox driver and return the driver."""
        driver = webdriver.Firefox()
        driver.get(url)
        return driver

    def productHref(self, anchors):
        """Return the deduplicated product-detail hrefs among *anchors*.

        Only hrefs containing the Tmall item-detail path are kept; anchors
        without an href (get_attribute returns None) are skipped.
        """
        hrefs = []
        for anchor in anchors:
            try:
                href = anchor.get_attribute('href')
                # get_attribute yields None when the attribute is absent.
                if href is None or href in hrefs:
                    continue
                if href.find('//detail.tmall.com/item.htm?') != -1:
                    print(href)
                    hrefs.append(href)
            except AttributeError:
                pass
            except StaleElementReferenceException:
                # Element was detached from the DOM while we read it; skip.
                pass
        print(len(hrefs))
        return hrefs

    def nextHref(self, anchors):
        """Return the hrefs of *anchors* (the pager's "next page" links)."""
        hrefs = []
        for anchor in anchors:
            try:
                href = anchor.get_attribute("href")
                if href is None:
                    continue
                print(href)
                hrefs.append(href)
            except AttributeError:
                pass
            except StaleElementReferenceException:
                pass
        return hrefs

    def scrapy(self, initURL):
        """Entry point: crawl page by page starting from *initURL*.

        Runs until the frontier of "next page" links is exhausted.  Product
        links are accumulated in self.productLinks.
        """
        self.nextLinks.append(initURL)
        while self.nextLinks:
            link = self.nextLinks.pop()
            driver = self.openPage(link)
            try:
                page_anchors = driver.find_elements_by_tag_name("a")
                next_anchors = driver.find_elements_by_class_name("ui-page-next")
                # Accumulate product links (the original overwrote the list on
                # every page, keeping only the last page's results).
                self.productLinks += self.productHref(page_anchors)
                self.nextLinks = self.nextHref(next_anchors)
            finally:
                # Close the browser even if scraping the page raised.
                driver.quit()
            print('ok %s' % ctime())
            
if __name__ == '__main__':
    # Build the Tmall search URL from a user-supplied keyword.
    base_url = 'http://list.tmall.com/search_product.htm?'
    keyword = raw_input('please input key')
    # Tmall's search endpoint expects the query GBK-encoded and then
    # percent-escaped (terminal input arrives as UTF-8 bytes).
    keyword = urllib.quote(keyword.decode('utf-8').encode('gbk'))
    search_url = (base_url + 'q=' + keyword
                  + '&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter')
    crawler = ScrapyPages()
    crawler.scrapy(search_url)