fitment

来源:互联网 发布:windows rundll32.exe 编辑:程序博客网 时间:2024/05/17 04:53

Part 1: fitment_ebay.py — the crawler module (Spider, ThreadCrawl, SpiderJob)

import logging
import random
import threading
import urllib.parse
import urllib.request
from queue import Queue
import pymysql  # NOTE(review): unused in this module — presumably used elsewhere; confirm before removing
from bs4 import BeautifulSoup
import time
import re
import csv


class Spider():
    """Downloads eBay "fitment" (vehicle compatibility) data for an item
    number and writes it to a per-item CSV file."""

    def randHeader(self):
        """Build an HTTP header dict with a randomly chosen User-Agent.

        Returns:
            dict: Connection / Accept / Accept-Language / User-Agent headers
            suitable for ``urllib.request.Request``.
        """
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        return {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            # random.choice is the idiomatic form of
            # head_user_agent[random.randrange(0, len(head_user_agent))]
            'User-Agent': random.choice(head_user_agent),
        }

    def getBeautifulSoup(self, ebayno):
        """Fetch the raw fitment payload for *ebayno* and parse it.

        Returns:
            BeautifulSoup: parsed response body (the payload is JSONP-like
            text; it is later searched with regexes, not tree navigation).
        """
        url_1 = 'http://frame.ebay.com/ws/eBayISAPI.dll?GetFitmentData&rand=147945705603&site=100&vs=0&req=2&cid=33706&item=' + str(
            ebayno).strip() + '&ct=20&pn=FitmentComments%7CYear%7CMake%7CModel%7CTrim%7CEngine&page=1000&cb=jQuery1709105713909136433_1479456959821&_=1479457056036'
        req = urllib.request.Request(url=url_1, headers=self.randHeader())
        # "with" closes the HTTP response; the original leaked the socket.
        with urllib.request.urlopen(req) as webpage:
            html = webpage.read()
        return BeautifulSoup(html, 'html.parser')

    def getFitment(self, ebayno):
        """Scrape the fitment table for *ebayno* into <ebayno>_fitmentcsv.csv.

        Columns: fitmentcomment, year, make, model, trim, engine, ebayno.
        Commas inside the fitment comment are replaced by ';' so they cannot
        break the column layout.
        """
        reg_year = r'"Year":\["(.*?)"\]'
        reg_make = r'"Make":\["(.*?)"\]'
        reg_model = r'"Model":\["(.*?)"\]'
        reg_comment = r'"FitmentComments":\["(.*?)"\]'
        reg_engine = r'"Engine":\["(.*?)"\]'
        reg_trim = r'"Trim":\["(.*?)"\]'
        s = str(self.getBeautifulSoup(ebayno))
        years = re.findall(reg_year, s)
        makes = re.findall(reg_make, s)
        models = re.findall(reg_model, s)
        fcomments = re.findall(reg_comment, s)
        engines = re.findall(reg_engine, s)
        trims = re.findall(reg_trim, s)
        # "with" guarantees the CSV file is closed even on error (the original
        # leaked the handle). zip() stops at the shortest list instead of
        # raising IndexError when the six field lists differ in length, which
        # the original did whenever a row was missing e.g. its comment.
        with open(str(ebayno) + "_fitmentcsv.csv", "w", newline="") as out:
            csv_writer = csv.writer(out)
            csv_writer.writerow(['fitmentcomment', 'year', 'make', 'model', 'trim', 'engine', 'ebayno'])
            for fcomment, year, make, model, trim, engine in zip(
                    fcomments, years, makes, models, trims, engines):
                csv_writer.writerow([str(fcomment).replace(",", ";"),
                                     year, make, model, trim, engine, ebayno])


class ThreadCrawl(threading.Thread):
    """Worker thread: pulls eBay item numbers off a shared queue and scrapes
    each one. A failed item is re-queued for another attempt.

    NOTE(review): an item that can never succeed is retried forever; add a
    retry cap if that becomes a problem.
    """

    def __init__(self, queue):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue      # shared work queue of eBay item numbers
        self.spider = Spider()  # each worker owns its Spider instance

    def run(self):
        while True:
            item = self.queue.get()
            try:
                self.spider.getFitment(item)
            except Exception:
                # Narrowed from a bare "except:" (which also swallowed
                # KeyboardInterrupt/SystemExit) and the failure is now
                # logged with its traceback instead of silently hidden.
                logging.exception("failed to fetch fitment for %s, re-queueing", item)
                self.queue.put(item)
            logging.info("now queue size is: %d" % self.queue.qsize())
            # Signal the queue that this work unit is finished so that
            # Queue.join() in SpiderJob.work() can eventually return.
            self.queue.task_done()


class SpiderJob():
    """Fans a list of eBay item numbers out to a pool of crawler threads."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of eBay item numbers to crawl

    def work(self):
        """Start the workers, enqueue every item, block until all are done."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # t.setDaemon(True) is deprecated since Python 3.10; daemon
            # threads die with the main thread once join() returns.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until every item reported task_done()





Part 2: the entry-point script — reads eBay item numbers from test.txt and runs the crawl

from fitment_ebay import SpiderJob  # job runner from the crawler module

if __name__ == '__main__':
    # One eBay item number per line in test.txt.
    qs = []
    with open("test.txt") as fp:
        for line in fp:
            qs.append(line.strip())
    # BUG FIX: the original looped `for i in range(len(qs))` and called
    # SpiderJob(8, qs).work() on each pass, crawling the FULL item list
    # len(qs) times over. A single job with 8 worker threads already
    # processes every item in qs exactly once.
    job = SpiderJob(8, qs)
    job.work()




原创粉丝点击