Python并行地爬取京东页面的id以及各个id的评论

来源:互联网 发布:香港买什么最划算 知乎 编辑:程序博客网 时间:2024/06/16 05:33

1、简介

前面一篇告诉了大家如何爬取京东的评论,这次我来教大家如何大规模
并行地自动获取商品id,并爬取各商品对应的评论

2、所需模块

除了上篇博客所需模块之外,这次需要加入selenium这个模块来爬取动态页面的数据

3、代码

代码下有注释,有疑问直接在下面评论

import json
import math
import re
import threading
import urllib.parse
import urllib.request
from collections import deque
from threading import current_thread
from time import sleep

import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver


class MyThread(threading.Thread):
    """Thread that runs an arbitrary callable with positional arguments."""

    def __init__(self, funcs, args, name=''):
        threading.Thread.__init__(self)
        self.funcs = funcs
        self.name = name
        self.args = args

    def run(self):
        self.funcs(*self.args)


def getContent(que):
    """Worker: drain comment-page URLs from *que*, parse each page and
    store (comment text, sentiment label) rows into MySQL.

    que -- collections.deque of review-page URLs, shared across workers.
    """
    headers = {
        # NOTE: the original used invalid header names with spaces
        # ('Accept - Encoding'); servers ignore such headers. Fixed.
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Connection': 'Keep-Alive',
    }
    tag_re = re.compile(r'<[^>]*>')  # strips HTML tags from prettify() output
    while True:
        try:
            url = que.popleft()
        except IndexError:
            # Queue drained -- exit the worker instead of looping forever.
            break
        print('正在爬的线程是' + current_thread().name + "爬的是" + url)
        try:
            req = requests.get(url, headers=headers)
            req.encoding = 'gbk'
            soup = BeautifulSoup(req.text, 'html.parser')
            items = soup.find_all('div', class_='i-item')

            # One connection per page instead of one per comment.
            db = pymysql.connect('localhost', 'root', 'xin123456789', 'test')
            try:
                cursor = db.cursor()
                cursor.execute('set names utf8')
                for item in items:
                    dd = item.find('dd')
                    star = item.find('span', class_=re.compile(r'^sta'))
                    scores = str(''.join(star.get('class')))
                    # Map star rating to a binary sentiment label:
                    # 3-5 stars -> positive (1), 0-2 stars -> negative (0).
                    if scores in ('starsa5', 'starsa4', 'starsa3'):
                        label = 1
                    elif scores in ('starsa2', 'starsa1', 'starsa0'):
                        label = 0
                    else:
                        # Unknown class: skip instead of raising NameError
                        # (original left the label unbound here).
                        continue
                    content = tag_re.sub('', dd.prettify())
                    print(content)
                    # Parameterized query -- avoids SQL injection and quoting
                    # bugs the original '%s' string interpolation had.
                    cursor.execute(
                        "INSERT INTO newjd (comment, scores) VALUES (%s, %s)",
                        (content, label),
                    )
                    db.commit()
                cursor.close()
            finally:
                db.close()
            sleep(3)  # be polite between page fetches
        except Exception as exc:
            print('运行出错', exc)


def main():
    """Collect product SKU ids from JD search pages (rendered with
    PhantomJS), expand each SKU into its paginated review URLs, then
    scrape the reviews with 4 worker threads.
    """
    keyword = urllib.parse.quote('手机')
    driver = webdriver.PhantomJS()
    sku_ids = []
    for page in range(10):
        url = ("http://search.jd.com/Search?keyword=%s&enc=utf-8&wq=%s"
               "&pvid=cf28fafa22df407d860947dac22f33fe&page=%d"
               % (keyword, keyword, page))
        driver.get(url)
        driver.implicitly_wait(10)  # wait for the dynamic item list to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for li in soup.find_all("li", class_="gl-item"):
            sku = li.get('data-sku')
            sku_ids.append(sku)
            print(sku)

    comment_queue = deque()
    comments_per_page = 30
    for sku in sku_ids:
        summary_url = ('http://club.jd.com/ProductPageService.aspx'
                       '?method=GetCommentSummaryBySkuId&referenceId='
                       + str(sku))
        response = urllib.request.urlopen(summary_url)
        summary = json.loads(response.read().decode('utf-8'))
        # ceil(total / per_page) gives the page count; the original applied
        # ceil to the count alone and then divided, truncating the result.
        pages = int(math.ceil(summary.get('CommentCount') / comments_per_page))
        for page in range(pages):
            comment_queue.append(
                'http://club.jd.com/review/' + str(sku)
                + '-0-' + str(page) + '-0.html')

    threads = [MyThread(getContent, (comment_queue,), name='thread' + str(i))
               for i in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
原创粉丝点击