淘宝美食获取

来源:互联网 发布:sql getdate 格式化 编辑:程序博客网 时间:2024/05/17 09:01
代码思路非常清晰,值得借鉴。这里用到pyquery库解析网页,selenium库驱动浏览器,以及MongoDB数据库保存结果,具体代码如下:
# coding=utf-8
"""Crawl Taobao search results with Selenium and store them in MongoDB.

The script opens taobao.com in a headless PhantomJS browser, searches for
``KEYWORD``, walks through every result page, extracts the product fields
with PyQuery and inserts each product into ``db[MONGO_TABLE]``.

``MONGO_URL``, ``MONGO_DB``, ``MONGO_TABLE``, ``KEYWORD`` and
``SERVICE_ARGS`` are provided by the local ``config`` module.
"""
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *  # noqa: F401,F403 - supplies the MONGO_*/KEYWORD/SERVICE_ARGS names

# How many times a page interaction is attempted before giving up.
_MAX_RETRIES = 3

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Use PhantomJS so that no Chrome window is shown.
# NOTE(review): webdriver.PhantomJS is only available in older Selenium
# releases - confirm the installed Selenium version still provides it.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
# The window size must be set explicitly, otherwise no results are rendered.
browser.set_window_size(1400, 900)


def search(max_retries=_MAX_RETRIES):
    """Run the keyword search and scrape the first result page.

    Args:
        max_retries: Number of attempts made when the page times out
            (values below 1 are treated as 1).

    Returns:
        The text of the pager's "total" element (the original author notes
        it looks like "共100页").

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        print('正在搜索')
        try:
            browser.get('https://www.taobao.com')
            # The locator is passed as a single (By, selector) tuple, hence
            # the double parentheses.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#J_TSearchForm > div.search-button > button')))
            # Type the configured keyword (not the literal text 'KEYWORD').
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.total')))
            get_product()
            return total.text
        except TimeoutException:
            # Retry with a fresh page load; re-raise once attempts run out.
            if attempt == attempts:
                raise


def next_to_page(page_number, max_retries=_MAX_RETRIES):
    """Jump to result page ``page_number`` and scrape its products.

    Args:
        page_number: 1-based index of the result page to open.
        max_retries: Number of attempts made when the page times out
            (values below 1 are treated as 1).

    If every attempt times out, the page is reported and skipped so that
    the remaining pages can still be crawled.
    """
    attempts = max(1, max_retries)
    for _ in range(attempts):
        print('正在翻页', page_number)
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # Wait until the highlighted pager item shows the requested page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number)))
            get_product()
            return
        except TimeoutException:
            continue
    print('翻页失败,跳过', page_number)


def get_product():
    """Parse the currently loaded result page and save every product.

    Raises:
        TimeoutException: If no product item appears within the wait limit.
    """
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            # Element with class "img" nested under the "pic" element.
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text
            # (presumably a fixed textual suffix - confirm against the page).
            'deal_people': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into ``db[MONGO_TABLE]``.

    Saving is best-effort: a failure is reported and the crawl continues.

    Args:
        result: The product dictionary to store.
    """
    try:
        # insert_one replaces the deprecated Collection.insert.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('存到mongodb失败', result, exc)
    else:
        print('存到mongodb成功', result)


def main():
    """Search, crawl every result page, and always shut the browser down."""
    try:
        # The browser may misbehave; handle errors here so the program
        # terminates cleanly instead of crashing.
        total_text = search()
        # The pager text looks like "共100页"; only the number is needed.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('无法解析总页数: %r' % (total_text,))
        total = int(match.group(1))
        for page in range(2, total + 1):
            next_to_page(page)
    except Exception as exc:
        print('出错', exc)
    finally:
        # quit() also terminates the driver process; close() only closes
        # the current window.
        browser.quit()


if __name__ == '__main__':
    main()
MongoDB等需要配置的信息(保存在config.py中)如下,这里不需要设置总页数,直接翻到最后即可。
MONGO_URL = 'localhost'
MONGO_DB = '淘宝美食'
MONGO_TABLE = '美食'
KEYWORD = '美食'
SERVICE_ARGS = ['--load-images=false','--disk-cache=true']  # 不下载图片,并且开启缓存