携程网旅游信息爬取
来源:互联网 发布:代挂外包源码 编辑:程序博客网 时间:2024/05/16 02:51
最近手又有点闲,闲着无聊就爬取了携程的信息,这个爬虫是我随手写的,各位可以看看。
# -*- coding: utf-8 -*-
# Scrape travel-package listings from Ctrip (vacations.ctrip.com).
# 2017/8/14
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import os
import csv
import time

# One shared browser session used by every function below.
driver = webdriver.Chrome()


# Open the search page
def gethtml(place, retries=3):
    """Search Ctrip vacations for *place* and return the result page HTML.

    Args:
        place: Destination keyword typed into the search box.
        retries: Maximum number of attempts when the page times out.

    Returns:
        The page source of the browser right after submitting the search.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    last_error = None
    for _ in range(max(1, retries)):
        try:
            driver.get('http://vacations.ctrip.com/')
            # The welcome popup is not guaranteed to be shown; dismiss it
            # when present instead of reloading the page forever.
            try:
                welcome = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         "body > div.jewel_pop_box > div.jewel_pop > span")
                    )
                )
                welcome.click()
            except TimeoutException:
                pass
            # find_element(By, ...) works on both Selenium 3 and 4; the
            # find_element_by_* helpers were removed in Selenium 4.3.
            search_box = driver.find_element(
                By.CSS_SELECTOR,
                '#searchpanel > div.search_wrap > div.new_search_content > div > input',
            )
            search_button = driver.find_element(
                By.CSS_SELECTOR,
                '#searchpanel > div.search_wrap > a.main_search_btn',
            )
            search_box.send_keys(place)
            search_button.click()
            return driver.page_source
        except TimeoutException as err:
            # Retry in a loop: the original recursive retry dropped the
            # return value, so a successful retry still yielded None.
            last_error = err
    raise last_error


# Switch to another result page
def changepage(page, retries=3):
    """Jump to result page number *page* and return its HTML.

    Args:
        page: Page number typed into the pager input.
        retries: Maximum number of attempts before giving up.

    Returns:
        The page source of the browser after submitting the page number.

    Raises:
        WebDriverException: If the pager could not be operated in any
            attempt (e.g. the pager elements never appeared).
    """
    last_error = None
    for _ in range(max(1, retries)):
        try:
            # Give the previous page time to finish rendering.
            time.sleep(4)
            # Scroll down so the pager at the bottom is reachable.
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            page_input = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#ipt_page_txt'))
            )
            submit = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#ipt_page_btn'))
            )
            page_input.clear()
            page_input.send_keys(str(page))
            submit.click()
            return driver.page_source
        except WebDriverException as err:
            # Only browser errors are retried; a bare ``except`` would also
            # swallow KeyboardInterrupt and recurse without bound.
            last_error = err
    raise last_error


# Parse a result page
def checkpage(html):
    """Yield one dict per product found in a result page.

    Args:
        html: HTML source of a search result page.

    Yields:
        Dicts with the keys '产品', '链接', '价格', '评分', '人数' and '点评'.
        The last three are empty strings when the page lacks them.
    """
    soup = BeautifulSoup(html, 'lxml')
    products = soup.find_all('div', class_='main_mod product_box flag_product ')
    for product in products:
        name = product.h2.get_text()
        href = 'http:' + product.h2.a['href']
        price = product.find('span', class_='sr_price').get_text()[:-1]
        # A missing tag makes find()/attribute access return None, which
        # surfaces as AttributeError (or TypeError when slicing None).
        try:
            agree = product.find('p', class_='grade').get_text()[:-1]
        except (AttributeError, TypeError):
            agree = ''
        try:
            comment = product.find('div', class_='comment')
            people = comment.em.string[:-3]
            about = comment.a.get_text()[:-3]
        except (AttributeError, TypeError):
            people = ''
            about = ''
        yield {
            '产品': name,
            '链接': href,
            '价格': price,
            '评分': agree,
            '人数': people,
            '点评': about,
        }


# Create the output folder and CSV file
def make(place, path='D:/数据/'):
    """Create the CSV file for *place* and return a row-appending function.

    Args:
        place: Destination keyword; used as the file-name prefix.
        path: Directory the CSV file is written to (created if missing).

    Returns:
        A function ``save_to_csv(i)`` that appends one product dict, as
        produced by ``checkpage``, to the CSV file.
    """
    # The original tested ``os.path.exists`` (the function object itself),
    # which is always truthy, so the directory was never created.
    os.makedirs(path, exist_ok=True)
    filename = os.path.join(path, place + '旅游信息.csv')
    # newline='' is required by the csv module to avoid blank rows on
    # Windows; utf-8-sig keeps the Chinese text intact and lets Excel
    # detect the encoding.
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['产品', '链接', '价格', '评分', '人数', '点评'])

    def save_to_csv(i):
        """Append one product dict to the CSV file (best effort)."""
        with open(filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            try:
                writer.writerow([
                    i['产品'], i['链接'], i['价格'], i['评分'],
                    i['人数'], i['点评'],
                ])
            except (KeyError, UnicodeEncodeError, csv.Error) as err:
                # Keep crawling when one row is bad, but do not hide it.
                print('skipped row:', err)

    return save_to_csv


# Main routine
def main(place, max_page=100):
    """Crawl up to *max_page* result pages for *place* and save them to CSV.

    Args:
        place: Destination keyword to search for.
        max_page: Last result page number to visit (inclusive).
    """
    try:
        save = make(place)
        html = gethtml(place)
        for item in checkpage(html):
            save(item)
            print(item)
        for page in range(2, max_page + 1):
            try:
                html = changepage(page)
            except WebDriverException as err:
                # Most likely there are no further pages; stop paging.
                print('stopped at page', page, ':', err)
                break
            for item in checkpage(html):
                save(item)
                print(item)
    finally:
        # Always release the browser, even when crawling fails midway.
        driver.quit()


# Run the program
if __name__ == '__main__':
    place = input('请输入要查询的地点:')
    main(place)
阅读全文
1 0
- 携程网旅游信息爬取
- Python爬取百度旅游网站的景点
- Scrapy爬虫(2)爬取新浪旅游图片
- 土耳其旅游信息汇总
- 爬取招聘信息
- 爬取二手房信息
- 大学排名信息爬取
- python爬取网页信息
- HttpClient 登录爬取信息
- 安居客信息爬取
- urllib2 爬取网页信息
- python3爬取淘宝信息
- python3爬取淘宝信息!
- 爬取二手房信息v2
- python 爬取淘宝信息
- Python爬取国家信息
- 爬取12306站点信息
- 爬取环境信息实例
- CIFS
- Hinton Neural Networks课程笔记2d:为什么感知机的学习算法可以收敛
- CNN(卷积神经网络)在iOS上的使用
- 登录验证
- SQL之DDL,DCL,DML,TCL
- 携程网旅游信息爬取
- Steque
- idea添加svn
- Insertion Sort List Leetcode java
- OC -基础(七) 学习中。。。
- extern "C"在DLL导出函数时有什么作用?
- nsq源码分析backend_queue.go
- zabbix3.2监控Oracle11G数据脚本
- 如何在网页标题栏title加入logo图标?