爬取动态的网页。

来源:互联网 发布:中昌数据什么时候开盘 编辑:程序博客网 时间:2024/05/18 00:58
# coding=utf-8
# Scrape a JavaScript-rendered news index page.
#
# The page is loaded in the headless PhantomJS browser through Selenium,
# the rendered HTML is parsed with BeautifulSoup, and the headline links
# (URL + title text) are printed.
#
# Setup notes carried over from the original script:
#   pip install selenium
#   yum install nodejs  -->  npm install -g cnpm --registry=https://registry.npm.taobao.org
#   npm install phantomjs-prebuilt@2.1.14 --ignore-scripts  ->  pip install phantomjs
import re

from selenium import webdriver
from bs4 import BeautifulSoup

# Page to scrape.
cur_url = "http://bj.news.163.com/"

# Load the PhantomJS driver. On Ubuntu, find the binary with `which phantomjs`.
# NOTE(review): newer Selenium releases no longer ship webdriver.PhantomJS;
# confirm the installed Selenium version still provides it.
driver = webdriver.PhantomJS('/usr/bin/phantomjs')
try:
    # Set the window size; better to fit the whole page in order to avoid
    # dynamically loading data (optional).
    driver.set_window_size(1280, 2400)

    # Load the page, then use page_source to get the HTML content.
    driver.get(cur_url)
    content = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails, so the
    # PhantomJS process is not left running.
    driver.quit()

print(content)

# Alternative kept from the original (disabled): pull article URLs straight
# out of the HTML with a regex instead of CSS selectors.
# data_time = re.findall('http:\/\/(.+\.){1,}163.com\/\d{2}\/\d{4}\/\d{2}\/.+\.html', content)

soup = BeautifulSoup(content, 'lxml')

# Each matched <a> element contributes one URL and one title, in page order.
links = soup.select('div.na_detail  > div.news_title > h3 > a')
urls = [link.get('href') for link in links]
news_content = [link.text for link in links]

print(urls)
for title in news_content:
    print(title)

# The original prints this count twice; kept so the output is unchanged.
print(len(news_content))
print(len(news_content))

原创粉丝点击