Java实现爬取网页数据:PhantomJS+Webdriver
来源:互联网 发布:ubuntu lamp环境搭建 编辑:程序博客网 时间:2024/06/05 05:15
本文根据工作中爬取数据需要所做工作整理而来。最初我使用了HttpClient+Jsoup,然而这种最简单的方式只能采集普通的静态页面数据,以及暴露在浏览器F12调试窗口中的可见URL所返回的数据;对于一些需要模仿浏览器行为(比如点击事件)的页面,或者采用了JS框架进行重新布局的页面,就无能为力了。因此,对于此类情况,最后经过摸索,得到了这个比较好一点的实践方式。下面废话不多说,来一个具体实践:抓取 https://www.sosobtc.com/ 网页上的数据。
第一步:创建Maven工程:mycrawler
第二步:导入Maven依赖:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.szzc.crawler</groupId>
    <artifactId>mycrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <!-- The Java sources contain non-ASCII comments, so pin the encoding
             instead of relying on the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.3.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>2.53.0</version>
        </dependency>
        <!-- Version and exclusions for operadriver come from dependencyManagement below. -->
        <dependency>
            <groupId>com.opera</groupId>
            <artifactId>operadriver</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-exec</artifactId>
            <version>1.3</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>com.github.detro</groupId>
            <artifactId>phantomjsdriver</artifactId>
            <version>1.2.0</version>
        </dependency>
    </dependencies>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.opera</groupId>
                <artifactId>operadriver</artifactId>
                <version>0.16</version>
                <exclusions>
                    <exclusion>
                        <groupId>org.seleniumhq.selenium</groupId>
                        <artifactId>selenium-remote-driver</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <!-- The crawler classes use the diamond operator (new HashMap<>()),
                     which needs source level 1.7 or later; this plugin version
                     defaults to 1.5 when nothing is configured. -->
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
第三步:封装的实体类CoinData:
package com.szzc;public class CoinData {private Integer rowId;private String marketName;//交易市场private String CurrentPrice;//最新价格private String platformPrice;//平台价格private String highestPrice;//最高价private String lowestPrice;//最低价private String upsAndDowns;//涨跌private String increment;//涨幅private String trading;//成交量public Integer getRowId() {return rowId;}public void setRowId(Integer rowId) {this.rowId = rowId;}public String getMarketName() {return marketName;}public void setMarketName(String marketName) {this.marketName = marketName;}public String getCurrentPrice() {return CurrentPrice;}public void setCurrentPrice(String currentPrice) {CurrentPrice = currentPrice;}public String getPlatformPrice() {return platformPrice;}public void setPlatformPrice(String platformPrice) {this.platformPrice = platformPrice;}public String getHighestPrice() {return highestPrice;}public void setHighestPrice(String highestPrice) {this.highestPrice = highestPrice;}public String getLowestPrice() {return lowestPrice;}public void setLowestPrice(String lowestPrice) {this.lowestPrice = lowestPrice;}public String getUpsAndDowns() {return upsAndDowns;}public void setUpsAndDowns(String upsAndDowns) {this.upsAndDowns = upsAndDowns;}public String getIncrement() {return increment;}public void setIncrement(String increment) {this.increment = increment;}public String getTrading() {return trading;}public void setTrading(String trading) {this.trading = trading;}@Overridepublic String toString() {return "CoinData [rowId=" + rowId + ", marketName=" + marketName + ", CurrentPrice=" + CurrentPrice+ ", platformPrice=" + platformPrice + ", highestPrice=" + highestPrice + ", lowestPrice=" + lowestPrice+ ", upsAndDowns=" + upsAndDowns + ", increment=" + increment + ", trading=" + trading + "]";}}第四步:抓取数据的Main方法所在的类:
package com.szzc;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

/**
 * Opens https://www.sosobtc.com/ in Chrome, clicks through the four coin tabs
 * (btc, ltc, eth, etc) and prints every table row as a {@link CoinData}.
 */
public class FirstTest {

    public static final String TR = "tr";
    public static final String TD = "td";

    /** Running row counter used while one table is being read; reset to 1 afterwards. */
    public static Integer ROWID = 1;

    /** Number of cells a row must have before it is mapped onto a {@link CoinData}. */
    private static final int COLUMN_COUNT = 8;

    /** Ids of the divs holding each coin's table, index-aligned with {@link #liIds}. */
    private static final String[] tableDiv = {
        "default_market_tabs-pane-btc",
        "default_market_tabs-pane-ltc",
        "default_market_tabs-pane-eth",
        "default_market_tabs-pane-etc",
    };

    /** Ids of the li tabs that switch the visible table, index-aligned with {@link #tableDiv}. */
    private static final String[] liIds = {
        "default_market_tabs-tab-btc",
        "default_market_tabs-tab-ltc",
        "default_market_tabs-tab-eth",
        "default_market_tabs-tab-etc",
    };

    /**
     * Launches Chrome, scrapes the four coin tables and prints the rows.
     *
     * @param args unused
     * @throws Exception if the browser cannot be driven or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Point Selenium at the Chrome driver binary and start the browser.
        System.setProperty("webdriver.chrome.driver", "D:/Google/chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--start-maximized", "allow-running-insecure-content", "--test-type");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get("https://www.sosobtc.com/");
            // Give the page time to finish its initial rendering.
            Thread.sleep(5000);

            // LinkedHashMap keeps the coins in the order they were scraped, so
            // the printed output follows the tab order.
            Map<String, List<CoinData>> data = new LinkedHashMap<>();
            String[] coinName = {"btc", "ltc", "eth", "etc"};
            for (int i = 0; i < liIds.length; i++) {
                data.put(coinName[i], getCoidData(driver, liIds[i], tableDiv[i]));
            }

            for (List<CoinData> rows : data.values()) {
                for (CoinData coinData : rows) {
                    System.out.println(coinData);
                }
            }
        } finally {
            // Always close the browser, even when scraping fails part-way.
            driver.quit();
        }
    }

    /**
     * Clicks one coin tab and reads the rows of the table that it reveals.
     *
     * @param driver the browser session to drive
     * @param liId id of the li tab that switches the data table
     * @param id id of the div that holds the data table
     * @return one {@link CoinData} per table row that has at least eight cells
     * @throws Exception if an element cannot be found or the wait is interrupted
     */
    public static List<CoinData> getCoidData(WebDriver driver, String liId, String id) throws Exception {
        // Switch to the requested coin and give the table time to refresh.
        driver.findElement(By.id(liId)).click();
        Thread.sleep(500L);

        WebElement div = driver.findElement(By.id(id));
        List<WebElement> trs = div.findElements(By.tagName(TR));

        List<CoinData> coinDataList = new ArrayList<>();
        try {
            for (WebElement tr : trs) {
                List<WebElement> tds = tr.findElements(By.tagName(TD));
                // Rows without td cells, or with fewer cells than the eight
                // read below, cannot be mapped and are skipped.
                if (tds == null || tds.size() < COLUMN_COUNT) {
                    continue;
                }
                CoinData coinData = new CoinData();
                coinData.setRowId(ROWID++);
                coinData.setMarketName(tds.get(0).getText());
                coinData.setCurrentPrice(tds.get(1).getText());
                coinData.setPlatformPrice(tds.get(2).getText());
                coinData.setHighestPrice(tds.get(3).getText());
                coinData.setLowestPrice(tds.get(4).getText());
                coinData.setUpsAndDowns(tds.get(5).getText());
                coinData.setIncrement(tds.get(6).getText());
                coinData.setTrading(tds.get(7).getText());
                coinDataList.add(coinData);
            }
        } finally {
            // Numbering restarts at 1 for the next coin, even if reading failed.
            ROWID = 1;
        }
        return coinDataList;
    }
}
第五步:改用PhantomJS(无界面浏览器)实现抓取的Main函数所在的类:
package com.szzc;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Same scrape as the Chrome variant, but driven through PhantomJS: opens
 * https://www.sosobtc.com/, clicks through the four coin tabs (btc, ltc, eth,
 * etc) and prints every table row as a {@link CoinData}.
 */
public class SecondTest {

    public static final String TR = "tr";
    public static final String TD = "td";

    /** Running row counter used while one table is being read; reset to 1 afterwards. */
    public static Integer ROWID = 1;

    /** System property that tells the PhantomJS driver where its executable is. */
    private static final String PHANTOMJS_PATH_PROPERTY = "phantomjs.binary.path";

    /** Number of cells a row must have before it is mapped onto a {@link CoinData}. */
    private static final int COLUMN_COUNT = 8;

    /** Ids of the divs holding each coin's table, index-aligned with {@link #liIds}. */
    private static final String[] tableDiv = {
        "default_market_tabs-pane-btc",
        "default_market_tabs-pane-ltc",
        "default_market_tabs-pane-eth",
        "default_market_tabs-pane-etc",
    };

    /** Ids of the li tabs that switch the visible table, index-aligned with {@link #tableDiv}. */
    private static final String[] liIds = {
        "default_market_tabs-tab-btc",
        "default_market_tabs-tab-ltc",
        "default_market_tabs-tab-eth",
        "default_market_tabs-tab-etc",
    };

    /**
     * Starts PhantomJS, scrapes the four coin tables and prints the rows.
     *
     * @param args unused
     * @throws Exception if the browser cannot be driven or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        configurePhantomJsBinary();
        DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();
        // Further capabilities (for example browser header information) can be set here.
        WebDriver driver = new PhantomJSDriver(desiredCapabilities);
        try {
            driver.get("https://www.sosobtc.com/");
            // Give the page time to finish its initial rendering.
            Thread.sleep(5000);

            // LinkedHashMap keeps the coins in the order they were scraped, so
            // the printed output follows the tab order.
            Map<String, List<CoinData>> data = new LinkedHashMap<>();
            String[] coinName = {"btc", "ltc", "eth", "etc"};
            for (int i = 0; i < liIds.length; i++) {
                data.put(coinName[i], getCoidData(driver, liIds[i], tableDiv[i]));
            }

            for (List<CoinData> rows : data.values()) {
                for (CoinData coinData : rows) {
                    System.out.println(coinData);
                }
            }
        } finally {
            // Always shut the PhantomJS session down, even when scraping fails part-way.
            driver.quit();
        }
    }

    /**
     * Sets {@code phantomjs.binary.path} for the current operating system.
     *
     * <p>Previously both paths were assigned one after the other, so the
     * Windows path always won. A value supplied from outside (for example with
     * {@code -Dphantomjs.binary.path=...}) is left untouched.
     */
    private static void configurePhantomJsBinary() {
        if (System.getProperty(PHANTOMJS_PATH_PROPERTY) != null) {
            return;
        }
        String osName = System.getProperty("os.name", "").toLowerCase(Locale.ROOT);
        String binaryPath = osName.startsWith("windows")
                ? "./phantomjs/win/phantomjs.exe"
                : "/usr/bin/phantomjs";
        System.setProperty(PHANTOMJS_PATH_PROPERTY, binaryPath);
    }

    /**
     * Clicks one coin tab and reads the rows of the table that it reveals.
     *
     * @param driver the browser session to drive
     * @param liId id of the li tab that switches the data table
     * @param id id of the div that holds the data table
     * @return one {@link CoinData} per table row that has at least eight cells
     * @throws Exception if an element cannot be found or the wait is interrupted
     */
    public static List<CoinData> getCoidData(WebDriver driver, String liId, String id) throws Exception {
        // Switch to the requested coin and give the table time to refresh.
        driver.findElement(By.id(liId)).click();
        Thread.sleep(500L);

        WebElement div = driver.findElement(By.id(id));
        List<WebElement> trs = div.findElements(By.tagName(TR));

        List<CoinData> coinDataList = new ArrayList<>();
        try {
            for (WebElement tr : trs) {
                List<WebElement> tds = tr.findElements(By.tagName(TD));
                // Rows without td cells, or with fewer cells than the eight
                // read below, cannot be mapped and are skipped.
                if (tds == null || tds.size() < COLUMN_COUNT) {
                    continue;
                }
                CoinData coinData = new CoinData();
                coinData.setRowId(ROWID++);
                coinData.setMarketName(tds.get(0).getText());
                coinData.setCurrentPrice(tds.get(1).getText());
                coinData.setPlatformPrice(tds.get(2).getText());
                coinData.setHighestPrice(tds.get(3).getText());
                coinData.setLowestPrice(tds.get(4).getText());
                coinData.setUpsAndDowns(tds.get(5).getText());
                coinData.setIncrement(tds.get(6).getText());
                coinData.setTrading(tds.get(7).getText());
                coinDataList.add(coinData);
            }
        } finally {
            // Numbering restarts at 1 for the next coin, even if reading failed.
            ROWID = 1;
        }
        return coinDataList;
    }
}
至此,我们已经可以完美地模仿一个浏览器的行为,来简单抓取一些网页的数据了。
阅读全文
0 0
- Java实现爬取网页数据:PhantomJS+Webdriver
- Scrapy+phantomjs爬取动态网页数据
- selenium+ phantomjs实现动态网页爬取
- 使用 phantomjs 异步爬取 ajax 网页数据
- selenium+ Phantomjs爬取动态网页
- 爬虫phantomjs爬取网页中文乱码
- 使用phantomjs+java 爬取AJAX页面
- 动态网页爬取例子(WebCollector+selenium+phantomjs)
- Node.js 动态网页爬取 PhantomJS 使用入门
- 基于Python3的phantomJs+Selenium动态网页爬取技术
- python+selenium+PhantomJS爬取网页动态加载内容
- Java PhantomJs下载网页
- java Jsoup 爬取网页数据
- 学习用java基于webMagic+selenium+phantomjs实现爬虫Demo爬取淘宝搜索页面
- selenium+webdriver爬取动态网页介绍_python
- python webdriver简单实例:爬取网页图片
- 基于Python,Selenium和PhantomJS实现动态页面爬取
- Selenium+PhantomJS 爬取页面
- 自编STM32轻量级操作系统(二)------任务调度
- js全选和全不选的代码
- iOS在Xcode中重命名项目名称
- 06.19 MySQL数据库含义与安装(初级)
- 5-2 排座位 (25分)
- Java实现爬取网页数据:PhantomJS+Webdriver
- CentOS7 通过VNC 下安装oracle数据库提示DISPLAY not set. Please set the DISPLAY and try again.解决方法
- HDU5727(贪心)
- Boost库的安装
- 从零开始学前端1
- 银行调度系统
- layer.js源码分析
- mysql系统时间获取
- spring ThreadPoolTaskExecutor 实现线程阻塞