Java实现爬取网页数据:PhantomJS+Webdriver

来源:互联网 发布:ubuntu lamp环境搭建 编辑:程序博客网 时间:2024/06/05 05:15

   本文根据工作中爬取数据的实际需要整理而来。最初我使用了HttpClient+Jsoup,然而这种最简单的方式只能采集普通的静态页面数据,以及在浏览器F12调试窗口中可以看到URL的接口数据;对于需要模仿浏览器行为(比如点击事件)的页面,或者采用JS框架重新渲染布局的页面,就无能为力了。因此,针对此类情况,经过一番摸索,最终得到了下面这种相对较好的实践方式。废话不多说,来看一个具体实践:抓取 https://www.sosobtc.com/ 网页上的数据。


第一步:创建Maven工程:mycrawler

第二步:导入Maven依赖:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.szzc.crawler</groupId>
  <artifactId>mycrawler</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- The Java sources contain non-ASCII comments; pin the encoding so the build
         does not depend on the platform default. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.3.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.3.5</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>2.53.0</version>
    </dependency>
    <!-- Version and exclusions for operadriver come from dependencyManagement below. -->
    <dependency>
      <groupId>com.opera</groupId>
      <artifactId>operadriver</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-exec</artifactId>
      <version>1.3</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>com.github.detro</groupId>
      <artifactId>phantomjsdriver</artifactId>
      <version>1.2.0</version>
    </dependency>
  </dependencies>

  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>com.opera</groupId>
        <artifactId>operadriver</artifactId>
        <version>0.16</version>
        <exclusions>
          <exclusion>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-remote-driver</artifactId>
          </exclusion>
        </exclusions>
      </dependency>
    </dependencies>
  </dependencyManagement>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.3</version>
        <!-- Without an explicit level this plugin version compiles with -source 1.5,
             which rejects the diamond operator (new HashMap<>()) used by the crawler
             classes. -->
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

第三步:封装的实体类CoinData:

package com.szzc;public class CoinData {private Integer rowId;private String marketName;//交易市场private String CurrentPrice;//最新价格private String platformPrice;//平台价格private String highestPrice;//最高价private String lowestPrice;//最低价private String upsAndDowns;//涨跌private String increment;//涨幅private String trading;//成交量public Integer getRowId() {return rowId;}public void setRowId(Integer rowId) {this.rowId = rowId;}public String getMarketName() {return marketName;}public void setMarketName(String marketName) {this.marketName = marketName;}public String getCurrentPrice() {return CurrentPrice;}public void setCurrentPrice(String currentPrice) {CurrentPrice = currentPrice;}public String getPlatformPrice() {return platformPrice;}public void setPlatformPrice(String platformPrice) {this.platformPrice = platformPrice;}public String getHighestPrice() {return highestPrice;}public void setHighestPrice(String highestPrice) {this.highestPrice = highestPrice;}public String getLowestPrice() {return lowestPrice;}public void setLowestPrice(String lowestPrice) {this.lowestPrice = lowestPrice;}public String getUpsAndDowns() {return upsAndDowns;}public void setUpsAndDowns(String upsAndDowns) {this.upsAndDowns = upsAndDowns;}public String getIncrement() {return increment;}public void setIncrement(String increment) {this.increment = increment;}public String getTrading() {return trading;}public void setTrading(String trading) {this.trading = trading;}@Overridepublic String toString() {return "CoinData [rowId=" + rowId + ", marketName=" + marketName + ", CurrentPrice=" + CurrentPrice+ ", platformPrice=" + platformPrice + ", highestPrice=" + highestPrice + ", lowestPrice=" + lowestPrice+ ", upsAndDowns=" + upsAndDowns + ", increment=" + increment + ", trading=" + trading + "]";}}
第四步:抓取数据的Main方法所在的类:

package com.szzc;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

/**
 * Opens https://www.sosobtc.com/ in Chrome, clicks through the four coin tabs
 * (btc, ltc, eth, etc) and prints every table row found under each tab.
 */
public class FirstTest {

    public static final String TR = "tr";
    public static final String TD = "td";

    /**
     * Retained only so existing references keep compiling; its value stays at 1.
     * Row numbering now uses a counter local to {@link #getCoidData}, so a failed
     * scrape can no longer leave a stale value behind for the next call.
     */
    public static Integer ROWID = 1;

    /** Number of cells read from each row (indices 0 to 7). */
    private static final int COLUMN_COUNT = 8;

    /** Element ids of the containers holding each coin's table, in tab order. */
    private static final String[] tableDiv = {
        "default_market_tabs-pane-btc",
        "default_market_tabs-pane-ltc",
        "default_market_tabs-pane-eth",
        "default_market_tabs-pane-etc",
    };

    /** Element ids of the tabs that switch between coins, in the same order as tableDiv. */
    private static final String[] liIds = {
        "default_market_tabs-tab-btc",
        "default_market_tabs-tab-ltc",
        "default_market_tabs-tab-eth",
        "default_market_tabs-tab-etc",
    };

    /**
     * Launches Chrome, scrapes the four coin tables and prints each row to standard output.
     *
     * @param args unused
     * @throws Exception if the browser cannot be started, an element is missing, or the
     *         thread is interrupted while waiting
     */
    public static void main(String[] args) throws Exception {
        // Honour -Dwebdriver.chrome.driver=... when supplied; otherwise use the original default.
        if (System.getProperty("webdriver.chrome.driver") == null) {
            System.setProperty("webdriver.chrome.driver", "D:/Google/chromedriver.exe");
        }
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--start-maximized", "allow-running-insecure-content", "--test-type");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get("https://www.sosobtc.com/");
            // Give the page time to finish its initial rendering.
            Thread.sleep(5000);

            // LinkedHashMap so the coins are printed in the order they were scraped.
            Map<String, List<CoinData>> data = new LinkedHashMap<>();
            String[] coinName = {"btc", "ltc", "eth", "etc"};
            for (int i = 0; i < liIds.length; i++) {
                data.put(coinName[i], getCoidData(driver, liIds[i], tableDiv[i]));
            }

            for (List<CoinData> rows : data.values()) {
                for (CoinData coinData : rows) {
                    System.out.println(coinData);
                }
            }
        } finally {
            // Always close the browser, even when scraping fails part-way through.
            driver.quit();
        }
    }

    /**
     * Clicks one coin tab and reads the rows of the table shown for it.
     *
     * <p>Rows containing fewer than eight {@code td} cells are skipped. Row ids start at 1
     * for every call.
     *
     * @param driver browser session that already has the page loaded
     * @param liId id of the tab element that switches the displayed table
     * @param id id of the element containing the table for that tab
     * @return one {@link CoinData} per usable row, in document order; never {@code null}
     * @throws Exception if an element cannot be found or the thread is interrupted while waiting
     */
    public static List<CoinData> getCoidData(WebDriver driver, String liId, String id) throws Exception {
        // Switch to the requested coin's tab.
        driver.findElement(By.id(liId)).click();
        // Give the table time to update after the click.
        Thread.sleep(500L);

        WebElement div = driver.findElement(By.id(id));
        List<WebElement> trs = div.findElements(By.tagName(TR));

        List<CoinData> coinDataList = new ArrayList<>();
        int rowId = 1;
        for (WebElement tr : trs) {
            List<WebElement> tds = tr.findElements(By.tagName(TD));
            // Eight cells are read below; skip rows that cannot supply all of them.
            if (tds == null || tds.size() < COLUMN_COUNT) {
                continue;
            }
            CoinData coinData = new CoinData();
            coinData.setRowId(rowId++);
            coinData.setMarketName(tds.get(0).getText());
            coinData.setCurrentPrice(tds.get(1).getText());
            coinData.setPlatformPrice(tds.get(2).getText());
            coinData.setHighestPrice(tds.get(3).getText());
            coinData.setLowestPrice(tds.get(4).getText());
            coinData.setUpsAndDowns(tds.get(5).getText());
            coinData.setIncrement(tds.get(6).getText());
            coinData.setTrading(tds.get(7).getText());
            coinDataList.add(coinData);
        }
        return coinDataList;
    }
}


现在可以抓取到页面的不同标签下的数据了。但是不完美的是,每次运行程序还会有一个浏览器的窗口弹出来比较讨厌。我们可以使用PhantomJS来实现无界面的浏览器效果:

实现抓取的Main函数所在的类:

package com.szzc;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Headless variant of the crawler: drives https://www.sosobtc.com/ through PhantomJS,
 * clicks through the four coin tabs (btc, ltc, eth, etc) and prints every table row
 * found under each tab.
 */
public class SecondTest {

    public static final String TR = "tr";
    public static final String TD = "td";

    /**
     * Retained only so existing references keep compiling; its value stays at 1.
     * Row numbering now uses a counter local to {@link #getCoidData}, so a failed
     * scrape can no longer leave a stale value behind for the next call.
     */
    public static Integer ROWID = 1;

    /** Number of cells read from each row (indices 0 to 7). */
    private static final int COLUMN_COUNT = 8;

    /** Element ids of the containers holding each coin's table, in tab order. */
    private static final String[] tableDiv = {
        "default_market_tabs-pane-btc",
        "default_market_tabs-pane-ltc",
        "default_market_tabs-pane-eth",
        "default_market_tabs-pane-etc",
    };

    /** Element ids of the tabs that switch between coins, in the same order as tableDiv. */
    private static final String[] liIds = {
        "default_market_tabs-tab-btc",
        "default_market_tabs-tab-ltc",
        "default_market_tabs-tab-eth",
        "default_market_tabs-tab-etc",
    };

    /**
     * Starts PhantomJS, scrapes the four coin tables and prints each row to standard output.
     *
     * @param args unused
     * @throws Exception if the browser cannot be started, an element is missing, or the
     *         thread is interrupted while waiting
     */
    public static void main(String[] args) throws Exception {
        // Honour -Dphantomjs.binary.path=... when supplied; otherwise pick a per-OS default.
        // Previously both paths were set back to back, so the Windows one always won.
        if (System.getProperty("phantomjs.binary.path") == null) {
            System.setProperty("phantomjs.binary.path", defaultPhantomJsBinary());
        }
        // Further capabilities (for example browser header settings) can be set here.
        DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();
        WebDriver driver = new PhantomJSDriver(desiredCapabilities);
        try {
            driver.get("https://www.sosobtc.com/");
            // Give the page time to finish its initial rendering.
            Thread.sleep(5000);

            // LinkedHashMap so the coins are printed in the order they were scraped.
            Map<String, List<CoinData>> data = new LinkedHashMap<>();
            String[] coinName = {"btc", "ltc", "eth", "etc"};
            for (int i = 0; i < liIds.length; i++) {
                data.put(coinName[i], getCoidData(driver, liIds[i], tableDiv[i]));
            }

            for (List<CoinData> rows : data.values()) {
                for (CoinData coinData : rows) {
                    System.out.println(coinData);
                }
            }
        } finally {
            // Always shut the browser process down, even when scraping fails part-way through.
            driver.quit();
        }
    }

    /** Returns the bundled Windows binary on Windows and /usr/bin/phantomjs elsewhere. */
    private static String defaultPhantomJsBinary() {
        String osName = System.getProperty("os.name", "").toLowerCase(Locale.ROOT);
        return osName.startsWith("windows")
                ? "./phantomjs/win/phantomjs.exe"
                : "/usr/bin/phantomjs";
    }

    /**
     * Clicks one coin tab and reads the rows of the table shown for it.
     *
     * <p>Rows containing fewer than eight {@code td} cells are skipped. Row ids start at 1
     * for every call.
     *
     * @param driver browser session that already has the page loaded
     * @param liId id of the tab element that switches the displayed table
     * @param id id of the element containing the table for that tab
     * @return one {@link CoinData} per usable row, in document order; never {@code null}
     * @throws Exception if an element cannot be found or the thread is interrupted while waiting
     */
    public static List<CoinData> getCoidData(WebDriver driver, String liId, String id) throws Exception {
        // Switch to the requested coin's tab.
        driver.findElement(By.id(liId)).click();
        // Give the table time to update after the click.
        Thread.sleep(500L);

        WebElement div = driver.findElement(By.id(id));
        List<WebElement> trs = div.findElements(By.tagName(TR));

        List<CoinData> coinDataList = new ArrayList<>();
        int rowId = 1;
        for (WebElement tr : trs) {
            List<WebElement> tds = tr.findElements(By.tagName(TD));
            // Eight cells are read below; skip rows that cannot supply all of them.
            if (tds == null || tds.size() < COLUMN_COUNT) {
                continue;
            }
            CoinData coinData = new CoinData();
            coinData.setRowId(rowId++);
            coinData.setMarketName(tds.get(0).getText());
            coinData.setCurrentPrice(tds.get(1).getText());
            coinData.setPlatformPrice(tds.get(2).getText());
            coinData.setHighestPrice(tds.get(3).getText());
            coinData.setLowestPrice(tds.get(4).getText());
            coinData.setUpsAndDowns(tds.get(5).getText());
            coinData.setIncrement(tds.get(6).getText());
            coinData.setTrading(tds.get(7).getText());
            coinDataList.add(coinData);
        }
        return coinDataList;
    }
}

至此我们已经可以完美地模仿一个浏览器的行为,来简单抓取一些网页的数据了。


原创粉丝点击