Java之网络爬虫WebCollector+selenium+phantomjs(三)

来源:互联网 发布:wps mac 编辑:程序博客网 时间:2024/06/05 12:49

经过前面两篇《Java之网络爬虫WebCollector+selenium+phantomjs(一)》与《Java之网络爬虫WebCollector+selenium+phantomjs(二)》的学习后,我们来做一个小例子。我们所要做的东西为:爬取京东列表页面,在页面上抽取出商品信息(名称、价格、评价),然后打印出抽取的商品信息。

贴出代码:

Goods.java

/* * Copyright (C) 2015 zhao * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. */package com.zhao.crawler;/** *商品信息 * * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</> * @date 2015-10-21 */public class Goods {private String platform;private String url;private String name;private Float price;private Integer commit;public Goods(){}public String getPlatform() {return platform;}public void setPlatform(String platform) {this.platform = platform;}public String getName() {return name;}public void setName(String name) {this.name = name;}public String getUrl() {return url;}public void setUrl(String url) {this.url = url;}public Float getPrice() {return price;}public void setPrice(Float price) {this.price = price;}public Integer getCommit() {return commit;}public void setCommit(Integer commit) {this.commit = commit;}@Overridepublic String toString() {return "{platform="+platform+",url=" + url + ",name=" + name + ",price="+ price + ",commit=" + commit + "}";}}

上面类为封装的商品信息。

ECCrawler.java

/* * Copyright (C) 2015 zhao * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. */package com.zhao.crawler;import java.util.concurrent.atomic.AtomicInteger;import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;import cn.edu.hfut.dmic.webcollector.model.Links;import cn.edu.hfut.dmic.webcollector.model.Page;import cn.edu.hfut.dmic.webcollector.net.HttpRequest;import cn.edu.hfut.dmic.webcollector.net.HttpResponse;import cn.edu.hfut.dmic.webcollector.util.RegexRule;/** *电商平台爬虫 * * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</> * @date 2015-10-20 */public abstract class ECCrawler extends DeepCrawler {private String seedFormat;//种子格式化 protected RegexRule regexRule;public RegexRule getRegexRule() {return regexRule;}public void setRegexRule(RegexRule regexRule) {this.regexRule = regexRule;}public void addRegex(String urlRegex) {this.regexRule.addRule(urlRegex);}public ECCrawler(String crawlPath,String seedFormat ){super(crawlPath);this.seedFormat=seedFormat;this.regexRule=new RegexRule();}/*用一个自增id来生成唯一文件名*/    AtomicInteger id=new AtomicInteger(0);@Overridepublic Links visitAndGetNextLinks(Page page) {Links nextLinks = new Links();String conteType = page.getResponse().getContentType();if (conteType != null && conteType.contains("text/html")) {org.jsoup.nodes.Document doc = page.getDoc();if (doc != 
null)nextLinks.addAllFromDocument(page.getDoc(), regexRule);}try {visit(page, nextLinks);} catch (Exception ex) {LOG.info("Exception", ex);}return nextLinks;}@Overridepublic void start(int depth) throws Exception {addSeed();super.start(depth);}/** * add seed * * @throws Exception */private void addSeed() throws Exception{int totalPage=getTotalPage(getPage(getSeed(seedFormat, 1)));for(int page=1;page<=totalPage;page++){this.addSeed(getSeed(seedFormat, page));}}/** * 根据url获取Page实例 * * @param url * @return * @throws Exception */private Page getPage(String url) throws Exception {HttpRequest httpRequest = new HttpRequest(url);HttpResponse response = httpRequest.getResponse();Page page = new Page();page.setUrl(url);page.setHtml(response.getHtmlByCharsetDetect());page.setResponse(response);return page;}/** *获取查询商品总页数 * * @return */public abstract int getTotalPage(Page page);/** * 获取seed url * * @param seedFormat * @param page * @return */public String getSeed(String seedFormat,Object ... page){return String.format(seedFormat, page);}public abstract void visit(Page page, Links links);}

上面抽象类继承DeepCrawler,为爬取电商列表页的基类,爬取列表页html(包括js动态生成的html),并且可以抽取到列表页数,允许捕获所有页商品信息。

GoodsList.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler;

import java.util.ArrayList;

import cn.edu.hfut.dmic.webcollector.model.Page;

/**
 * Container for {@link Goods} entries, backed by {@link ArrayList}.
 *
 * <p>Platform-specific subclasses implement {@link #addGoods(Page)} to extract
 * the products of one page and append them to this list.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public abstract class GoodsList extends ArrayList<Goods> {

    private static final long serialVersionUID = -6935403464055289581L;

    /**
     * Extracts product information from the given page and adds it to this list.
     *
     * @param page the page to extract products from
     */
    public abstract void addGoods(Page page);
}


上面抽象类为存储商品信息的容器,继承自ArrayList,并且添加addGoods方法,用来添加商品信息到容器中。

JDCrawler.java

/* * Copyright (C) 2015 zhao * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. */package com.zhao.crawler.jd;import cn.edu.hfut.dmic.webcollector.model.Links;import cn.edu.hfut.dmic.webcollector.model.Page;import com.zhao.crawler.ECCrawler;import com.zhao.crawler.Goods;/** *JD 爬虫 * * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</> * @date 2015-10-20 */public class JDCrawler extends ECCrawler {private JDGoodsList goodsList;/** * * * @param crawlPath * @param seekFormat */public JDCrawler(String crawlPath, String seekFormat) {super(crawlPath, seekFormat);goodsList=new JDGoodsList();}@Overridepublic int getTotalPage(Page page) {//Element ele=page.getDoc().select("div#J_bottomPage").select("span.p-skip >em").first().select("b").first();//return ele==null?0:Integer.parseInt(ele.text());return 1;}@Overridepublic void visit(Page page, Links links) {System.out.println("url:"+page.getUrl()+"\tlinks size:"+links.size());goodsList.addGoods(page);}public static void main(String[] args) throws Exception {JDCrawler crawler=new JDCrawler("D:/test/crawler/jd/", "http://list.jd.com/list.html?cat=1319,1523,7052&page=%s&go=0&JL=6_0_0");crawler.setThreads(100);//抓取启动线程数crawler.start(1);//层数crawler.print();}protected void print(){for(Goods g:goodsList){System.out.println(g);}}}

继承ECCrawler,实现京东平台专属爬取类。获取页码数利用浏览器审查元素,定位到页面信息即可,为了方便测试,这里只返回1。启动时我们直接爬取种子页面,所以层数设置为1即可,具体的抽取商品信息交给了下面JDGoodsList来处理。抓取结束后,执行一遍打印函数,打印出商品信息。

JDGoodsList.java

/* * Copyright (C) 2015 zhao * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. */package com.zhao.crawler.jd;import java.util.List;import org.openqa.selenium.By;import org.openqa.selenium.WebDriver;import org.openqa.selenium.WebElement;import cn.edu.hfut.dmic.webcollector.model.Page;import com.zhao.crawler.Goods;import com.zhao.crawler.GoodsList;import com.zhao.crawler.util.PageUtils;import com.zhao.crawler.util.Platform;import com.zhao.crawler.util.Tools;/** *  *  * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</> * @date 2015-10-23 */public class JDGoodsList extends GoodsList {/** *  */private static final long serialVersionUID = -7487110223660262262L;@Overridepublic void addGoods(Page page) {WebDriver driver = null;try {driver = PageUtils.getWebDriver(page);List<WebElement> eles = driver.findElements(By.cssSelector("li.gl-item"));if (!eles.isEmpty()) {for (WebElement ele : eles) {Goods g = new Goods();g.setPlatform(Platform.JD);// 电商平台// 价格String priceStr = ele.findElement(By.className("p-price")).findElement(By.className("J_price")).findElement(By.tagName("i")).getText();if (Tools.notEmpty(priceStr)) {g.setPrice(Float.parseFloat(priceStr));} else {g.setPrice(-1f);}// 商品名g.setName(ele.findElement(By.className("p-name")).findElement(By.tagName("em")).getText());// 
商品链接g.setUrl(ele.findElement(By.className("p-name")).findElement(By.tagName("a")).getAttribute("href"));// 评价String commitStr = ele.findElement(By.className("p-commit")).findElement(By.tagName("a")).getText();if (Tools.notEmpty(commitStr)) {g.setCommit(Integer.parseInt(commitStr));} else {g.setCommit(-1);}add(g);}} else {System.out.println("else is empty");}} catch (Exception e) {e.printStackTrace();} finally {if (driver != null) {driver.quit();}}}}


PageUtils.java

/* * Copyright (C) 2015 zhao * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. */package com.zhao.crawler.util;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import org.openqa.selenium.WebDriver;import org.openqa.selenium.htmlunit.HtmlUnitDriver;import org.openqa.selenium.phantomjs.PhantomJSDriver;import com.gargoylesoftware.htmlunit.BrowserVersion;import cn.edu.hfut.dmic.webcollector.model.Page;/** *  *  * @author <a href="ls.zhaoxiangyu@gmail.com">zhao</> * @date 2015-10-22 */public class PageUtils {/** * 获取webcollector 自带 htmlUnitDriver实例(模拟默认浏览器) * * @param page * @return */public static HtmlUnitDriver getDriver(Page page) {HtmlUnitDriver driver = new HtmlUnitDriver();driver.setJavascriptEnabled(true);driver.get(page.getUrl());return driver;}/** * 获取webcollector 自带htmlUnitDriver实例  * * @param page * @param browserVersion 模拟浏览器 * @return */public static HtmlUnitDriver getDriver(Page page,BrowserVersion browserVersion) {HtmlUnitDriver driver = new HtmlUnitDriver(browserVersion);driver.setJavascriptEnabled(true);driver.get(page.getUrl());return driver;}/** * 获取PhantomJsDriver(可以爬取js动态生成的html) * * @param page * @return */public static WebDriver getWebDriver(Page page) {//    WebDriver driver = new HtmlUnitDriver(true);    //    
System.setProperty("webdriver.chrome.driver", "D:\\Installs\\Develop\\crawling\\chromedriver.exe");//    WebDriver driver = new ChromeDriver();        System.setProperty("phantomjs.binary.path", "D:/Program Files/phantomjs-2.0.0-windows/bin/phantomjs.exe");    WebDriver driver = new PhantomJSDriver();    driver.get(page.getUrl());    //    JavascriptExecutor js = (JavascriptExecutor) driver;//    js.executeScript("function(){}");    return driver;    }/** * 直接调用原生phantomJS(即不通过selenium) * * @param page * @return */public static String getPhantomJSDriver(Page page) {    Runtime rt = Runtime.getRuntime();    Process process = null;    try {process = rt.exec("D:/Program Files/phantomjs-2.0.0-windows/bin/phantomjs.exe" + "D:/MyEclipseWorkSpace/WebCollectorDemo/src/main/resources/parser.js " +page.getUrl().trim());InputStream in = process.getInputStream();InputStreamReader reader = new InputStreamReader(in, "UTF-8");BufferedReader br = new BufferedReader(reader);StringBuffer sbf = new StringBuffer();String tmp = "";while((tmp = br.readLine())!=null){                    sbf.append(tmp);                }return sbf.toString();} catch (IOException e) {e.printStackTrace();}        return null;    }}

获取WebDriver工具类,上篇有介绍。

Platform.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.util;

/**
 * Identifiers of the supported e-commerce platforms.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public interface Platform {

    /** JD (jd.com). Interface fields are implicitly {@code public static final}. */
    String JD = "JD";
}

Tools.java

/*
 * Copyright (C) 2015 zhao
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package com.zhao.crawler.util;

import org.apache.commons.lang3.StringUtils;

/**
 * Small string helpers.
 *
 * @author <a href="mailto:ls.zhaoxiangyu@gmail.com">zhao</a>
 * @date 2015-10-23
 */
public class Tools {

    /**
     * Tells whether the string carries no usable value: it is blank according
     * to {@link StringUtils#isBlank(CharSequence)} or is exactly the text
     * {@code "null"}.
     *
     * @param str the string to test, may be {@code null}
     * @return {@code true} if the string is blank or equals {@code "null"}
     */
    public static boolean isEmpty(String str) {
        if (StringUtils.isBlank(str)) {
            return true;
        }
        return "null".equals(str);
    }

    /**
     * Negation of {@link #isEmpty(String)}.
     *
     * @param str the string to test, may be {@code null}
     * @return {@code true} if the string is neither blank nor equal to {@code "null"}
     */
    public static boolean notEmpty(String str) {
        return !isEmpty(str);
    }
}
运行程序,控制台输出结果为:


ok,成功抽取商品信息。

自此,此次学习结束。源码下载地址(免费下载):WebCollectorDemo



1 0