java调用百度搜索+Jsoup实现网络资源收集
来源:互联网 发布:苹果mac下载哪些软件 编辑:程序博客网 时间:2024/06/07 03:47
Jsoup核心jar包:Jsoup核心jar包下载地址
java代码:
抽象搜索资源的实体:Webpage
package com.sinosoft.lhresource.search.common;

/**
 * One search-result entry harvested from a web search engine.
 * Carries the result's title, link, abstract and (when fetched) body text.
 */
public class Webpage {

    /** Result title as shown on the result page. */
    private String title;
    /** Link of the result. */
    private String url;
    /** Short abstract / snippet shown under the title. */
    private String summary;
    /** Full extracted body text; null until explicitly fetched. */
    private String content;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getSummary() {
        return summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
通过资源链接获取资源内容:TextExtract.java;Tools.java
package com.sinosoft.lhresource.search.common;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts the main body text of an HTML page with a line-block text-density
 * heuristic: tags are stripped, the text is split into lines, and contiguous
 * "dense" regions (blocks whose character count exceeds {@link #threshold})
 * are kept as article content.
 *
 * NOTE(review): all state lives in static fields, so this class is NOT
 * thread-safe — concurrent calls to {@link #parse} corrupt each other.
 */
public class TextExtract {

    private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class);

    /** Number of consecutive lines summed into one density block. */
    private static final int BLOCKS_WIDTH = 3;

    /*
     * Density threshold. Raise it to drop blocks of unrelated headlines
     * (higher precision, lower recall); lower it to keep one-sentence
     * articles at the cost of more noise.
     */
    private static int threshold = 86;

    private static List<String> lines = new ArrayList<>();
    private static final ArrayList<Integer> indexDistribution = new ArrayList<>();
    private static final StringBuilder text = new StringBuilder();
    private static String html;
    // NOTE(review): set by parse(_, boolean) but never read — the "unknown"
    // topic-page detection promised by the javadoc was never implemented.
    private static boolean flag = false;
    private static int start;
    private static int end;

    public static void setthreshold(int value) {
        threshold = value;
    }

    /**
     * Extracts the body text, assuming the page is a content (article) page.
     *
     * @param _html raw HTML string
     * @return extracted body text
     */
    public static String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the body text.
     *
     * @param _html raw HTML string
     * @param _flag intended to toggle topic-page detection; currently unused
     * @return extracted body text
     */
    public static String parse(String _html, boolean _flag) {
        flag = _flag;
        html = _html;
        preProcess();
        LOG.debug(html);
        return getText();
    }

    /** Strips doctype, comments, scripts, styles, entities and all tags. */
    private static void preProcess() {
        html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
        html = html.replaceAll("(?is)<!--.*?-->", "");                 // remove html comment
        html = html.replaceAll("(?is)<script.*?>.*?</script>", "");    // remove javascript
        html = html.replaceAll("(?is)<style.*?>.*?</style>", "");      // remove css
        html = html.replaceAll("&.{2,5};|&#.{2,5};", " ");             // remove special char
        html = html.replaceAll("(?is)<.*?>", "");
    }

    /** Density at index k, or 0 when k is out of range (bounds-safe lookup). */
    private static int density(int k) {
        return k < indexDistribution.size() ? indexDistribution.get(k) : 0;
    }

    /** Scans the density distribution and concatenates the dense regions. */
    private static String getText() {
        lines = Arrays.asList(html.split("\n"));
        indexDistribution.clear();
        for (int i = 0; i < lines.size() - BLOCKS_WIDTH; i++) {
            int wordsNum = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                lines.set(j, lines.get(j).replaceAll("\\s+", ""));
                wordsNum += lines.get(j).length();
            }
            indexDistribution.add(wordsNum);
            LOG.debug("{}", wordsNum);
        }
        start = -1;
        end = -1;
        boolean boolstart = false, boolend = false;
        text.setLength(0);
        for (int i = 0; i < indexDistribution.size() - 1; i++) {
            if (indexDistribution.get(i) > threshold && !boolstart) {
                // BUGFIX: the original indexed i+2 and i+3 without a bounds
                // check and could throw IndexOutOfBoundsException near the
                // end of the distribution; density() treats out-of-range as 0.
                if (density(i + 1) != 0 || density(i + 2) != 0 || density(i + 3) != 0) {
                    boolstart = true;
                    start = i;
                    continue;
                }
            }
            if (boolstart) {
                if (indexDistribution.get(i) == 0 || indexDistribution.get(i + 1) == 0) {
                    end = i;
                    boolend = true;
                }
            }
            if (boolend) {
                // BUGFIX: `start + 1 + "\t\t" + end + 1` appended a literal
                // '1' after end instead of printing end + 1.
                LOG.debug("{}\t\t{}", start + 1, end + 1);
                StringBuilder tmp = new StringBuilder();
                for (int ii = start; ii <= end; ii++) {
                    if (lines.get(ii).length() < 5) {
                        continue; // drop very short lines (menus, separators)
                    }
                    tmp.append(lines.get(ii)).append("\n");
                }
                String str = tmp.toString();
                LOG.debug(str);
                // Skip footer regions such as copyright notices.
                if (str.contains("Copyright") || str.contains("版权所有")) {
                    continue;
                }
                text.append(str);
                boolstart = boolend = false;
            }
        }
        return text.toString();
    }
}
package com.sinosoft.lhresource.search.common;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Small I/O helpers: download-and-extract, stream copy, read-all. */
public class Tools {

    private static final Logger LOG = LoggerFactory.getLogger(Tools.class);

    /** Downloads the page (assumed UTF-8) and extracts its body text. */
    public static String getHTMLContent(String url) {
        return getHTMLContent(url, "utf-8");
    }

    /**
     * Downloads the page at {@code url} with the given character encoding and
     * runs {@link TextExtract#parse} over it.
     *
     * @return extracted body text, or null when download/parsing fails
     */
    public static String getHTMLContent(String url, String encoding) {
        // BUGFIX: the original never closed the reader (resource leak);
        // try-with-resources guarantees release on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(url).openStream(), encoding))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
            return TextExtract.parse(html.toString());
        } catch (Exception e) {
            LOG.debug("解析URL失败:" + url, e);
        }
        return null;
    }

    /**
     * Copies all bytes from {@code in} to {@code outFile}; closes both.
     * Failures are logged, not rethrown (best-effort, as in the original).
     */
    public static void copyFile(InputStream in, File outFile) {
        // BUGFIX: the original closed `out` twice (in try and finally);
        // try-with-resources closes it exactly once.
        try (OutputStream out = new FileOutputStream(outFile)) {
            byte[] data = readAll(in);
            out.write(data, 0, data.length);
        } catch (IOException ex) {
            LOG.error("文件操作失败", ex);
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (IOException ex) {
                LOG.error("文件操作失败", ex);
            }
        }
    }

    /** Reads the stream fully into a byte array; the stream is NOT closed. */
    public static byte[] readAll(InputStream in) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[1024];
            for (int n; (n = in.read(buffer)) > 0; ) {
                out.write(buffer, 0, n);
            }
        } catch (IOException e) {
            LOG.error("读取失败", e);
        }
        return out.toByteArray();
    }
}
自定义检索接口:Searcher.java
package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Minimal contract for a keyword search-engine client.
 */
public interface Searcher {

    /** Searches the first result page for the given keyword. */
    List<Webpage> search(String keyword);

    /** Searches the given (1-based) result page for the keyword. */
    List<Webpage> search(String keyword, int page);
}
自定义处理百度检索接口:BaiduSearcher.java
package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Baidu-specific search verticals on top of the generic {@link Searcher}
 * contract: News, Tieba (forum), Zhidao (Q&amp;A) and Wenku (documents).
 */
public interface BaiduSearcher extends Searcher {

    /** News search, first page. */
    List<Webpage> searchNews(String keyword);

    /** News search, given 1-based page. */
    List<Webpage> searchNews(String keyword, int page);

    /** Tieba (forum) search, first page. */
    List<Webpage> searchTieba(String keyword);

    /** Tieba (forum) search, given 1-based page. */
    List<Webpage> searchTieba(String keyword, int page);

    /** Zhidao (Q&amp;A) search, first page. */
    List<Webpage> searchZhidao(String keyword);

    /** Zhidao (Q&amp;A) search, given 1-based page. */
    List<Webpage> searchZhidao(String keyword, int page);

    /** Wenku (document) search, first page. */
    List<Webpage> searchWenku(String keyword);

    /** Wenku (document) search, given 1-based page. */
    List<Webpage> searchWenku(String keyword, int page);
}
package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Skeletal {@link BaiduSearcher}: each single-argument overload delegates to
 * page 1 of its paged counterpart; the paged methods throw until a subclass
 * overrides them.
 *
 * BUGFIX/idiom: the paged stubs now throw {@link UnsupportedOperationException}
 * (the standard type for unimplemented operations) instead of a raw
 * {@code RuntimeException}; being a subtype, existing catch blocks still work.
 */
public abstract class AbstractBaiduSearcher implements BaiduSearcher {

    @Override
    public List<Webpage> searchNews(String keyword) {
        return searchNews(keyword, 1);
    }

    /** @throws UnsupportedOperationException always; subclasses must override. */
    @Override
    public List<Webpage> searchNews(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    @Override
    public List<Webpage> searchTieba(String keyword) {
        return searchTieba(keyword, 1);
    }

    /** @throws UnsupportedOperationException always; subclasses must override. */
    @Override
    public List<Webpage> searchTieba(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    @Override
    public List<Webpage> searchZhidao(String keyword) {
        return searchZhidao(keyword, 1);
    }

    /** @throws UnsupportedOperationException always; subclasses must override. */
    @Override
    public List<Webpage> searchZhidao(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    @Override
    public List<Webpage> searchWenku(String keyword) {
        return searchWenku(keyword, 1);
    }

    /** @throws UnsupportedOperationException always; subclasses must override. */
    @Override
    public List<Webpage> searchWenku(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }
}
百度搜索+Jsoup实现资源收集:JSoupBaiduSearcher.java
package com.sinosoft.lhresource.search.common;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Baidu web-search client backed by Jsoup: fetches the result page for a
 * keyword and scrapes title / link / abstract out of the HTML.
 *
 * NOTE(review): the CSS selectors below are tied to Baidu's result-page
 * markup at the time of writing and will silently stop matching when that
 * markup changes.
 */
public class JSoupBaiduSearcher extends AbstractBaiduSearcher {

    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /** Results per Baidu page; also the step of the pn URL parameter. */
    private static final int PAGE_SIZE = 10;

    /** Strips every non-digit character when parsing the result count. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    @Override
    public List<Webpage> search(String keyword) {
        return search(keyword, 1);
    }

    /**
     * Scrapes one page of Baidu web-search results.
     *
     * Baidu's pn parameter is a result offset, not a page number:
     * page 1 is pn=0, page 2 is pn=10, i.e. pn = (page - 1) * PAGE_SIZE.
     *
     * @param keyword query string (URL-encoded internally)
     * @param page    1-based page number
     * @return scraped results; empty — never null — when nothing was found or
     *         the request failed. BUGFIX: the original returned null on an
     *         empty result set but an empty list on IOException, forcing
     *         callers to handle both.
     */
    @Override
    public List<Webpage> search(String keyword, int page) {
        List<Webpage> webpages = new ArrayList<>();
        try {
            // BUGFIX: the keyword was interpolated into the URL raw; spaces
            // and reserved characters corrupted the query. Encode it.
            String url = "http://www.baidu.com/s?pn=" + (page - 1) * PAGE_SIZE
                    + "&wd=" + URLEncoder.encode(keyword, "UTF-8");
            Document document = Jsoup.connect(url).get();
            int total = getBaiduSearchResultCount(document);
            if (total < 1) {
                return webpages;
            }
            // A partial last page has fewer than PAGE_SIZE items.
            int len = Math.min(total, PAGE_SIZE);
            for (int i = 0; i < len; i++) {
                // Baidu numbers each result div globally across pages.
                int resultId = i + 1 + (page - 1) * PAGE_SIZE;
                String titleCssQuery = "html body div div div div#content_left div#"
                        + resultId + ".result.c-container h3.t a";
                String summaryCssQuery = "html body div div div div#content_left div#"
                        + resultId + ".result.c-container div.c-abstract";
                LOG.debug("titleCssQuery:" + titleCssQuery);
                LOG.debug("summaryCssQuery:" + summaryCssQuery);
                Element titleElement = document.select(titleCssQuery).first();
                String href = "";
                String titleText = "";
                if (titleElement != null) {
                    titleText = titleElement.text();
                    href = titleElement.attr("href");
                } else {
                    // Fallback selectors for Baidu Baike (encyclopedia) cards.
                    titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
                    summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
                    LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);
                    LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);
                    titleElement = document.select(titleCssQuery).first();
                    if (titleElement != null) {
                        titleText = titleElement.text();
                        href = titleElement.attr("href");
                    }
                }
                LOG.debug(titleText);
                Element summaryElement = document.select(summaryCssQuery).first();
                if (summaryElement == null) {
                    // Fallback for Baidu Zhidao (Q&A) result markup.
                    summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
                    LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);
                    summaryElement = document.select(summaryCssQuery).first();
                }
                String summaryText = summaryElement == null ? "" : summaryElement.text();
                LOG.debug(summaryText);
                if (titleText != null && !"".equals(titleText.trim())
                        && summaryText != null && !"".equals(summaryText.trim())) {
                    Webpage webpage = new Webpage();
                    webpage.setTitle(titleText);
                    webpage.setUrl(href);
                    webpage.setSummary(summaryText);
                    webpages.add(webpage);
                } else {
                    LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);
                }
            }
        } catch (IOException ex) {
            LOG.error("搜索出错", ex);
        }
        return webpages;
    }

    /**
     * Parses the total hit count out of the "百度为您找到相关结果约13,200个" banner.
     *
     * @param document result-page DOM
     * @return total result count, or 0 when the banner is missing or holds no
     *         digits. BUGFIX: the original NPE'd when the ".nums" element was
     *         absent and threw NumberFormatException on digit-free text.
     */
    private int getBaiduSearchResultCount(Document document) {
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: " + cssQuery);
        Element totalElement = document.select(cssQuery).first();
        if (totalElement == null) {
            return 0;
        }
        String totalText = totalElement.text();
        LOG.info("搜索结果文本:" + totalText);
        Matcher matcher = NON_DIGITS.matcher(totalText);
        totalText = matcher.replaceAll("");
        if (totalText.isEmpty()) {
            return 0;
        }
        int total = Integer.parseInt(totalText);
        LOG.info("搜索结果数:" + total);
        return total;
    }

    /** Demo entry point: searches page 2 for "六扇门" and logs the results. */
    public static void main(String[] args) {
        int page = 2;
        Searcher searcher = new JSoupBaiduSearcher();
        List<Webpage> webpages = searcher.search("六扇门", page);
        if (webpages == null || webpages.isEmpty()) {
            LOG.error("没有搜索到结果");
            return;
        }
        // BUGFIX: the original logged a hard-coded page number 1 while
        // actually searching page 2.
        LOG.info("搜索结果 当前第 " + page + " 页,页面大小为:" + webpages.size()
                + " 共有结果数:" + webpages.size());
        int i = (page - 1) * PAGE_SIZE + 1; // global 1-based result number
        for (Webpage webpage : webpages) {
            LOG.info("搜索结果 " + (i++) + " :");
            LOG.info("标题:" + webpage.getTitle());
            LOG.info("URL:" + webpage.getUrl());
            LOG.info("摘要:" + webpage.getSummary());
            LOG.info("正文:" + webpage.getContent());
            LOG.info("");
        }
    }
}
0 0
- java调用百度搜索+Jsoup实现网络资源收集
- java jsoup调用
- 网络资源收集
- 网络资源搜索爬虫(python 3.4.1实现)
- c#调用百度搜索
- 百度搜索接口调用
- C#实现百度地图附近搜索&调用JavaScript函数
- Jsoup 与 httpClient 获取网络资源
- java天气预报调用百度接口实现
- java实现百度统计api调用
- 采集baidu搜索信息的java源代码实现(使用了htmlunit和Jsoup)
- 实现百度搜索效果
- 百度搜索下拉实现
- js实现百度搜索
- 调用百度搜索接口查询
- 调用百度搜索接口查询
- 音频技术网络资源收集
- 开源GIS网络资源收集
- 在Canvas中利用Path绘制基本图形
- 从ViewController初始化一直谈到强制横屏
- django windos环境下配置成功但无法创建文件
- ARM MMU工作原理剖析
- 浅谈卡尔曼滤波
- java调用百度搜索+Jsoup实现网络资源收集
- [转]指尖下的js ——多触式web前端开发之一:对于Touch的处理
- 类 SimpleDateFormat
- [从头读历史] 第278节 诗经 曹风
- CentOS镜像使用帮助
- 银联接口调试
- unity UGUI动态字体显示模糊
- 统计理论
- Volley框架解析