java调用百度搜索+Jsoup实现网络资源收集

来源:互联网 发布:苹果mac下载哪些软件 编辑:程序博客网 时间:2024/06/07 03:47

Jsoup核心jar包:Jsoup核心jar包下载地址
java代码:
抽象搜索资源的实体:Webpage

package com.sinosoft.lhresource.search.common;

/**
 * One search hit: its title, link, summary and (optionally) the body text of the target page.
 *
 * <p>Plain mutable bean. Every property is {@code null} until its setter is called.
 */
public class Webpage {
    /** Title of the result. */
    private String title;
    /** Link of the result. */
    private String url;
    /** Short summary / abstract of the result. */
    private String summary;
    /** Body text of the linked page. */
    private String content;

    /** @return the title, or {@code null} if none was set */
    public String getTitle() {
        return title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link, or {@code null} if none was set */
    public String getUrl() {
        return url;
    }

    /** @param url the link to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the summary, or {@code null} if none was set */
    public String getSummary() {
        return summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the body text, or {@code null} if none was set */
    public String getContent() {
        return content;
    }

    /** @param content the body text to store */
    public void setContent(String content) {
        this.content = content;
    }
}

通过资源链接获取资源内容:TextExtract.java;Tools.java

// ---------------------------------------------------------------------------
// TextExtract.java
// ---------------------------------------------------------------------------
package com.sinosoft.lhresource.search.common;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts the main text of an HTML page with a line-block density heuristic.
 *
 * <p>The markup is stripped, the remaining text is split into lines, and for every line the
 * character count of a window of {@link #BLOCKS_WIDTH} consecutive lines is computed. A text
 * block starts where that count exceeds {@link #threshold} and ends at the next empty window.
 *
 * <p>All working state of a {@code parse} call lives in local variables; only the threshold is
 * shared between calls.
 */
public class TextExtract {
    private static final Logger LOG = LoggerFactory.getLogger(TextExtract.class);

    /** Number of consecutive lines summed into one density window. */
    private static final int BLOCKS_WIDTH = 3;

    // Compiled once; the expressions are the same ones the extractor has always used.
    private static final Pattern DOCTYPE = Pattern.compile("(?is)<!DOCTYPE.*?>");
    private static final Pattern HTML_COMMENT = Pattern.compile("(?is)<!--.*?-->");
    private static final Pattern SCRIPT = Pattern.compile("(?is)<script.*?>.*?</script>");
    private static final Pattern STYLE = Pattern.compile("(?is)<style.*?>.*?</style>");
    private static final Pattern ENTITY = Pattern.compile("&.{2,5};|&#.{2,5};");
    private static final Pattern TAG = Pattern.compile("(?is)<.*?>");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Density a window must exceed to open a text block.
     *
     * <p>If blocks of headline links are not being removed from the extracted text, raise this
     * value. A larger value improves precision and lowers recall; a smaller value lets more noise
     * through but still captures bodies that consist of a single sentence.
     */
    private static int threshold = 86;

    /**
     * Replaces the density threshold used by all subsequent {@code parse} calls.
     *
     * @param value new threshold
     */
    public static void setthreshold(int value) {
        threshold = value;
    }

    /**
     * Extracts the main text of a page; equivalent to {@code parse(_html, false)}.
     *
     * @param _html HTML source of the page
     * @return the extracted text, possibly empty
     */
    public static String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the main text of a page.
     *
     * @param _html HTML source of the page; {@code null} yields an empty string
     * @param _flag accepted for compatibility; this implementation never reads it, so no
     *              "is this a topic page" check is performed
     * @return the extracted text, possibly empty
     */
    public static String parse(String _html, boolean _flag) {
        if (_html == null) {
            return "";
        }
        String stripped = preProcess(_html);
        LOG.debug(stripped);
        return getText(stripped);
    }

    /**
     * Removes everything that is not visible text while keeping the line structure.
     *
     * @param html raw HTML
     * @return text with doctype, comments, scripts, styles, entities and tags removed
     */
    private static String preProcess(String html) {
        String result = DOCTYPE.matcher(html).replaceAll("");
        result = HTML_COMMENT.matcher(result).replaceAll("");   // remove html comments
        result = SCRIPT.matcher(result).replaceAll("");         // remove javascript
        result = STYLE.matcher(result).replaceAll("");          // remove css
        result = ENTITY.matcher(result).replaceAll(" ");        // replace character entities by a space
        result = TAG.matcher(result).replaceAll("");            // remove all remaining tags
        // Sample markup kept from the original source:
        // <!--[if !IE]>|xGv00|9900d21eb16fa4350a3001b3974a9415<![endif]-->
        return result;
    }

    /**
     * Runs the density scan over already stripped text.
     *
     * @param html text produced by {@link #preProcess(String)}
     * @return concatenation of all accepted text blocks, one line per source line
     */
    private static String getText(String html) {
        // Arrays.asList is fixed-size but allows set(), which is all that is needed here.
        List<String> lines = Arrays.asList(html.split("\n"));
        List<Integer> density = new ArrayList<>();

        for (int i = 0; i < lines.size() - BLOCKS_WIDTH; i++) {
            int wordsNum = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                lines.set(j, WHITESPACE.matcher(lines.get(j)).replaceAll(""));
                wordsNum += lines.get(j).length();
            }
            density.add(wordsNum);
            LOG.debug("{}", wordsNum);
        }

        final int limit = threshold;
        int start = -1;
        int end = -1;
        boolean boolstart = false;
        boolean boolend = false;
        StringBuilder text = new StringBuilder();

        for (int i = 0; i < density.size() - 1; i++) {
            if (density.get(i) > limit && !boolstart) {
                // Look ahead up to three windows; windows past the end count as empty instead of
                // raising IndexOutOfBoundsException.
                if (densityAt(density, i + 1) != 0
                        || densityAt(density, i + 2) != 0
                        || densityAt(density, i + 3) != 0) {
                    boolstart = true;
                    start = i;
                    continue;
                }
            }
            if (boolstart) {
                if (density.get(i) == 0 || density.get(i + 1) == 0) {
                    end = i;
                    boolend = true;
                }
            }
            if (boolend) {
                LOG.debug("{}\t\t{}", start + 1, end + 1);
                StringBuilder tmp = new StringBuilder();
                for (int ii = start; ii <= end; ii++) {
                    // Lines shorter than five characters are dropped from the block.
                    if (lines.get(ii).length() < 5) {
                        continue;
                    }
                    tmp.append(lines.get(ii)).append("\n");
                }
                String str = tmp.toString();
                LOG.debug(str);
                if (str.contains("Copyright") || str.contains("版权所有")) {
                    // NOTE(review): the start/end flags are not reset here, so the same block is
                    // re-evaluated (and rejected again) on the following iterations. Behaviour is
                    // kept as it was; confirm whether that is intended.
                    continue;
                }
                text.append(str);
                boolstart = boolend = false;
            }
        }
        // A block that is still open when the scan ends is not emitted.
        return text.toString();
    }

    /** Returns the density at {@code index}, or 0 when the index lies outside the list. */
    private static int densityAt(List<Integer> density, int index) {
        return index >= 0 && index < density.size() ? density.get(index) : 0;
    }
}

// ---------------------------------------------------------------------------
// Tools.java
// ---------------------------------------------------------------------------
package com.sinosoft.lhresource.search.common;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Small I/O helpers: download-and-extract page text, copy a stream to a file, read a stream. */
public class Tools {
    private static final Logger LOG = LoggerFactory.getLogger(Tools.class);

    /**
     * Downloads a page as UTF-8 and returns its extracted main text.
     *
     * @param url address of the page
     * @return the extracted text, or {@code null} if the page could not be fetched or decoded
     */
    public static String getHTMLContent(String url) {
        return getHTMLContent(url, "utf-8");
    }

    /**
     * Downloads a page and returns its main text as extracted by {@link TextExtract#parse(String)}.
     *
     * @param url      address of the page
     * @param encoding charset name used to decode the response
     * @return the extracted text, or {@code null} if anything goes wrong (the failure is logged)
     */
    public static String getHTMLContent(String url, String encoding) {
        // Both resources are declared separately so the stream is closed even when the reader
        // cannot be created (for example because the charset name is unknown).
        try (InputStream in = new URL(url).openStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append("\n");
            }
            return TextExtract.parse(html.toString());
        } catch (Exception e) {
            // Boundary catch: callers rely on "null on any failure".
            LOG.warn("解析URL失败:{}", url, e);
        }
        return null;
    }

    /**
     * Copies everything from {@code in} to {@code outFile} and closes the input stream.
     *
     * <p>I/O failures are logged, not thrown; the target file may then be incomplete.
     *
     * @param in      source stream; closed by this method
     * @param outFile file to write, overwritten if it exists
     * @throws NullPointerException if {@code in} is {@code null}
     */
    public static void copyFile(InputStream in, File outFile) {
        if (in == null) {
            throw new NullPointerException("in");
        }
        try (InputStream source = in;
             OutputStream out = new FileOutputStream(outFile)) {
            // Stream in chunks instead of buffering the whole input in memory.
            byte[] buffer = new byte[8192];
            int n;
            while ((n = source.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
        } catch (IOException ex) {
            LOG.error("文件操作失败", ex);
        }
    }

    /**
     * Reads a stream until end of input. The stream is not closed.
     *
     * @param in stream to read
     * @return all bytes read; if an {@link IOException} occurs it is logged and the bytes read
     *         up to that point are returned
     */
    public static byte[] readAll(InputStream in) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[1024];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
        } catch (IOException e) {
            LOG.error("读取失败", e);
        }
        return out.toByteArray();
    }
}

自定义检索接口:Searcher.java

package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Contract for a keyword search that yields {@link Webpage} results.
 */
public interface Searcher {

    /**
     * Searches for a keyword without specifying a page.
     *
     * @param keyword text to search for
     * @return the matching pages
     */
    List<Webpage> search(String keyword);

    /**
     * Searches for a keyword and returns one page of results.
     *
     * @param keyword text to search for
     * @param page    result page to fetch (numbering is defined by the implementation)
     * @return the matching pages on that result page
     */
    List<Webpage> search(String keyword, int page);
}

自定义处理百度检索接口:BaiduSearcher.java;及其抽象基类:AbstractBaiduSearcher.java

// ---------------------------------------------------------------------------
// BaiduSearcher.java
// ---------------------------------------------------------------------------
package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * A {@link Searcher} that additionally exposes Baidu's vertical searches
 * (news, Tieba, Zhidao, Wenku), each with and without paging.
 */
public interface BaiduSearcher extends Searcher {

    /**
     * News search.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    public List<Webpage> searchNews(String keyword);

    /**
     * News search (paged).
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return matching pages
     */
    public List<Webpage> searchNews(String keyword, int page);

    /**
     * Tieba search.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    public List<Webpage> searchTieba(String keyword);

    /**
     * Tieba search (paged).
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return matching pages
     */
    public List<Webpage> searchTieba(String keyword, int page);

    /**
     * Zhidao search.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    public List<Webpage> searchZhidao(String keyword);

    /**
     * Zhidao search (paged).
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return matching pages
     */
    public List<Webpage> searchZhidao(String keyword, int page);

    /**
     * Wenku search.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    public List<Webpage> searchWenku(String keyword);

    /**
     * Wenku search (paged).
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return matching pages
     */
    public List<Webpage> searchWenku(String keyword, int page);
}

// ---------------------------------------------------------------------------
// AbstractBaiduSearcher.java
// ---------------------------------------------------------------------------
package com.sinosoft.lhresource.search.common;

import java.util.List;

/**
 * Base class for {@link BaiduSearcher} implementations.
 *
 * <p>Every non-paged vertical search delegates to its paged overload with page {@code 1}.
 * Every paged vertical search throws {@link UnsupportedOperationException} until a subclass
 * overrides it. The two general {@code search} methods inherited from {@link Searcher} are
 * left abstract.
 */
public abstract class AbstractBaiduSearcher implements BaiduSearcher {

    /**
     * News search; delegates to {@code searchNews(keyword, 1)}.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    @Override
    public List<Webpage> searchNews(String keyword) {
        return searchNews(keyword, 1);
    }

    /**
     * News search (paged). Not implemented in this base class.
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return never returns normally
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchNews(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    /**
     * Tieba search; delegates to {@code searchTieba(keyword, 1)}.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    @Override
    public List<Webpage> searchTieba(String keyword) {
        return searchTieba(keyword, 1);
    }

    /**
     * Tieba search (paged). Not implemented in this base class.
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return never returns normally
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchTieba(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    /**
     * Zhidao search; delegates to {@code searchZhidao(keyword, 1)}.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    @Override
    public List<Webpage> searchZhidao(String keyword) {
        return searchZhidao(keyword, 1);
    }

    /**
     * Zhidao search (paged). Not implemented in this base class.
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return never returns normally
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchZhidao(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }

    /**
     * Wenku search; delegates to {@code searchWenku(keyword, 1)}.
     *
     * @param keyword text to search for
     * @return matching pages
     */
    @Override
    public List<Webpage> searchWenku(String keyword) {
        return searchWenku(keyword, 1);
    }

    /**
     * Wenku search (paged). Not implemented in this base class.
     *
     * @param keyword text to search for
     * @param page    result page to fetch
     * @return never returns normally
     * @throws UnsupportedOperationException always, unless overridden
     */
    @Override
    public List<Webpage> searchWenku(String keyword, int page) {
        throw new UnsupportedOperationException("未实现");
    }
}

百度搜索+Jsoup实现资源收集:JSoupBaiduSearcher.java

package com.sinosoft.lhresource.search.common;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link BaiduSearcher} that fetches a Baidu result page with Jsoup and scrapes title, link and
 * summary of each hit through CSS selectors.
 *
 * <p>The selectors are tied to the markup of the Baidu result page; if that markup changes the
 * searcher will simply find fewer (or no) results.
 */
public class JSoupBaiduSearcher extends AbstractBaiduSearcher {
    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);

    /** Baidu returns ten results per page. */
    private static final int PAGE_SIZE = 10;

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    /** Fallback selectors used when the regular result container is missing (Baidu Baike entry). */
    private static final String BAIKE_TITLE_QUERY =
            "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
    private static final String BAIKE_SUMMARY_QUERY =
            "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";

    /**
     * Searches the first result page.
     *
     * @param keyword text to search for
     * @return the results found, never {@code null}
     */
    @Override
    public List<Webpage> search(String keyword) {
        return search(keyword, 1);
    }

    /**
     * Searches one result page.
     *
     * <p>Baidu's {@code pn} parameter is not a page number but the offset of the first result:
     * page 1 is {@code pn=0}, page 2 is {@code pn=10}, and so on, i.e.
     * {@code (page - 1) * PAGE_SIZE}.
     *
     * @param keyword text to search for; {@code null} or blank yields an empty list
     * @param page    1-based page number; values below 1 are treated as 1
     * @return the results that could be extracted, empty if there are none or the request failed
     */
    @Override
    public List<Webpage> search(String keyword, int page) {
        List<Webpage> webpages = new ArrayList<>();
        if (keyword == null || keyword.trim().isEmpty()) {
            return webpages;
        }
        int offset = (Math.max(page, 1) - 1) * PAGE_SIZE;
        try {
            // The keyword must be percent-encoded, otherwise spaces, '&' or non-ASCII text
            // produce a broken query string.
            String url = "http://www.baidu.com/s?pn=" + offset
                    + "&wd=" + URLEncoder.encode(keyword, "UTF-8");
            Document document = Jsoup.connect(url).get();

            int total = getBaiduSearchResultCount(document);
            if (total < 1) {
                return webpages;
            }
            // Fewer hits than one page: only look for that many entries.
            int len = Math.min(total, PAGE_SIZE);
            for (int i = 0; i < len; i++) {
                Webpage webpage = extractResult(document, offset + i + 1);
                if (webpage != null) {
                    webpages.add(webpage);
                }
            }
        } catch (IOException ex) {
            LOG.error("搜索出错", ex);
        }
        return webpages;
    }

    /**
     * Extracts the result whose container element has the given numeric id.
     *
     * @param document parsed result page
     * @param position id of the result container (1-based index over all pages)
     * @return the populated page, or {@code null} if title or summary could not be found
     */
    private Webpage extractResult(Document document, int position) {
        String titleCssQuery = "html body div div div div#content_left div#" + position
                + ".result.c-container h3.t a";
        String summaryCssQuery = "html body div div div div#content_left div#" + position
                + ".result.c-container div.c-abstract";
        LOG.debug("titleCssQuery:{}", titleCssQuery);
        LOG.debug("summaryCssQuery:{}", summaryCssQuery);

        Element titleElement = document.select(titleCssQuery).first();
        if (titleElement == null) {
            // Baidu Baike entries use a different container.
            titleCssQuery = BAIKE_TITLE_QUERY;
            summaryCssQuery = BAIKE_SUMMARY_QUERY;
            LOG.debug("处理百度百科 titleCssQuery:{}", titleCssQuery);
            LOG.debug("处理百度百科 summaryCssQuery:{}", summaryCssQuery);
            titleElement = document.select(titleCssQuery).first();
        }
        String titleText = "";
        String href = "";
        if (titleElement != null) {
            titleText = titleElement.text();
            href = titleElement.attr("href");
        }
        LOG.debug(titleText);

        Element summaryElement = document.select(summaryCssQuery).first();
        if (summaryElement == null) {
            // Baidu Zhidao entries keep the summary in a <font> element.
            summaryCssQuery = summaryCssQuery.replace("div.c-abstract", "font");
            LOG.debug("处理百度知道 summaryCssQuery:{}", summaryCssQuery);
            summaryElement = document.select(summaryCssQuery).first();
        }
        String summaryText = summaryElement != null ? summaryElement.text() : "";
        LOG.debug(summaryText);

        if (isBlank(titleText) || isBlank(summaryText)) {
            LOG.error("获取搜索结果列表项出错:{} - {}", titleText, summaryText);
            return null;
        }
        Webpage webpage = new Webpage();
        webpage.setTitle(titleText);
        webpage.setUrl(href);
        webpage.setSummary(summaryText);
        // The body text is deliberately not fetched here. To fill it in, call
        // Tools.getHTMLContent(href) and pass the result to webpage.setContent(...).
        return webpage;
    }

    /** Returns {@code true} for {@code null}, empty or whitespace-only text. */
    private static boolean isBlank(String text) {
        return text == null || text.trim().isEmpty();
    }

    /**
     * Reads the number of hits from the result page.
     *
     * <p>The page contains a text such as "百度为您找到相关结果约13,200个"; every non-digit is
     * removed and the remainder is parsed.
     *
     * @param document parsed result page
     * @return the number of hits, 0 if the text is missing or contains no digits, and
     *         {@link Integer#MAX_VALUE} if the number does not fit into an {@code int}
     */
    private int getBaiduSearchResultCount(Document document) {
        String cssQuery = "html body div div div div.nums";
        LOG.debug("total cssQuery: {}", cssQuery);
        Element totalElement = document.select(cssQuery).first();
        if (totalElement == null) {
            LOG.warn("未找到搜索结果数元素:{}", cssQuery);
            return 0;
        }
        String totalText = totalElement.text();
        LOG.info("搜索结果文本:{}", totalText);

        Matcher matcher = NON_DIGITS.matcher(totalText);
        String digits = matcher.replaceAll("");
        if (digits.isEmpty()) {
            return 0;
        }
        int total;
        try {
            total = (int) Math.min(Long.parseLong(digits), (long) Integer.MAX_VALUE);
        } catch (NumberFormatException tooLarge) {
            // digits contains only 0-9, so the only possible cause is overflow of long.
            total = Integer.MAX_VALUE;
        }
        LOG.info("搜索结果数:{}", total);
        return total;
    }

    /**
     * Demo: searches the second result page for a fixed keyword and logs every hit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int page = 2;
        Searcher searcher = new JSoupBaiduSearcher();
        List<Webpage> webpages = searcher.search("六扇门", page);
        if (webpages == null || webpages.isEmpty()) {
            LOG.error("没有搜索到结果");
            return;
        }
        LOG.info("搜索结果 当前第 " + page + " 页,页面大小为:" + webpages.size()
                + " 共有结果数:" + webpages.size());
        int i = 1;
        for (Webpage webpage : webpages) {
            LOG.info("搜索结果 " + (i++) + " :");
            LOG.info("标题:" + webpage.getTitle());
            LOG.info("URL:" + webpage.getUrl());
            LOG.info("摘要:" + webpage.getSummary());
            LOG.info("正文:" + webpage.getContent());
            LOG.info("");
        }
    }
}
0 0
原创粉丝点击