抓取百度关键词排名、标题、链接、描述

来源:互联网 发布:开根号c语言 编辑:程序博客网 时间:2024/04/24 04:37

抓取百度关键词排名、标题、链接、描述

转载请标明出处

最近在做百度关键词排名的功能,发现网上资源比较少,于是自己琢磨了一下,写一下笔记;
本文重点在于提供思路,请不要过分依赖,本文主要靠抓取页面标签来完成,如果百度官网将页面标签修改了,请自行修改,如果遇到问题或需要修改的地方请私信我。
鸣谢:本公司SEO提供思路

package cc.test.core;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.Tag;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.HasParentFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.filters.OrFilter;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.tags.Div;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.TitleTag;import org.htmlparser.util.NodeList;public class KeywordRun {    /**     *      * @Description: 获取排名数     * @param @param keyword-关键词     * @param @param url-域名     * @param @return        * @return int       */    public int getKeywordRank(String keyword, String url) {        int re = 0;        // for (int i = 0; i < 100; i += 10) {        // if (i == 0) {        re = getThisRank("http://www.baidu.com/s?wd=" + keyword, url);        //        // if (re > 0)        // break;        // } else {        // re = getThisRank("http://www.baidu.com/s?wd=" + keyword + "&pn=" + i,        // url);        // if (re > 0) {        // re += i;        // break;        // }        // }        // }        return re;    }    /**/    public int getThisRank(String resource, String url) {        int re = -1;        int n = 1;        try {            Parser myParser = new Parser(resource);            // 设置编码            myParser.setEncoding("UTF-8");            NodeFilter filter = new AndFilter(new TagNameFilter("DIV"), new HasParentFilter(new AndFilter(new TagNameFilter("DIV"), new HasAttributeFilter("id", "content_left"))));            NodeList nodeList = myParser.extractAllNodesThatMatch(filter);            for (int i = 0; i < nodeList.size(); i++) {                Div table = (Div) nodeList.elementAt(i);                Parser parser = new Parser(table.toHtml());                NodeFilter TitleFilter = new NodeClassFilter(TitleTag.class);                NodeFilter ElementIdFilter = new HasAttributeFilter("class", 
"c-showurl");// 获取链接                OrFilter orFilter = new OrFilter(TitleFilter, ElementIdFilter); // 做一个逻辑OR                                                                                // Filter组合                NodeList list = parser.extractAllNodesThatMatch(orFilter);                LinkTag linkTag = (LinkTag) list.elementAt(0);                String link = HttpUtil.getBaiduFinalLink(linkTag.getLink());// 最终的连接                System.out.println(link);                parser = new Parser(table.toHtml());                TitleFilter = new NodeClassFilter(TitleTag.class);                ElementIdFilter = new HasAttributeFilter("class", "t");// 获取标题                orFilter = new OrFilter(TitleFilter, ElementIdFilter); // 做一个逻辑OR                                                                        // Filter组合                list = parser.extractAllNodesThatMatch(orFilter);                Tag tag = (Tag) list.elementAt(0);                linkTag = (LinkTag) tag.getChildren().elementAt(0);                //System.out.println(splitAndFilterString(linkTag.getChildrenHTML(),linkTag.getChildrenHTML().length()));                parser = new Parser(table.toHtml());                TitleFilter = new NodeClassFilter(TitleTag.class);                ElementIdFilter = new HasAttributeFilter("class", "c-abstract");// 获取标题                orFilter = new OrFilter(TitleFilter, ElementIdFilter); // 做一个逻辑OR                // Filter组合                list = parser.extractAllNodesThatMatch(orFilter);                Div div = (Div) list.elementAt(0);                String _abstract = splitAndFilterString(div.toHtml(),div.toHtml().length());//简介                System.out.println(_abstract);            }            // System.out.println(text);        } catch (Exception e) {            e.printStackTrace();            re = -1;        }        return re;    }/**     *      * @Description: 调试     * @param @param args        * @return void       * @throws     */    public static void 
main(String[] args) {        KeywordRun run = new KeywordRun();        Integer re = run.getKeywordRank("百度", "www.baidu.com");    }}

需要用到的外部方法

/**     *      * 获取百度最终连接     */    public static String getBaiduFinalLink(String link){        BufferedReader in = null;        try {            URL realUrl = new URL(link);            // 打开和URL之间的连接            HttpURLConnection connection = (HttpURLConnection)realUrl.openConnection();            // 设置通用的请求属性            connection.setRequestProperty("accept", "*/*");            connection.setRequestProperty("connection", "Keep-Alive");            connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");            // 建立实际的连接            connection.connect();            in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "utf-8"));            URL host = connection.getURL();            if(connection.getResponseCode() >= 400){                return null;            }            if (in != null) {                in.close();            }            return host.getHost()+host.getPath();        } catch (Exception e) {            System.out.println("发送GET请求出现异常!" + e);            e.printStackTrace();        }        return null;    }    /**     * 删除input字符串中的html格式     *      * @param input     * @param length     * @return     */    public static String splitAndFilterString(String input, int length) {        if (input == null || input.trim().equals("")) {            return "";        }        // 去掉所有html元素,        String str = input.replaceAll("\\&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");        str = str.replaceAll("[(/>)<]", "");        int len = str.length();        if (len <= length) {            return str;        } else {            str = str.substring(0, length);            str += "......";        }        return str;    }

到此结束,
已经获取出百度第一页的所有链接、标题、简介;如果百度页面有修改,请做相应的修改即可。

转载请标明出处

0 0