使用jsoup爬虫抓取页面
来源:互联网 发布:天之道其犹张弓乎原文 编辑:程序博客网 时间:2024/05/22 15:41
httpclient 和jsoup都可以实现模拟浏览器抓取页面,前者发送请求,后者解析html标签比较强大。本例直接使用jsoup实现请求和解析。
package com.chongdong.log.test;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Class name: JsoupTest
 * Description: uses jsoup to scrape chat responses from the Mitsuku bot
 *              (Pandorabots talk-xml endpoint).
 * Author: zk
 * Created: 2015-7-20 3:52:06 PM
 * Modified by: zk
 * Modified: 2015-7-20 3:52:06 PM
 * Notes: jsoup performs both the HTTP request and the HTML/XML parsing here;
 *        no separate httpclient is needed.
 * @version 1.0
 */
public class JsoupTest {

    /**
     * Sends the chat message "hi" to the Mitsuku bot via an HTTP POST to the
     * Pandorabots talk-xml endpoint and prints the raw response body.
     *
     * Form parameters (see http://www.pandorabots.com/botmaster/en/faq#h1):
     *   input  - the chat message
     *   botid  - identifies the Mitsuku bot
     *   custid - identifies the conversation/customer
     */
    public static void postMitSuKu() {
        Map<String, String> params = new HashMap<String, String>();
        params.put("input", "hi");
        params.put("botid", "9fa364f2fe345a10");
        params.put("custid", "c04f62ad1e044059");

        Connection conn = Jsoup.connect("http://fiddle.pandorabots.com/pandora/talk-xml");
        // Mimic a real browser request originating from the Flash client.
        conn.header("Accept", "*/*");
        conn.header("Accept-Encoding", "gzip,deflate,sdch");
        conn.header("Accept-Language", "zh-CN,zh;q=0.8");
        conn.header("Origin", "http://www.square-bear.co.uk");
        conn.header("Pragma", "no-cache");
        conn.header("Connection", "Keep-Alive");
        conn.header("Content-Type", "application/x-www-form-urlencoded");
        conn.header("Host", "fiddle.pandorabots.com");
        conn.header("Referer", "http://www.square-bear.co.uk/mitsuku/mitsy_retro.swf");
        conn.header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");

        try {
            // FIX: Connection.execute() is declared to return the
            // Connection.Response interface; the original code downcast to the
            // concrete org.jsoup.helper.HttpConnection.Response, which is
            // unnecessary and breaks if jsoup changes its internal class.
            Connection.Response response =
                    conn.ignoreContentType(true).method(Method.POST).data(params).execute();
            String body = response.body();
            System.out.println(body);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the page at {@code url} and extracts the text of all
     * {@code result that} elements (the bot reply inside the talk-xml payload).
     *
     * @param url the URL to request
     * @return the extracted text, or "ok" when the request/parse fails
     *         (NOTE(review): returning "ok" on failure looks like a sentinel
     *         the original author chose — confirm callers expect it before
     *         changing it to null/empty)
     */
    private static String processLogic(String url) {
        String result = "";
        try {
            Document document = Jsoup.connect(url)
                    .ignoreContentType(true)
                    .ignoreHttpErrors(true)
                    .followRedirects(true)
                    .timeout(5000)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0;)")
                    .get();
            Elements elements = document.select("result that");
            result = elements.text();
            System.out.println(result);
        } catch (Exception e) {
            e.printStackTrace();
            result = "ok";
        }
        return result;
    }

    /**
     * Entry point: posts 100 chat requests in a row.
     * NOTE(review): this hammers the remote service with no delay between
     * requests — consider a pause or a smaller count for polite scraping.
     */
    public static void main(String[] args) {
        for (int i = 0; i < 100; i++) {
            postMitSuKu();
        }
    }
}