使用jsoup爬虫抓取页面
来源:互联网 发布:天之道其犹张弓乎原文 编辑:程序博客网 时间:2024/05/22 15:41
httpclient 和jsoup都可以实现模拟浏览器抓取页面,前者发送请求,后者解析html标签比较强大。本例直接使用jsoup实现请求和解析。
package com.chongdong.log.test;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

/**
 * Class name: JsoupTest
 * Description: uses jsoup to scrape chat responses from the Mitsuku bot
 *              (Pandorabots talk-xml endpoint).
 * Author: zk
 * Created: 2015-7-20 3:52:06 PM
 * Modified by: zk
 * Modified: 2015-7-20 3:52:06 PM
 * Notes: jsoup performs both the HTTP request and the HTML/XML parsing here;
 *        no separate httpclient is needed.
 * @version 1.0
 */
public class JsoupTest {

    /**
     * Sends the chat message "hi" to the Mitsuku bot via an HTTP POST to the
     * Pandorabots talk-xml endpoint and prints the raw response body.
     *
     * Form parameters (see http://www.pandorabots.com/botmaster/en/faq#h1):
     *   input  - the chat message
     *   botid  - identifies the Mitsuku bot
     *   custid - identifies the conversation/customer
     */
    public static void postMitSuKu() {
        Map<String, String> params = new HashMap<String, String>();
        params.put("input", "hi");
        params.put("botid", "9fa364f2fe345a10");
        params.put("custid", "c04f62ad1e044059");

        Connection conn = Jsoup.connect("http://fiddle.pandorabots.com/pandora/talk-xml");
        // Mimic a real browser request originating from the Flash client.
        conn.header("Accept", "*/*");
        conn.header("Accept-Encoding", "gzip,deflate,sdch");
        conn.header("Accept-Language", "zh-CN,zh;q=0.8");
        conn.header("Origin", "http://www.square-bear.co.uk");
        conn.header("Pragma", "no-cache");
        conn.header("Connection", "Keep-Alive");
        conn.header("Content-Type", "application/x-www-form-urlencoded");
        conn.header("Host", "fiddle.pandorabots.com");
        conn.header("Referer", "http://www.square-bear.co.uk/mitsuku/mitsy_retro.swf");
        conn.header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");

        try {
            // FIX: Connection.execute() is declared to return the
            // Connection.Response interface; the original code downcast to the
            // concrete org.jsoup.helper.HttpConnection.Response, which is
            // unnecessary and breaks if jsoup changes its internal class.
            Connection.Response response =
                    conn.ignoreContentType(true).method(Method.POST).data(params).execute();
            String body = response.body();
            System.out.println(body);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the page at {@code url} and extracts the text of all
     * {@code result that} elements (the bot reply inside the talk-xml payload).
     *
     * @param url the URL to request
     * @return the extracted text, or "ok" when the request/parse fails
     *         (NOTE(review): returning "ok" on failure looks like a sentinel
     *         the original author chose — confirm callers expect it before
     *         changing it to null/empty)
     */
    private static String processLogic(String url) {
        String result = "";
        try {
            Document document = Jsoup.connect(url)
                    .ignoreContentType(true)
                    .ignoreHttpErrors(true)
                    .followRedirects(true)
                    .timeout(5000)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0;)")
                    .get();
            Elements elements = document.select("result that");
            result = elements.text();
            System.out.println(result);
        } catch (Exception e) {
            e.printStackTrace();
            result = "ok";
        }
        return result;
    }

    /**
     * Entry point: posts 100 chat requests in a row.
     * NOTE(review): this hammers the remote service with no delay between
     * requests — consider a pause or a smaller count for polite scraping.
     */
    public static void main(String[] args) {
        for (int i = 0; i < 100; i++) {
            postMitSuKu();
        }
    }
}