Java 用 jsoup 爬数据

来源:互联网 发布:网络包年维护服务 编辑:程序博客网 时间:2024/05/22 06:35


其中 cookie 是用 Fiddler 抓包找到的:





package com.lm.test;import org.jsoup.Connection;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;//import com.ig.common.utils.*;import org.jsoup.Connection;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.select.Elements;//import org.junit.Test;import java.io.BufferedWriter;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStreamWriter;import java.io.UnsupportedEncodingException;import java.text.SimpleDateFormat;import java.util.Date;public class JsoupApiTest {public static void testGame() {//登陆//String url = "http://www.ysdqkh.com/Student/LoginPass.asp";//SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");//Connection.Response res = null;//try {//res = Jsoup//.connect(url)//.data("TstNumber", "4306231989xxxx", "TstPassword",//"400xxxx").method(Connection.Method.POST).execute();//} catch (IOException e) {//e.printStackTrace();//}//String sessionId = res.cookie("StudentId"); // StudentId=14387 Cookie://// ASPSESSIONIDSCAAQADA=LGHEAEEBIJAGGBMNJDPMLHPF;//// StudentId=14387try {//File file = new File("h:/血液内科.txt");//File file = new File("h:/心血管内科.txt");//File file = new File("h:/呼吸内科.txt");File file = new File("h:/风湿免疫内科.txt");//int ye_count=7;//7页//int ye_count=9;//心血管内科//int ye_count=6;//呼吸内科int ye_count=1;//风湿免疫内科// if file doesnt exists, then create itif (!file.exists()) {file.createNewFile();} FileOutputStream fos=new FileOutputStream(file);        OutputStreamWriter osw=new OutputStreamWriter(fos, "UTF-8");        BufferedWriter  bw=new BufferedWriter(osw);                for (int i = 1; i <= ye_count; i++) {        Document objectDoc = Jsoup.connect(//"http://www.ysdqkh.com/Student/PracticePreview.asp?o_id=4&q_id=407&pn="+i)//血液内科//"http://www.ysdqkh.com/Student/PracticePreview.asp?o_id=4&q_id=406&pn="+i)//心血管内科//"http://www.ysdqkh.com/Student/PracticePreview.asp?o_id=4&q_id=401&pn="+i)//呼吸内科 
"http://www.ysdqkh.com/Student/PracticePreview.asp?o_id=4&q_id=400&pn="+i)//风湿免疫内科.cookie("StudentId", "14387").get();Elements links = objectDoc.select("tr[align]"); // 带有href属性的a元素// Elements links =// doc.select("a[href]");for (Element link : links) {System.out.println(link.text()); bw.write(link.text()+"\t\n");}}  //注意关闭的先后顺序,先打开的后关闭,后打开的先关闭        bw.close();        osw.close();        fos.close();//System.out.println(objectDoc);} catch (Exception e) {e.printStackTrace();}}public static void main(String[] args) {testGame();}}