使用jsoup爬取数据并导出excel文件保存

来源：互联网 | 发布：三星淘宝旗舰店 | 编辑：程序博客网 | 时间：2024/05/18 23:52

/**
 * Rule describing one scraping request: where to fetch from, which request
 * parameters accompany it, and how the first filtering pass over the returned
 * HTML selects elements.
 */
public class Rule {

    /** Request type: GET. */
    public static final int GET = 0;

    /** Request type: POST. */
    public static final int POST = 1;

    /** {@code resultTagName} type: CLASS. */
    public static final int CLASS = 0;

    /** {@code resultTagName} type: ID. */
    public static final int ID = 1;

    /** {@code resultTagName} type: SELECTION. */
    public static final int SELECTION = 2;

    /** The link to fetch. */
    private String url;

    /** The set of parameter names. */
    private String[] params;

    /** The values corresponding to {@link #params}. */
    private String[] values;

    /** Tag used for the first filtering pass over the returned HTML; set the type first. */
    private String resultTagName;

    /** One of CLASS / ID / SELECTION, giving the type of {@code resultTagName}; ID by default. */
    private int type = ID;

    /** One of GET / POST; GET by default. */
    private int requestMoethod = GET;

    /** Creates a rule in which only the defaults (type ID, request GET) are set. */
    public Rule() {
    }

    /**
     * Creates a fully populated rule.
     *
     * @param url            the link to fetch
     * @param params         parameter names
     * @param values         parameter values
     * @param resultTagName  tag used for the first filtering pass
     * @param type           type of {@code resultTagName}
     * @param requestMoethod request type
     */
    public Rule(String url, String[] params, String[] values, String resultTagName, int type,
            int requestMoethod) {
        this.url = url;
        this.params = params;
        this.values = values;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    /** @return the link to fetch */
    public String getUrl() {
        return this.url;
    }

    /** @param url the link to fetch */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the parameter names (the stored array itself, not a copy) */
    public String[] getParams() {
        return this.params;
    }

    /** @param params the parameter names */
    public void setParams(String[] params) {
        this.params = params;
    }

    /** @return the parameter values (the stored array itself, not a copy) */
    public String[] getValues() {
        return this.values;
    }

    /** @param values the parameter values */
    public void setValues(String[] values) {
        this.values = values;
    }

    /** @return the tag used for the first filtering pass */
    public String getResultTagName() {
        return this.resultTagName;
    }

    /** @param resultTagName the tag used for the first filtering pass */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return the type of {@code resultTagName} */
    public int getType() {
        return this.type;
    }

    /** @param type the type of {@code resultTagName} */
    public void setType(int type) {
        this.type = type;
    }

    /** @return the request type */
    public int getRequestMoethod() {
        return this.requestMoethod;
    }

    /** @param requestMoethod the request type */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}

import java.io.IOException;import java.net.HttpURLConnection;import java.net.URL;import java.util.ArrayList;import java.util.List;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;import com.zhy.spider.rule.Rule;import com.zhy.spider.rule.RuleException;import com.zhy.spider.util.TextUtil;public class ExtractService {/** * @param rule * @return */public static List<List<String>> extract(Rule rule, String urlName) {// 进行对rule的必要校验validateRule(rule);List<List<String>> datas = null;try {/** * 解析rule */String url = rule.getUrl();String[] params = rule.getParams();String[] values = rule.getValues();String resultTagName = rule.getResultTagName();int type = rule.getType();int requestType = rule.getRequestMoethod();URL realUrl = new URL(url);HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection();// 是否允许缓存，默认true。connection.setUseCaches(Boolean.FALSE);// 是否开启输出输入，如果是post使用true。默认是false// connection.setDoOutput(Boolean.TRUE);// connection.setDoInput(Boolean.TRUE);// 设置请求头信息connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");//connection.addRequestProperty("Connection", "close");// 设置连接主机超时（单位：毫秒）connection.setConnectTimeout(8000);// 设置从主机读取数据超时（单位：毫秒）connection.setReadTimeout(8000);// 设置Cookie// connection.addRequestProperty("Cookie","你的Cookies" );// 设置查询参数String str = "";if (params != null) {for (int i = 0; i < params.length; i++) {str = str + "&" + params[i] + "=" + values[i];}}// 设置请求类型，大小写都行，因为源码里都toUpperCase了。switch (requestType) {case Rule.GET:connection.setRequestMethod("GET");break;case Rule.POST:connection.setRequestMethod("POST");break;}// 获取页面编码//String encoding = WebEncoding.getCharset(url);// 开始请求Document doc = Jsoup.parse(connection.getInputStream(), "gb2312", url + str);// 处理返回数据Elements results = new Elements();switch (type) {case Rule.CLASS:results = doc.getElementsByClass(resultTagName);break;case Rule.ID:Element result 
= doc.getElementById(resultTagName);results.add(result);break;case Rule.SELECTION:results = doc.select(resultTagName);break;default:// 当resultTagName为空时默认去body标签if (TextUtil.isEmpty(resultTagName)) {results = doc.getElementsByTag("div");}}datas = LinkTypeData9998(results);if (datas == null || datas.size() == 0){results = new Elements();results = doc.getElementsByTag("title");for (Element element : results) {List<String> data = new ArrayList<String>();data.add(element.text());datas.add(data);}}} catch (IOException e) {e.printStackTrace();}return datas;}private static List<List<String>> LinkTypeData9998(Elements results) {List<List<String>> datas = new ArrayList<List<String>>();List<String> data = null;for (Element result : results) {Elements div1 = result.select("div.Bg");for (Element element : div1) {data = new ArrayList<String>();Elements h4 = element.getElementsByTag("h4");if (h4 != null && h4.size()>0){for (int i = 0; i < h4.size(); i++) {String text  = h4.get(i).text();String[] strings = text.split(" ");for (int j = 0; j < strings.length; j++) {data.add(strings[j]);}}}datas.add(data);}}return datas;}/** * 对传入的参数进行必要的校验 */private static void validateRule(Rule rule) {String url = rule.getUrl();if (TextUtil.isEmpty(url)) {throw new RuleException("url不能为空！");}if (url.startsWith("http://") || url.startsWith("https://")) {System.out.println(url);} else {throw new RuleException("url的格式不正确！");}if (rule.getParams() != null && rule.getValues() != null) {if (rule.getParams().length != rule.getValues().length) {throw new RuleException("参数的键值对个数不匹配！");}}}}

import java.io.FileOutputStream;import java.io.IOException;import java.util.ArrayList;import java.util.List;import javax.swing.JOptionPane;import com.zhy.spider.bean.LinkTypeData;import com.zhy.spider.core.ExtractService;import com.zhy.spider.core.ExtractService2;import com.zhy.spider.rule.Rule;import com.zhy.spider.util.ExcelService;import com.zhy.spider.util.WebContent;public class Test2 {public static void main(String[] args) {String fileName = "D:\\excel\\04.xls";FileOutputStream fos = null;ExcelService pd = new ExcelService();// 表头//String[] tableHeader = { "广告链接" ,"广告名称", "招商热线" , "微信", "企业网址" , "联系地址"};//ExcelService.createTableHeader("9928", tableHeader); // --->创建一个表头行Rule rule = new Rule("", null, null, null, -1, Rule.GET);List<LinkTypeData> extracts = ExtractService.extract(rule,"9998");int rowIndex = 1;try {for (LinkTypeData data : extracts) {System.out.println(data.getLinkHref());if (data.getLinkHref() != null && !"".equals(data.getLinkHref())){Rule rule2 = new Rule(data.getLinkHref(), null, null, null, -1, Rule.GET);List<List<String>> extracts2 = ExtractService2.extract(rule2,"9998");if (extracts2 != null && extracts2.size()>0){List<String> list = new ArrayList<String>();list.add(data.getLinkHref());for (List<String> list2 : extracts2) {for (int i = 0; i < list2.size(); i++) {System.out.println(list2.get(i));list.add(list2.get(i));}}ExcelService.createTableRow(list, (short) rowIndex);rowIndex++;}}System.out.println("***********************************");}fos = new FileOutputStream(fileName);pd.exportExcel(ExcelService.demoSheet, fos);JOptionPane.showMessageDialog(null, "表格已成功导出到 : " + fileName);} catch (Exception e) {JOptionPane.showMessageDialog(null, "表格导出出错，错误信息 ：" + e + "\n错误原因可能是表格已经打开。");e.printStackTrace();} finally {try {fos.close();} catch (Exception e) {e.printStackTrace();}}}}

import java.io.FileOutputStream;import java.io.IOException;import java.io.OutputStream;import java.sql.ResultSet;import java.sql.SQLException;import java.util.*;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.swing.JOptionPane;import org.apache.poi.hssf.usermodel.HSSFCell;import org.apache.poi.hssf.usermodel.HSSFFooter;import org.apache.poi.hssf.usermodel.HSSFHeader;import org.apache.poi.hssf.usermodel.HSSFRow;import org.apache.poi.hssf.usermodel.HSSFSheet;import org.apache.poi.hssf.usermodel.HSSFWorkbook;/** * * @ClassName: ExcelService* @Description:Excel* */public class ExcelService {// 创建工作本public static HSSFWorkbook demoWorkBook = new HSSFWorkbook();// 创建表public static HSSFSheet demoSheet = demoWorkBook.createSheet();// 表头的单元格个数目//public static final short cellNumber = (short) tableHeader.length;// 数据库表的列数public static final int columNumber = 2;/** * 创建表头  * @return */@SuppressWarnings("deprecation")public static void createTableHeader(String str, String[] tableHeader) {// 设置表头，从sheet中得到HSSFHeader header = demoSheet.getHeader();header.setCenter(str);// 创建一行HSSFRow headerRow = demoSheet.createRow((short) 0);for (int i = 0; i < tableHeader.length; i++) {// 创建一个单元格HSSFCell headerCell = headerRow.createCell((short) i);// headerCell.setEncoding(HSSFCell.ENCODING_UTF_16);// CellStyle cs = new CellStyle();// 设置cell的值headerCell.setCellValue(tableHeader[i]);}}/** * 创建行  * @param cells * @param rowIndex */@SuppressWarnings("deprecation")public static void createTableRow(List<String> cells, short rowIndex) {// 创建第rowIndex行HSSFRow row = demoSheet.createRow((short) rowIndex);if (cells!= null && cells.size() >0){for (short i = 0; i < cells.size(); i++) {// 创建第i个单元格HSSFCell cell = row.createCell((short) i);// cell.setEncoding(HSSFCell.ENCODING_UTF_16);cell.setCellValue(cells.get(i));}} else {// 创建第i个单元格HSSFCell cell = row.createCell((short) 1);// cell.setEncoding(HSSFCell.ENCODING_UTF_16);cell.setCellValue("-----------------------");}}public static 
void main(String[] args) {String fileName = "D:\\excel\\11206.xls";FileOutputStream fos = null;try {ExcelService pd = new ExcelService();ExcelService.createExcelSheeet();fos = new FileOutputStream(fileName);pd.exportExcel(demoSheet, fos);JOptionPane.showMessageDialog(null, "表格已成功导出到 : " + fileName);} catch (Exception e) {JOptionPane.showMessageDialog(null, "表格导出出错，错误信息 ：" + e + "\n错误原因可能是表格已经打开。");e.printStackTrace();} finally {try {fos.close();} catch (Exception e) {e.printStackTrace();}}}/** *创建整个Excel表  * @throws SQLException */public static void createExcelSheeet() throws Exception {//createTableHeader(); // --->创建一个表头行/*while (rs.next()) {String isme = null;List<String> list = new ArrayList<String>();//int falg = 0;for (int i = 1; i <= columNumber; i++) {if (i==3){isme = rs.getString(i);} else if (i==4){String result = Tea.decrypt(rs.getString(i), "wLSKF~$^)456Sdk");try {JSONObject body = new JSONObject(result);result = "离线消息："+body.optString("D3");} catch (Exception e) {}if (isme.equals("1")) {result = "访客：" + result.replaceAll("\r|\n", "");} else {result = "客服：" + result.replaceAll("\r|\n", "");}//falg = isValidStr(result);list.add(result);} else if (i==5){long time = Long.valueOf(rs.getString(i));list.add(TimeUtil.getFormatMMSecondString(4,time));} else {list.add(rs.getString(i));}}createTableRow(list, (short) rowIndex);rowIndex++;}*/}/** * 导出表格  *  * @param sheet * @param os * @throws IOException */public void exportExcel(HSSFSheet sheet, OutputStream os) throws IOException {sheet.setGridsPrinted(true);HSSFFooter footer = sheet.getFooter();footer.setRight("Page " + HSSFFooter.page() + " of " + HSSFFooter.numPages());demoWorkBook.write(os);}}

/**
 * Small text helpers.
 */
public class TextUtil {

    /**
     * Tells whether a string is missing or blank.
     *
     * @param str the string to inspect; may be {@code null}
     * @return {@code true} when {@code str} is {@code null} or has length zero
     *         after {@link String#trim()}; {@code false} otherwise
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().isEmpty();
    }
}

import info.monitorenter.cpdetector.io.CodepageDetectorProxy;import info.monitorenter.cpdetector.io.JChardetFacade;import info.monitorenter.cpdetector.io.ParsingDetector;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.URL;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Set;public class WebEncoding {private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();static {detector.add(new ParsingDetector(false));detector.add(JChardetFacade.getInstance());}/** * 测试用例 *  * @param args */public static void main(String[] args) {try {System.out.println(getCharset(""));} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}/** * @param strurl *            页面url地址,需要以 http://开始，例：http://www.pujia.com * @return * @throws IOException */public static String getCharset(String strurl) throws IOException {// 定义URL对象URL url = new URL(strurl);// 获取http连接对象HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();;urlConnection.connect();// 网页编码String strencoding = null;/** * 首先根据header信息，判断页面编码 */// map存放的是header信息(url页面的头信息)Map<String, List<String>> map = urlConnection.getHeaderFields();Set<String> keys = map.keySet();Iterator<String> iterator = keys.iterator();// 遍历,查找字符编码String key = null;String tmp = null;while (iterator.hasNext()) {key = iterator.next();tmp = map.get(key).toString().toLowerCase();// 获取content-type charsetif (key != null && key.equals("Content-Type")) {int m = tmp.indexOf("charset=");if (m != -1) {strencoding = tmp.substring(m + 8).replace("]", "");return strencoding;}}}/** * 通过解析meta得到网页编码 */// 获取网页源码(英文字符和数字不会乱码，所以可以得到正确<meta/>区域)StringBuffer sb = new StringBuffer();String line;try {BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));while ((line = in.readLine()) != null) {sb.append(line);}in.close();} catch (Exception e) { // Report any 
errors that ariseSystem.err.println(e);System.err.println("Usage:   java   HttpClient   <URL>   [<filename>]");}String htmlcode = sb.toString();// 解析html源码，取出<meta />区域，并取出charsetString strbegin = "<meta";String strend = ">";String strtmp;int begin = htmlcode.indexOf(strbegin);int end = -1;int inttmp;while (begin > -1) {end = htmlcode.substring(begin).indexOf(strend);if (begin > -1 && end > -1) {strtmp = htmlcode.substring(begin, begin + end).toLowerCase();inttmp = strtmp.indexOf("charset");if (inttmp > -1) {strencoding = strtmp.substring(inttmp + 7, end).replace("=", "").replace("/", "").replace("\"", "").replace("\'", "").replace(" ", "");return strencoding;}}htmlcode = htmlcode.substring(begin);begin = htmlcode.indexOf(strbegin);}/** * 分析字节得到网页编码 */strencoding = getFileEncoding(url);// 设置默认网页字符编码if (strencoding == null) {strencoding = "GBK";}return strencoding;}/** *  * <br> * 方法说明：通过网页内容识别网页编码 *  * <br> * 输入参数：strUrl 网页链接; timeout 超时设置 *  * <br> * 返回类型：网页编码 */public static String getFileEncoding(URL url) {java.nio.charset.Charset charset = null;try {charset = detector.detectCodepage(url);} catch (Exception e) {System.out.println(e.getClass() + "分析" + "编码失败");}if (charset != null)return charset.name();return null;}}

阅读全文

0 0