jsoup抓取URL并下载Excel文件

来源：互联网　发布：社交软件的英语　编辑：程序博客网　时间：2024/05/21 22:31

package com.rquest.webSpider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.apache.poi.util.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Small crawler that walks the listing page(s) under {@link #URL_BASE}, resolves the
 * {@code .xls} attachment URL behind every listed article, and downloads the first
 * attachment found to a dated folder on the local disk.
 *
 * <p>Flow: {@link #getFirstURLs()} → {@link #getdownLoadURL(List)} →
 * {@link #downLoadDir(List)} → {@link #downloadFile(String, String)}.
 */
public class App {

    /** Common prefix of the listing pages, the article pages and the attachment URLs. */
    private static final String URL_BASE = "http://www.shclearing.com/cpgz/zqjqysp/zqzs/ybqmd/";

    /** Sentinel returned by {@link #RegexString(String, String)} when nothing matches. */
    private static final String NO_MATCH = "Nothing";

    /** Relative article link as it appears in the listing page markup. */
    private static final String FIRST_URL_REGEX = "\\d{6}\\/t\\d{8}\\w\\d{6}\\.html";

    /** Attachment file name as it appears in the article page markup. */
    private static final String ATTACHMENT_REGEX = "[A-Z]\\d{21}\\.xls";

    /** Number of listing pages to crawl (index.html, index_1.html, ...). */
    private static final int LIST_PAGE_COUNT = 1;

    /**
     * Offsets of the eight characters of the download URL that are copied into the local
     * file name. For URLs built by {@link #getdownLoadURL(List)} the attachment file name
     * starts at offset 57 (50 characters of {@link #URL_BASE} plus a 7-character
     * sub-directory), so this range is characters 2..9 of the file name — presumably a
     * yyyyMMdd stamp; confirm against the site's naming scheme.
     */
    private static final int STAMP_BEGIN = 59;
    private static final int STAMP_END = 67;

    private static final int HTTP_OK = 200;

    /**
     * Entry point: crawls the listing, resolves the attachment URLs and downloads the
     * first one. Exits quietly (with a message on stderr) when nothing was found.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> list = getFirstURLs();
        List<String> downLoadURLs = getdownLoadURL(list);
        // Without this guard an empty crawl result would fail on get(0) below.
        if (downLoadURLs.isEmpty()) {
            System.err.println("No downloadable .xls attachment was found; nothing to do.");
            return;
        }
        String dir = downLoadDir(downLoadURLs);
        try {
            System.out.println(downLoadURLs.get(0));
            downloadFile(downLoadURLs.get(0), dir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the local target path for the first download URL:
     * {@code D:\WebSpider\<today, yyyy-MM-dd>\指数样本数据库<stamp>.xls}, where {@code stamp}
     * is the substring [59, 67) of that URL. The path is also printed to stdout.
     *
     * @param downLoadURLs download URLs; only the first element is used
     * @return the absolute target file path
     * @throws IndexOutOfBoundsException if the list is empty or the first URL is shorter
     *         than 67 characters
     */
    public static String downLoadDir(List<String> downLoadURLs) {
        if (downLoadURLs.isEmpty()) {
            throw new IndexOutOfBoundsException(
                    "downLoadURLs is empty; cannot derive a target file name");
        }
        String firstUrl = downLoadURLs.get(0);
        if (firstUrl.length() < STAMP_END) {
            throw new StringIndexOutOfBoundsException(
                    "download URL is too short to contain the file stamp: " + firstUrl);
        }
        // Lower-case 'y' is the calendar year. Upper-case 'Y' is the week-based year,
        // which yields the wrong year for dates around New Year.
        SimpleDateFormat sim = new SimpleDateFormat("yyyy-MM-dd");
        String dirPri = "D:\\WebSpider\\" + sim.format(new Date()) + "\\指数样本数据库";
        String dirSuf = ".xls";
        String dir = dirPri + firstUrl.substring(STAMP_BEGIN, STAMP_END) + dirSuf;
        System.out.println(dir);
        return dir;
    }

    /**
     * Resolves the real attachment URL for every article link. Each article page is
     * fetched, the first element with class {@code attachments} is searched for a file
     * name matching {@link #ATTACHMENT_REGEX}, and the URL is assembled as
     * {@code URL_BASE + fileName[2..8) + "/" + fileName}.
     *
     * <p>Articles that cannot be fetched, have no attachments element, or contain no
     * matching file name are reported and skipped.
     *
     * @param list relative article links as returned by {@link #getFirstURLs()}
     * @return the attachment URLs that could be resolved, in input order; never null
     */
    public static List<String> getdownLoadURL(List<String> list) {
        List<String> resolved = new ArrayList<String>();
        for (String urlSuf : list) {
            String articleUrl = URL_BASE + urlSuf;
            try {
                Document doc = Jsoup.connect(articleUrl).get();
                Elements attachments = doc.getElementsByClass("attachments");
                if (attachments.isEmpty()) {
                    System.err.println("No attachments element found on " + articleUrl);
                    continue;
                }
                String fileName = RegexString(attachments.get(0).toString(), ATTACHMENT_REGEX);
                if (NO_MATCH.equals(fileName)) {
                    // The sentinel is only 7 characters long; substring(2, 8) would throw.
                    System.err.println("No .xls attachment found on " + articleUrl);
                    continue;
                }
                // Characters 2..7 of the file name form the sub-directory of the attachment.
                String subDir = fileName.substring(2, 8) + "/";
                String downloadUrl = URL_BASE + subDir + fileName;
                resolved.add(downloadUrl);
                System.out.println(downloadUrl);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println(resolved.size());
        return resolved;
    }

    /**
     * Collects the relative article links from the listing page(s). Page 0 is
     * {@code index.html}; page {@code i > 0} is {@code index_i.html}. Every anchor with an
     * {@code href} inside an element of class {@code list} is searched for a link matching
     * {@link #FIRST_URL_REGEX}; anchors without a match are ignored.
     *
     * @return the matched relative links in page order; never null
     */
    public static List<String> getFirstURLs() {
        String urlPre = URL_BASE + "index";
        String urlSuf = ".html";
        List<String> found = new ArrayList<String>();
        for (int page = 0; page < LIST_PAGE_COUNT; page++) {
            String url = (page == 0) ? urlPre + urlSuf : urlPre + "_" + page + urlSuf;
            try {
                Document doc = Jsoup.connect(url).get();
                Elements links = doc.getElementsByClass("list").select("a[href]");
                for (org.jsoup.nodes.Element link : links) {
                    String firstURL = RegexString(link.toString(), FIRST_URL_REGEX);
                    if (NO_MATCH.equals(firstURL)) {
                        // Not an article link; adding the sentinel would create a bogus request later.
                        continue;
                    }
                    found.add(firstURL);
                    System.out.println(firstURL);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return found;
    }

    /**
     * Returns the first substring of {@code targetStr} that matches {@code patternStr}.
     *
     * @param targetStr  text to search
     * @param patternStr regular expression
     * @return the first match, or the literal string {@code "Nothing"} when there is none
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        if (matcher.find()) {
            return matcher.group();
        }
        return NO_MATCH;
    }

    /**
     * Downloads {@code url} with an HTTP GET and writes the response body to {@code dir},
     * creating missing parent directories. Prints a confirmation line on success.
     *
     * <p>Failures are propagated to the caller instead of being printed and swallowed, so
     * the caller can tell whether the file was actually written. A non-200 response is
     * rejected before the target file is opened, so an error page is never saved as the
     * spreadsheet.
     *
     * @param url absolute URL of the file to download
     * @param dir full path of the target file (despite the name, not a directory)
     * @throws Exception if the URL is malformed, the request fails, the server does not
     *         answer with status 200 and a body, or the file cannot be written
     */
    public static void downloadFile(String url, String dir) throws Exception {
        DefaultHttpClient httpClient = new DefaultHttpClient();
        HttpProtocolParams.setUserAgent(httpClient.getParams(),
                "Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.1.9)Gecko/20100315Firefox/3.5.9");
        InputStream input = null;
        FileOutputStream output = null;
        try {
            HttpGet httpGet = new HttpGet();
            httpGet.setURI(new java.net.URI(url));
            HttpResponse response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            if (status != HTTP_OK) {
                throw new IOException("Unexpected HTTP status " + status + " for " + url);
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("Empty response body for " + url);
            }
            input = entity.getContent();
            output = FileUtils.openOutputStream(new File(dir));
            IOUtils.copy(input, output);
            System.out.println("成功下载至：" + dir);
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(input);
            // Release the pooled connections held by the client.
            httpClient.getConnectionManager().shutdown();
        }
    }
}

项目依赖如下：

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>com.rquest</groupId>
  <artifactId>webSpider</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>webSpider</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Unit tests -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.9</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.2</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.3.4</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-io -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-io</artifactId>
      <version>1.3.2</version>
    </dependency>
  </dependencies>
</project>

0 0