使用 jsoup 抓取 URL 并下载 Excel 文件
来源:互联网 发布:社交软件的英语 编辑:程序博客网 时间:2024/05/21 22:31
package com.rquest.webSpider;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.io.FileUtils;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.params.HttpProtocolParams;import org.apache.poi.util.IOUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.select.Elements;/** * Hello world! * */public class App { public static void main( String[] args ){ List<String> list = getFirstURLs(); List<String> downLoadURLs = getdownLoadURL(list); String dir = downLoadDir(downLoadURLs); try { System.out.println(downLoadURLs.get(0));downloadFile(downLoadURLs.get(0),dir);} catch (Exception e) {e.printStackTrace();}} //根据URL生成文件下载目录public static String downLoadDir(List<String> downLoadURLs) {SimpleDateFormat sim = new SimpleDateFormat("YYYY-MM-dd");Date curDate = new Date();String dirPri = "D:\\WebSpider\\"+sim.format(curDate)+"\\指数样本数据库"; String dirSuf = ".xls"; String dir = dirPri + downLoadURLs.get(0).substring(59, 67) + dirSuf; System.out.println(dir);return dir;}//获取真正的文件下载URLpublic static List<String> getdownLoadURL(List<String> list) {String urlBase = "http://www.shclearing.com/cpgz/zqjqysp/zqzs/ybqmd/"; String urlSecend = null; List<String> list2 = new ArrayList<String>(); String secendURLReg = "[A-Z]\\d{21}\\.xls"; String secendURL = null ; String sub = null ; for (String urlSuf : list) {urlSecend = urlBase + urlSuf;Document doc;try {doc = Jsoup.connect(urlSecend).get();Elements scripts = doc.getElementsByClass("attachments"); String secendURLSuf = RegexString(scripts.get(0).toString(), secendURLReg); //截取secendURLSuf sub = secendURLSuf.substring(2, 8) + "/"; 
secendURL = urlBase + sub + secendURLSuf;list2.add(secendURL);System.out.println(secendURL);} catch (IOException e) {e.printStackTrace();}} System.out.println(list2.size()); return list2;}//根据下载列表页面得到第一层URLpublic static List<String> getFirstURLs() {// 定义即将访问的链接 String urlPre = "http://www.shclearing.com/cpgz/zqjqysp/zqzs/ybqmd/index"; String urlSuf = ".html"; String url = null; List<String> list = new ArrayList<String>(); String firstURLReg = "\\d{6}\\/t\\d{8}\\w\\d{6}\\.html"; for (int i = 0; i <= 0; i++) { if (i==0) { url = urlPre + urlSuf ;}else{url = urlPre + "_" + i + urlSuf ;} try { Document doc = Jsoup.connect(url).get(); Elements links = doc.getElementsByClass("list").select("a[href]"); for(org.jsoup.nodes.Element e : links){ String firstURL = RegexString(e.toString(), firstURLReg); list.add(firstURL); System.out.println(firstURL); } } catch (IOException e) { e.printStackTrace(); } }return list;} //正则表达式匹配需要的字符串 static String RegexString(String targetStr, String patternStr) { // 定义一个样式模板,此中使用正则表达式,括号中是要抓的内容 // 相当于埋好了陷阱匹配的地方就会掉下去 Pattern pattern = Pattern.compile(patternStr); // 定义一个matcher用来做匹配 Matcher matcher = pattern.matcher(targetStr); // 如果找到了 if (matcher.find()) { // 打印出结果 return matcher.group(); } return "Nothing"; } //通过指定的文件下载URL以及下载目录下载文件public static void downloadFile(String url,String dir)throws Exception{DefaultHttpClient httpClient=new DefaultHttpClient();HttpProtocolParams.setUserAgent(httpClient.getParams(),"Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.1.9)Gecko/20100315Firefox/3.5.9");HttpGet httpGet=new HttpGet();httpGet.setURI(new java.net.URI(url));InputStream input = null;FileOutputStream output = null;try{HttpResponse response=httpClient.execute(httpGet);HttpEntity entity=response.getEntity();input=entity.getContent();File file=new File(dir);output= FileUtils.openOutputStream(file);IOUtils.copy(input,output);System.out.println("成功下载至:"+ dir);}catch(Exception 
e){e.printStackTrace();}finally{IOUtils.closeQuietly(output);IOUtils.closeQuietly(input);}}}
项目依赖如下:
<!--
  Maven build descriptor for com.rquest:webSpider:0.0.1-SNAPSHOT (jar packaging, UTF-8 sources).
  Declared dependencies: junit 3.8.1 (test scope), Apache POI 3.9, jsoup 1.7.2,
  Apache HttpClient 4.3.4, and commons-io 1.3.2 (declared under groupId org.apache.commons).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.rquest</groupId> <artifactId>webSpider</artifactId> <version>0.0.1-SNAPSHOT</version> <packaging>jar</packaging> <name>webSpider</name> <url>http://maven.apache.org</url> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> </properties> <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>3.8.1</version> <scope>test</scope> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi --><dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.9</version></dependency> <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --><dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.7.2</version></dependency> <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient --><dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.3.4</version></dependency><!-- https://mvnrepository.com/artifact/org.apache.commons/commons-io --><dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-io</artifactId> <version>1.3.2</version></dependency> </dependencies></project>
0 0
- jsoup抓取url下载excle文件
- 利用 jsoup 下载保存文件
- java中用jsoup抓取网页源码,并批量下载图片
- struts 文件下载——导出Excle/导出PDF
- maven+springmvc下载excle文件——ie8可用
- maven+springmvc下载excle文件——ie8可用(二)
- Jsoup抓取页面内容
- JSOUP抓取网页内容
- Jsoup网页抓取技术
- jsoup抓取豆瓣美女
- Jsoup数据抓取
- 网页抓取jsoup
- 使用Jsoup抓取数据
- jsoup数据抓取学习
- Jsoup抓取页面
- Jsoup抓取数据
- Jsoup抓取数据
- Jsoup抓取唐诗三百首
- 007
- 垂直居中
- JavaScript中数组slice和splice的对比小结
- iOS 判断当前APP是否开启定位服务,未开通点击确定进入设置里进行设置
- Android热修复 Tinker
- jsoup抓取url下载excle文件
- 《JAVA与模式》之解释器模式
- jenkins使用publishover ssh插件连接应用机器时,报Message [Auth fail]的问题
- POJ 3273 Monthly Expense——二分
- Huffman coding —— 优先队列
- 遍历集合的3种方法
- VIM编辑器配置文件修改
- Android全屏设置及取消全屏设置
- TCP通信+域名解析实现: