技术改变生活

来源：互联网　发布：软件盒子　编辑：程序博客网　时间：2024/05/17 22:49

数据部的同事，需要从网上下载 pdf 文件。

给定文件夹，其中包含若干个 txt 文件， txt 文件内容，每行均为一个链接，点击链接会直接下载 pdf 文件。

需求如下：

1、总文件夹下按照 txt 文件名称创建文件夹，

2、对应 txt 文件中的所有链接，均下载到对应 txt 名称的文件夹下。

3、将所有 pdf 文件，按照下载顺序和页数进行重命名，命名规则为， 1-7.pdf 8-15.pdf 。

现存问题：

1、pdf 下载后，需要根据页数重命名；如果文件的处理顺序与下载顺序不一致，页码范围就会被打乱，导致命名出错。

2、错误：文件名称不能包含上标、下标等特殊符号；文件名中包含这类符号时会被进行 html 转码，从而导致创建文件失败。

完整代码如下：

刚开始上传的第一版，报错后不能继续下载，现在，报错后，可以继续下载后面的url 。

错误日志，输出到 txt 目录下，生成 log.txt文件。

package cn.digitalpublishing;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.lowagie.text.pdf.PdfReader;

/**
 * Batch PDF downloader.
 *
 * <p>Given a root folder containing {@code .txt} link lists (one download URL per line), this
 * tool creates one sub-folder per list, downloads every link into the matching sub-folder and
 * can afterwards rename the PDFs of a folder to consecutive page ranges ({@code 1-7.pdf},
 * {@code 8-15.pdf}, ...).
 *
 * <p>Progress and failures are appended to {@code log.txt} inside the root folder. A failing
 * link is logged and skipped; the remaining links are still processed.
 *
 * <p>Counting pages requires the {@code com.lowagie.text.pdf.PdfReader} class (iText 2.x /
 * OpenPDF jar) on the class path.
 *
 * @author www.wswhr.com
 * @see <a href="http://blog.csdn.net/u012246342/article/details/60139990">original article</a>
 */
public class Snippet {

    /** Root folder used when no command-line argument is given. */
    private static final String DEFAULT_ROOT = "W://download//pdf//pdfRename";

    /** Name of the log file; it lives next to the link lists and must never be read as one. */
    private static final String LOG_FILE_NAME = "log.txt";

    /** Browser-like agent string; some servers answer 403 to the default Java agent. */
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    private static final int HEADER_CONNECT_TIMEOUT_MS = 30000;
    private static final int DOWNLOAD_CONNECT_TIMEOUT_MS = 3 * 1000;
    private static final int READ_TIMEOUT_MS = 30000;

    /** Matches {@code filename*=charset'lang'value} (extended form of Content-Disposition). */
    private static final Pattern DISPOSITION_EXT_NAME =
            Pattern.compile("(?i)filename\\*\\s*=\\s*[^';]*'[^';]*'([^;]*)");

    /** Matches {@code filename=value} or {@code filename="value"}. */
    private static final Pattern DISPOSITION_NAME =
            Pattern.compile("(?i)filename\\s*=\\s*(\"[^\"]*\"|[^;]*)");

    /*
     * Titles sometimes carry markup, for example
     *   ...LC-MS<i><sup>n<\sup><\i>....pdf
     * which cannot be used as a file name. Tags are dropped first, then every character that is
     * illegal in a Windows file name (including both path separators) is replaced.
     */
    private static final Pattern MARKUP_TAG = Pattern.compile("<[^<>]*>");
    private static final Pattern ILLEGAL_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /**
     * Entry point: creates the per-list folders and downloads every link.
     *
     * @param args optional; {@code args[0]} overrides the default root folder
     */
    public static void main(String[] args) {
        String root = (args != null && args.length > 0) ? args[0] : DEFAULT_ROOT;
        System.out.println("程序开始。");
        createDir(root);
        downloadFile(root);
        // Renaming is not chained automatically: call fileRename(<folder of PDFs>) on a
        // download folder once its content has been checked.
        System.out.println("程序结束");
    }

    /**
     * Appends one line to {@code log.txt} inside the given folder, creating the file if needed.
     *
     * @param logFilePath folder that holds the log file
     * @param logInfo     text to append
     */
    public static void errorLog(String logFilePath, String logInfo) {
        PrintWriter writer = null;
        try {
            // 'true' = append, so earlier entries are preserved
            writer = new PrintWriter(new FileWriter(new File(logFilePath, LOG_FILE_NAME), true));
            writer.println(logInfo);
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    /**
     * Creates, for every {@code .txt} link list in the folder, a sub-folder named after the list
     * (file name without extension). Sub-folders, the log file and other files are ignored.
     *
     * @param filePath root folder containing the link lists
     */
    public static void createDir(String filePath) {
        File root = new File(filePath);
        File[] entries = root.listFiles();
        if (entries == null) {
            // The log file would live in this very folder, so report on stderr instead.
            System.err.println("目录不存在或无法读取：" + filePath);
            return;
        }
        for (int i = 0; i < entries.length; i++) {
            if (!isLinkFile(entries[i])) {
                continue;
            }
            File target = new File(root, baseName(entries[i].getName()));
            if (target.isDirectory()) {
                errorLog(filePath, "创建目录失败，目标目录已存在！ " + target.getName());
            } else if (!target.mkdirs()) {
                errorLog(filePath, "创建目录失败： " + target.getPath());
            }
        }
    }

    /**
     * Downloads every link of every {@code .txt} link list found in the folder. Each download is
     * stored in the sub-folder named after its list and is prefixed with the line number of the
     * link, e.g. {@code 3title.pdf}.
     *
     * @param txtFilePath root folder containing the link lists
     */
    public static void downloadFile(String txtFilePath) {
        File[] entries = new File(txtFilePath).listFiles();
        if (entries == null) {
            System.err.println("目录不存在或无法读取：" + txtFilePath);
            return;
        }
        for (int i = 0; i < entries.length; i++) {
            if (isLinkFile(entries[i])) {
                downloadLinks(txtFilePath, entries[i]);
            }
        }
    }

    /**
     * Renames every PDF in the folder to its cumulative page range, in download order:
     * a 7-page file followed by an 8-page file become {@code 1-7.pdf} and {@code 8-15.pdf}.
     *
     * <p>Page counts of all files are read before anything is renamed; if one file cannot be
     * read, nothing is renamed, because every later range would be wrong.
     *
     * <p>Download order is taken from the last-modified time, then from the numeric name prefix
     * written by {@link #downloadFile(String)}. This assumes the files have not been modified
     * since they were downloaded.
     *
     * @param renameFilePath folder containing the PDFs
     */
    public static void fileRename(String renameFilePath) {
        File dir = new File(renameFilePath);
        File[] entries = dir.listFiles();
        if (entries == null) {
            System.err.println("目录不存在或无法读取：" + renameFilePath);
            return;
        }

        List<File> pdfs = new ArrayList<File>();
        for (int i = 0; i < entries.length; i++) {
            if (entries[i].isFile() && hasExtension(entries[i].getName(), ".pdf")) {
                pdfs.add(entries[i]);
            }
        }
        // File.list()/listFiles() make no ordering promise, so the order is fixed explicitly.
        Collections.sort(pdfs, new DownloadOrderComparator());

        // Phase 1: read every page count up front (all-or-nothing).
        int[] pageCounts = new int[pdfs.size()];
        for (int i = 0; i < pageCounts.length; i++) {
            File pdf = pdfs.get(i);
            try {
                pageCounts[i] = countPages(pdf);
            } catch (Exception e) {
                // Boundary catch: the PDF parser may fail in many ways on damaged input.
                errorLog(renameFilePath, "读取 pdf 页数失败，已取消重命名： " + pdf.getName());
                e.printStackTrace();
                return;
            }
        }

        // Phase 2: rename in place.
        int nextPage = 1;
        for (int i = 0; i < pageCounts.length; i++) {
            File pdf = pdfs.get(i);
            int firstPage = nextPage;
            int lastPage = nextPage + pageCounts[i] - 1;
            nextPage = lastPage + 1;

            // Same folder as the source; a bare name would resolve against the working directory.
            File target = new File(dir, firstPage + "-" + lastPage + ".pdf");
            if (pdf.equals(target)) {
                continue;
            }
            if (target.exists() || !pdf.renameTo(target)) {
                errorLog(renameFilePath, "重命名失败： " + pdf.getName() + " -> " + target.getName());
            }
        }
    }

    /**
     * Downloads one URL into {@code savePath}. Failures are written to the log (list name, link
     * and line number) and are not propagated, so that a batch can continue with the next link.
     *
     * @param homePath    folder that holds {@code log.txt}
     * @param urlFileName name of the link list the URL came from (used in log messages only)
     * @param errorNum    line number of the URL inside the link list (used in log messages only)
     * @param urlStr      URL to download
     * @param fileName    name of the file to create; characters that are not valid in a file
     *                    name are removed or replaced
     * @param savePath    destination folder; created when missing
     * @throws IOException declared for compatibility; failures are logged instead
     */
    public static void downLoadFromUrl(String homePath, String urlFileName, int errorNum, String urlStr,
            String fileName, String savePath) throws IOException {
        HttpURLConnection conn = null;
        InputStream in = null;
        OutputStream out = null;
        File partial = null;
        try {
            conn = openConnection(urlStr, DOWNLOAD_CONNECT_TIMEOUT_MS);
            in = conn.getInputStream();

            File saveDir = new File(savePath);
            if (!saveDir.isDirectory() && !saveDir.mkdirs()) {
                throw new IOException("Cannot create directory " + saveDir);
            }
            File target = new File(saveDir, sanitizeFileName(fileName));
            out = new FileOutputStream(target);
            partial = target;

            // Stream straight to disk instead of buffering the whole PDF in memory.
            copy(in, out);
            out.close(); // surface flush/close errors before reporting success
            out = null;
            partial = null;
            errorLog(homePath, "信息 : " + target.getName() + " 下载完成！");
        } catch (Exception e) {
            // Boundary catch: one bad link must never stop the batch.
            logLinkFailure(homePath, urlFileName, urlStr, errorNum);
            e.printStackTrace();
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            if (conn != null) {
                conn.disconnect();
            }
            if (partial != null) {
                partial.delete(); // do not leave a truncated PDF behind
            }
        }
    }

    /**
     * Reads a stream to its end and returns the bytes. The stream is not closed.
     *
     * @param inputStream stream to read; {@code null} yields {@code null}
     * @return all bytes of the stream, or {@code null} when {@code inputStream} is {@code null}
     * @throws IOException if reading fails
     */
    public static byte[] readInputStream(InputStream inputStream) throws IOException {
        if (inputStream == null) {
            return null;
        }
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        copy(inputStream, bos);
        return bos.toByteArray();
    }

    /** Downloads every non-blank line of one link list; a failing line is logged and skipped. */
    private static void downloadLinks(String homePath, File linkFile) {
        String listName = linkFile.getName();
        String savePath = new File(homePath, baseName(listName)).getPath();
        errorLog(homePath, "******************开始下载：" + listName + "文件*******************");

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(linkFile));
            String line;
            int lineNum = 0;
            while ((line = reader.readLine()) != null) {
                lineNum++;
                String link = line.trim();
                if (link.length() == 0) {
                    continue;
                }
                try {
                    String name = resolveFileName(link);
                    downLoadFromUrl(homePath, listName, lineNum, link, lineNum + name, savePath);
                } catch (IOException e) {
                    logLinkFailure(homePath, listName, link, lineNum);
                    e.printStackTrace();
                }
            }
            errorLog(homePath, "××××××××××××××××××××" + listName + "文件 下载完成×××××××××××××××××××");
        } catch (IOException e) {
            errorLog(homePath, "读取文件失败： " + listName);
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /** Asks the server for the download's file name, falling back to the last URL path segment. */
    private static String resolveFileName(String urlStr) throws IOException {
        HttpURLConnection conn = openConnection(urlStr, HEADER_CONNECT_TIMEOUT_MS);
        try {
            String name = fileNameFromDisposition(conn.getHeaderField("Content-Disposition"));
            if (name != null) {
                return name;
            }
        } finally {
            conn.disconnect();
        }
        String path = new URL(urlStr).getPath();
        String lastSegment = decodeUtf8(path.substring(path.lastIndexOf('/') + 1));
        if (lastSegment.length() == 0) {
            lastSegment = "download";
        }
        return hasExtension(lastSegment, ".pdf") ? lastSegment : lastSegment + ".pdf";
    }

    /** Extracts and URL-decodes the file name of a Content-Disposition value; null if absent. */
    private static String fileNameFromDisposition(String disposition) {
        if (disposition == null) {
            return null;
        }
        String raw = null;
        Matcher extended = DISPOSITION_EXT_NAME.matcher(disposition);
        if (extended.find()) {
            raw = extended.group(1);
        } else {
            Matcher plain = DISPOSITION_NAME.matcher(disposition);
            if (plain.find()) {
                raw = plain.group(1);
            }
        }
        if (raw == null) {
            return null;
        }
        raw = raw.trim();
        if (raw.length() >= 2 && raw.startsWith("\"") && raw.endsWith("\"")) {
            raw = raw.substring(1, raw.length() - 1);
        }
        return raw.length() == 0 ? null : decodeUtf8(raw);
    }

    /** URL-decodes as UTF-8; returns the input unchanged when it is not validly encoded. */
    private static String decodeUtf8(String value) {
        try {
            return URLDecoder.decode(value, "utf-8");
        } catch (UnsupportedEncodingException e) {
            return value;
        } catch (IllegalArgumentException e) {
            return value; // malformed %-escape
        }
    }

    /** Turns an arbitrary title into a name that is safe to create inside a single folder. */
    private static String sanitizeFileName(String name) {
        if (name == null) {
            return "unnamed";
        }
        String cleaned = MARKUP_TAG.matcher(name).replaceAll("");
        cleaned = ILLEGAL_NAME_CHARS.matcher(cleaned).replaceAll("_").trim();
        return cleaned.length() == 0 ? "unnamed" : cleaned;
    }

    /** Opens an HTTP(S) connection with the shared agent string and timeouts. */
    private static HttpURLConnection openConnection(String urlStr, int connectTimeoutMs) throws IOException {
        URLConnection raw = new URL(urlStr).openConnection();
        if (!(raw instanceof HttpURLConnection)) {
            throw new IOException("Not an HTTP(S) URL: " + urlStr);
        }
        HttpURLConnection conn = (HttpURLConnection) raw;
        conn.setRequestProperty("User-Agent", USER_AGENT);
        conn.setConnectTimeout(connectTimeoutMs);
        conn.setReadTimeout(READ_TIMEOUT_MS); // without it a stalled server blocks forever
        return conn;
    }

    /** Returns the page count of a PDF and releases the reader. */
    private static int countPages(File pdf) throws IOException {
        PdfReader reader = new PdfReader(pdf.getPath());
        try {
            return reader.getNumberOfPages();
        } finally {
            reader.close();
        }
    }

    /** Writes the standard five-line failure record for one link. */
    private static void logLinkFailure(String homePath, String listName, String link, int lineNum) {
        errorLog(homePath, "~~~~~~~~~~");
        errorLog(homePath, "出错文件是：" + listName);
        errorLog(homePath, "出错链接是：" + link);
        errorLog(homePath, "该文件出错行数是：" + lineNum);
        errorLog(homePath, "----------");
    }

    /** True for regular {@code .txt} files other than the log file. */
    private static boolean isLinkFile(File file) {
        String name = file.getName();
        return file.isFile() && hasExtension(name, ".txt") && !name.equalsIgnoreCase(LOG_FILE_NAME);
    }

    /** Case-insensitive extension test that also requires a non-empty base name. */
    private static boolean hasExtension(String name, String extension) {
        int offset = name.length() - extension.length();
        return offset > 0 && name.regionMatches(true, offset, extension, 0, extension.length());
    }

    /** File name without its last extension. */
    private static String baseName(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /** Copies everything from {@code in} to {@code out}; closes neither. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buffer = new byte[8192];
        int read;
        while ((read = in.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
    }

    /** Closes a resource, ignoring failures; used only on cleanup paths. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // nothing useful can be done while cleaning up
        }
    }

    /** Value of the leading ASCII digits of a name, or {@code Long.MAX_VALUE} if there are none. */
    private static long leadingNumber(String name) {
        long value = 0;
        int i = 0;
        while (i < name.length() && i < 18 && name.charAt(i) >= '0' && name.charAt(i) <= '9') {
            value = value * 10 + (name.charAt(i) - '0');
            i++;
        }
        return i == 0 ? Long.MAX_VALUE : value;
    }

    /** Orders files by last-modified time, then numeric name prefix, then name. */
    private static final class DownloadOrderComparator implements Comparator<File> {
        public int compare(File a, File b) {
            long timeA = a.lastModified();
            long timeB = b.lastModified();
            if (timeA != timeB) {
                return timeA < timeB ? -1 : 1;
            }
            long numberA = leadingNumber(a.getName());
            long numberB = leadingNumber(b.getName());
            if (numberA != numberB) {
                return numberA < numberB ? -1 : 1;
            }
            return a.getName().compareTo(b.getName());
        }
    }
}

0 0