Csdn博客下载器Java版-Snapshot版（开发中）

来源：互联网发布：cnc软件下载编辑：程序博客网时间：2024/06/01 12:53

版本的界面如下：

【A】还要额外开启一个tomcat，把html放到上面，不然pd4ml无法解析http协议以外的文件，pdf会为空。如果谁有好的方法可以通知我

【B】最终的执行效果

【C】生成的分散文件

查看pdf是否写入：乱码还没有解决呢

【D】

1 下载器通过输入博客人的名字实现自动下载--支持多个目录的生成

现在有几个功能点因为其他原因没有完成，先写个草稿版本，供以后完成

本来要使用多线程map-reduce技术快速生成pdf，实际开发中发现受制于网速带宽，所以不再使用此技术

2 缺少的功能点

a：需要额外开一个项目用tomcat部署，为了pd4ml可以读取http协议，这个有待改进

b：生成的临时pdf为中文乱码，没有实现gbk到utf-8的转换

c：没有写最后一步合并pdf

所以先写个临时版本吧

3 目录结构

4 主要代码流程：

[1] ui界面设计

package com.blog.csdn.ui;import java.awt.BorderLayout;import java.awt.Container;import java.awt.Dimension;import java.awt.FlowLayout;import java.awt.Toolkit;import java.awt.event.ActionEvent;import java.awt.event.ActionListener;import javax.swing.JButton;import javax.swing.JFrame;import javax.swing.JLabel;import javax.swing.JOptionPane;import javax.swing.JPanel;import javax.swing.JScrollPane;import javax.swing.JTextArea;import javax.swing.JTextField;import com.blog.csdn.common.Message;import com.blog.csdn.download.HtmlBuilder;import com.blog.csdn.download.HtmlParser;import com.blog.csdn.download.ProjController;import com.blog.csdn.pdf.BuildSinglePdf;/** * 只看功能，不重视外观 * @author chaigw * */public class ConfigFrame extends JFrame {Container container;public ConfigFrame() {this.setSize(400, 300);this.setTitle("CSDN 博客下载器");Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize();Dimension frameSize = this.getSize();this.setLocation((screenSize.width - frameSize.width) / 2, (screenSize.height - frameSize.height) / 2);container = this.getContentPane();this.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);}public ConfigFrame(String blogName) {this();fillBlogName(blogName);}public void fillBlogName(String blogName){JLabel lb = new JLabel("http://blog.csdn.net/");//此处需添加一点就没的功能，以后再说final JTextField tf = new JTextField(blogName);tf.setColumns(6);tf.setText("qinhl99");tf.setText(tf.getText().toLowerCase());JButton bt = new JButton("生成PDF");JLabel thLb = new JLabel("线程");final JTextField thTf = new JTextField();thTf.setText("3");thTf.setColumns(2);FlowLayout flow = new FlowLayout();JPanel req = new JPanel();req.setLayout(flow);req.add(lb);req.add(tf);req.add(thLb);req.add(thTf);req.add(bt);bt.addActionListener(new ActionListener() {public void actionPerformed(ActionEvent e) {new Thread(new Runnable() {public void run() {System.out.println(tf.getText());if(null == tf.getText() || "".equals(tf.getText().trim())){JOptionPane.showMessageDialog(ConfigFrame.this, 
"请输入博客人姓名");return;}Message.bolgName = tf.getText().trim();if(null == thTf.getText() || "".equals(thTf.getText().trim())){JOptionPane.showMessageDialog(ConfigFrame.this, "线程数未填，默认为1");Message.threadNum = 1;thTf.setText("1");}else {try{Message.threadNum = Integer.parseInt(thTf.getText());}catch(Exception e2){JOptionPane.showMessageDialog(ConfigFrame.this, "请输入正确整形数字");return;}}HtmlBuilder.frame = ConfigFrame.this;// 此处开始写调用后台处理的代码Message.bolgName = tf.getText().trim();HtmlParser parser = new HtmlParser();parser.setFrame(ConfigFrame.this);HtmlBuilder builder = new HtmlBuilder();ProjController controller = new ProjController(parser, builder);controller.buildMenuFiles(Message.bolgName);////放在文件中建立controller.buildHtmlFiles();BuildSinglePdf pdf = new BuildSinglePdf();controller.buildSinglePdf(pdf);}}).start();}});JScrollPane scroLog = new JScrollPane();scroLog.setPreferredSize (new Dimension (320,220));area = new JTextArea(10, 30);area.setLineWrap(true);scroLog.setViewportView(area);container.add(req,BorderLayout.CENTER);container.add(scroLog,BorderLayout.SOUTH);}private JTextArea area;public JTextArea getArea() {return area;}public void setArea(JTextArea area) {this.area = area;}public static void main(String[] args) {ConfigFrame configFrame  = new ConfigFrame("");configFrame.setVisible(true);}}

【2】总的流程控制器，负责获取目录，建立目录，建立html，建立临时pdf，建立总的pdf

package com.blog.csdn.download;import java.util.List;import java.util.Map;import com.blog.csdn.common.Message;import com.blog.csdn.pdf.BuildSinglePdf;/** * 项目进程控制器 *  * @author chaigw */public class ProjController {private HtmlParser htmlParser;private HtmlBuilder htmlBuilder;public ProjController() {}public ProjController(HtmlParser htmlParser, HtmlBuilder htmlBuilder) {this.htmlParser = htmlParser;this.htmlBuilder = htmlBuilder;}public void buildMenuFiles(String blogName) {String allContent = htmlParser.parse("http://blog.csdn.net/" + blogName);htmlParser.filtMainMenuContent(allContent);for (Map.Entry<String, String> entry : Message.menuMap.entrySet()) {htmlBuilder.createFolder(Message.getPrexMenu(entry.getKey()));}}/** * 1进行解析目录下文章标题 * 2生成路径html * 3添加html */public void buildHtmlFiles() {htmlParser.parseHtmls();}public void buildSinglePdf(BuildSinglePdf singlePdf){List<String> l = Message.getMenus();for (int i = 0; i < l.size(); i++) {singlePdf.buildSinglgPdf(l.get(i));}}}

【3】程序中用到的存储所有数据和工作操作方法的类

package com.blog.csdn.common;import java.io.File;import java.io.FileFilter;import java.util.ArrayList;import java.util.LinkedHashMap;import java.util.List;import java.util.Map;public class Message {public static String bolgName = "";public static int threadNum = 1;public static String prexMenu ="";public static String prexHtml = "";/** * 下载路径被写死 */public static String downPathString = "D:/csdn_pdf";public static String prexBuildPdfUrl = "http://localhost:8080/web-csdnblog";/** * csdn base路径 */public static String baseUrl = "http://blog.csdn.net";/** * key:目录名称  * value:目录对应url */public static Map<String,String> menuMap = new LinkedHashMap<String,String>();/** * key:文章名称 * value:文章对应url */public static Map<String,String> alticalMap = new LinkedHashMap<String,String>();/** * 目录和文章名称一对多的关系 */public static Map<String, List<String>> menuAlticals = new LinkedHashMap<String,List<String>>();public static String getPrexMenu(String menuKey){//示例 M0001prexMenu = menuKey.substring(0, 5);return prexMenu;}public static String getPrexHtml(String htmlKey){//示例 M0001F0001prexHtml = htmlKey.substring(0,10);return prexHtml;}public static String buildPrexMenu(int index){String prex = "";if(index<10){prex="M000"+index;}else if(index<100){prex="M00"+index;}else if(index<1000){prex="M0"+index;}else {prex="M"+index;}return prex;}public static String buildPrexHtml(int index){String prex = "";if(index<10){prex="F000"+index;}else if(index<100){prex="F00"+index;}else if(index<1000){prex="F0"+index;}else {prex="F"+index;}return prex;}/** * 获取当前html所有文件 * @return */public static List<String> getCurrentPathHtmls(String currentPath){List<String> paths = new ArrayList<String>();File[] files= new File(currentPath).listFiles(new FileFilter() {@Overridepublic boolean accept(File pathname) {String temp = pathname.getAbsolutePath();if(".html".equals(temp.substring(temp.length()-5,temp.length()))){System.out.println(temp+"过滤路径");return true;}return false;}});for (int i = 0; i < files.length; i++) 
{if(files[i]==null){System.out.println("过滤的文件为null");}paths.add(files[i].getAbsolutePath());}return paths;}public static List<String> getMenus(){List<String> fileMenus = new ArrayList<String>();File[] menuFiles = new File(Message.downPathString+"/"+Message.bolgName).listFiles();if(menuFiles==null) return null;for (int i = 0; i < menuFiles.length; i++) {fileMenus.add(menuFiles[i].getAbsolutePath());}return fileMenus;}}

【4】文件进行从网上获取数据的类，用到了大量的正则

package com.blog.csdn.download;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.BufferedReader;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import java.net.HttpURLConnection;import java.net.URL;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;import b.b.b.c.c;import com.blog.csdn.common.Message;import com.blog.csdn.ui.ConfigFrame;import com.itextpdf.text.pdf.PdfStructTreeController.returnType;/** * 负责从网络截取html代码 本项目的代码足够冗余，实在懒得重构了，就这样写了啊 *  * @author chaigw */public class HtmlParser {public String getHtmlContent(URL url, String encode) {StringBuffer contentBuffer = new StringBuffer();int responseCode = -1;HttpURLConnection con = null;try {con = (HttpURLConnection) url.openConnection();con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");// IE代理进行下载con.setConnectTimeout(60000);con.setReadTimeout(60000);// 获得网页返回信息码responseCode = con.getResponseCode();if (responseCode == -1) {System.out.println(url.toString() + " : connection is failure...");con.disconnect();return null;}if (responseCode >= 400) // 请求失败{System.out.println("请求失败:get response code: " + responseCode);con.disconnect();return null;}InputStream inStr = con.getInputStream();InputStreamReader istreamReader = new InputStreamReader(inStr, encode);BufferedReader buffStr = new BufferedReader(istreamReader);String str = null;while ((str = buffStr.readLine()) != null)contentBuffer.append(str);inStr.close();} catch (IOException e) {e.printStackTrace();contentBuffer = null;System.out.println("error: " + url.toString());} finally {con.disconnect();}return contentBuffer.toString();}public String getHtmlContent(String url, String encode) {if 
(!url.toLowerCase().startsWith("http://")) {url = "http://" + url;}try {URL rUrl = new URL(url);return getHtmlContent(rUrl, encode);} catch (Exception e) {e.printStackTrace();return null;}}public static void main(String argsp[]) {HtmlParser parse = new HtmlParser();// parse.parse("http://blog.csdn.net/cgwcgw_/article/details/17531323");// parse.parse("http://blog.csdn.net/cgwcgw_");Message.bolgName = "cgwcgw_";Message.menuMap.put("M0001abc", "http://blog.csdn.net/cgwcgw_/article/category/1474691");parse.parseHtmls();}public String parse(String alitbaseAllUrl) {String allContent = getHtmlContent(alitbaseAllUrl, "UTF-8");// filterAlticalContent(allContent); 获取文章的// 获取目录的// filtMainMenuContent(allContent);return allContent;}public void parseHtmls() {for (Map.Entry<String, String> entry : Message.menuMap.entrySet()) {pagrationLoop(entry,"",true);currentNum = 0;//每完成一个目录就进行写htmladdHtmlsContent(entry);}}public void addHtmlsContent(Map.Entry<String, String> menuEntry){for(Map.Entry<String, String> alticalEntry : Message.alticalMap.entrySet()){//如果是同一个目录的就进行填写htmlif(alticalEntry.getKey().startsWith(Message.getPrexMenu(menuEntry.getKey()))){FileOutputStream outputStream;try {outputStream = new FileOutputStream(Message.downPathString+"/"+Message.bolgName+"/"+Message.getPrexMenu(alticalEntry.getKey())+"/"+Message.getPrexHtml(alticalEntry.getKey())+".html");BufferedOutputStream bufferStream = new BufferedOutputStream(outputStream);//根据匹配获取文章内容String content  = getHtmlsContent(alticalEntry.getValue());bufferStream.write(content.getBytes(),0,content.getBytes().length);bufferStream.flush();bufferStream.close();outputStream.close();System.out.println("写入临时文件:"+Message.getPrexHtml(alticalEntry.getKey())+".html");frame.getArea().append("写入临时文件:"+Message.getPrexHtml(alticalEntry.getKey())+".html\n");} catch (FileNotFoundException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}}}}public void writeAlticalIo(String content){}public String getHtmlsContent(String 
alticalUrl){String pageContent = parse(Message.baseUrl+"/"+alticalUrl);String content="";String regex = "<div id=\"article_content[\\s\\S]*<div id=\"bdshare";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(pageContent);if(matcher.find()){content=matcher.group();}if("".equals(content)){System.out.println("写入文章内容为空，匹配错误");}System.out.println(content);return content;}public void pagrationLoop(Map.Entry<String, String> entry, String url, boolean flg){String singleMenuAltibases = parse(flg? entry.getValue():Message.baseUrl+url);String filtersingleMenuAltibasesContent = filterSingleMenuAltibases(singleMenuAltibases);System.out.println(filtersingleMenuAltibasesContent);//暂时注解String filtersingleMenuAltibasesContents = getSingleMenuAltibases(filtersingleMenuAltibasesContent,Message.getPrexMenu(entry.getKey()));getPagration(entry,singleMenuAltibases);}public void getPagration(Map.Entry<String, String> entry,String filtersingleMenuAltibasesContent){String content = "";//String regex="<div id=\"papelist\"[\\s\\S]*?<div class=\"clear\">";String regex="<div id=\"papelist\"[\\s\\S]*?<div class=\"clear\">";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(filtersingleMenuAltibasesContent);if(matcher.find()){content = matcher.group();}if(content!=""){ regex="<a href=\"[\\s\\S]*?\">下一页"; pattern = Pattern.compile(regex); matcher = pattern.matcher(content); if (matcher.find()) { content=matcher.group(); content=content.replaceAll("\">下一页", ""); content=content.replaceAll("[\\s\\S]*<a href=\"", ""); System.out.println(content); if(!content.startsWith("/")) { System.out.println("达到了分页的最后一页"); }else { pagrationLoop(entry,content,false); System.out.println("此处有分页"); } }}else{System.out.println("没有获取分页栏--说明只有一页");}}public String getSingleMenuAltibases(String filtersingleMenuAltibasesContent, String prexMenu) {List<String> listKey = new ArrayList<String>();List<String> listValue = new ArrayList<String>();String content = "";String regex = 
"/" + Message.bolgName + "/article/details/[0-9]*";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(filtersingleMenuAltibasesContent);int i = 0, j = 0;while (matcher.find()) {String temp = null;temp = matcher.group();System.out.println(temp);if (i % 3 == 0) {listValue.add(temp);j++;}i++;}i = 0;j = 0;System.out.println(listValue.size());regex = "<a href=[\\s\\S]*?>[\\s\\S]*?</a></span>";pattern = Pattern.compile(regex);matcher = pattern.matcher(filtersingleMenuAltibasesContent);while (matcher.find()) {String temp = "";temp = matcher.group();// temp = temp.replaceAll("<a href=[\\s\\S]*?>        ", "");temp = temp.replaceAll("[\\s\\S]*</span>    <h3>        ", "");temp = temp.replaceAll("</a></span>", "");temp = temp.replaceAll("<span class[\\s\\S]*>", "").trim();listKey.add(prexMenu + Message.buildPrexHtml(j+currentNum) + temp);System.out.println(temp);j++;i++;}currentNum += listKey.size();i = 0;System.out.println(listKey.size());if (listKey.size() != listValue.size()) {System.out.println("文章路径url和文章数不匹配出错");}for (int m = 0; m < listKey.size(); m++) {Message.alticalMap.put(listKey.get(m), listValue.get(m));htmlBuilder.createHtmlFile(Message.getPrexHtml(listKey.get(m)));}System.out.println(Message.alticalMap);listKey.clear();listValue.clear();return null;}int currentNum = 0;HtmlBuilder htmlBuilder = new HtmlBuilder();public void getCurrentPageNum(){}public String filterSingleMenuAltibases(String singleMenuAltibases) {String content = "";//String regex = "<div id=\"article_list\"[\\s\\S]*<div id=\"papelist\"";String regex = "<div id=\"article_list\"[\\s\\S]*";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(singleMenuAltibases);while (matcher.find()) {content = matcher.group();System.out.println(content);}return content;}/** * 获取menu代码，附带了一些额外的冗余代码 *  * @param allContent */public void filtMainMenuContent(String allContent) {String mainContent = getMainMenuContent(allContent);saveMenuContent(mainContent);}public String 
getMainMenuContent(String allContent) {String content = "";try {String regex = "<ul class=\"panel_body\">[\\s\\S]*</ul>[\\s\\S]*panel_Archive";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(allContent);if (matcher.find()) {content = matcher.group();System.out.println("test" + content);}} catch (Exception e) {System.out.println("博客人名字不存在");if(frame!=null){frame.getArea().append("博客人名字不存在\n");}}return content;}public void saveMenuContent(String mainContent) {List<String> listKey = new ArrayList<String>();List<String> listValue = new ArrayList<String>();String content = "";String regex = "http://blog[\\s\\S]*?/[0-9]+";// 匹配keyPattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(mainContent);while (matcher.find()) {String temp = matcher.group();System.out.println(temp);listKey.add(temp);}System.out.println(listKey.size());regex = "\">+[\\s\\S]*?</a>";// 匹配valuematcher = Pattern.compile(regex).matcher(mainContent);while (matcher.find()) {String temp = matcher.group();temp = temp.replaceAll("[\\s\\S]*\">", "");temp = temp.replaceAll("</a>", "");System.out.println(temp);listValue.add(temp);}System.out.println(listValue.size());// value比key多1，所以从后面一个开始if (1 == listValue.size() - listKey.size()) {for (int i = 0; i < listKey.size(); i++) {Message.menuMap.put(Message.buildPrexMenu(i) + listValue.get(i + 1), listKey.get(i));}System.out.println(Message.menuMap);} else {System.out.println("目录匹配出现了异常");}}private ConfigFrame frame;public ConfigFrame getFrame() {return frame;}public void setFrame(ConfigFrame frame) {this.frame = frame;}public void filtAlticalContent(String allContent) {String content = getAlticalAllContent(allContent);System.out.println(content);}/** * 获取文章内容(所有内容)包含了html *  * @param allContent * @return */public String getAlticalAllContent(String allContent) {String content = "";String s = allContent;String regex = "<div id=\"article_details\"[\\w\\W]*<!-- Baidu Button END --></div>";Pattern pt = 
Pattern.compile(regex);Matcher mt = pt.matcher(s);if (mt.find()) {content = mt.group();}return content;}/** * 获取文章目录 */public String getAlticalMenu(String message) {return null;}}

【5】 html文件生成类

package com.blog.csdn.download;import java.awt.Frame;import java.io.File;import java.io.IOException;import javax.swing.RootPaneContainer;import com.blog.csdn.common.Message;import com.blog.csdn.ui.ConfigFrame;/** * 要先生成html，然后才能生成pdf，没有直接就吧html代码生成pdf的方法呢，需要中转一下 * @author chaigw * */public class HtmlBuilder {static{//跟路径initPathFile(Message.downPathString);//博客主人路径initPathFile(Message.downPathString+"/"+Message.bolgName);}/** * 生成目录 * @param menu */public void createFolder(String menu){if(null == menu || "".equals(menu.trim())){System.out.println("目录为空--有异常");return;}initPathFile(Message.downPathString+"/"+Message.bolgName+"/"+menu);}/** *  */public void createHtmlFile(String htmlFile){initPathHtml(Message.downPathString+"/"+Message.bolgName+"/"+htmlFile.substring(0,5)+"/"+htmlFile+".html");}public void initPathHtml(String path){File rootFile = new File(path);if(!rootFile.exists()){try {rootFile.createNewFile();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}System.out.println("构建文件夹路径"+path);}}public static  void initPathFile(String path){File rootFile = new File(path);if(!rootFile.exists()){rootFile.mkdirs();System.out.println("构建文件夹路径"+path);if(frame!=null){frame.getArea().append("构建目录路径"+path+"\n");}}}public static ConfigFrame frame;public static void main(String[] args) {HtmlBuilder builder = new HtmlBuilder();builder.createFolder("");}}

【6】 pdf生成临时的单独文件的类

package com.blog.csdn.pdf;import java.awt.Insets;import java.io.File;import java.io.IOException;import java.net.MalformedURLException;import java.net.URL;import java.security.InvalidParameterException;import java.util.List;import org.zefer.pd4ml.PD4Constants;import org.zefer.pd4ml.PD4ML;import org.zefer.pd4ml.tools.PD4Browser.Rule;import com.blog.csdn.common.Message;/** * 生成单个的pdf用pd4ml *  * @author chaigw */public class BuildSinglePdf {protected int topValue = 10;protected int leftValue = 20;protected int rightValue = 10;protected int bottomValue = 10;protected int userSpaceWidth = 1300;public static void main(String[] args) {}public void buildSinglgPdf(String currentPath) {try {List<String> paths=Message.getCurrentPathHtmls(currentPath);for (int i = 0; i < paths.size(); i++) {BuildSinglePdf jt = new BuildSinglePdf();// jt.doConversion("http://pd4ml.com/sample.htm", "c:/pd4ml.pdf");// jt.doConversion("file:///d:/csdn_pdf/web-csdnblog/estelle_belle/M0000/M0000F0000.html",// "c:/pd4ml.pdf");String tempPath = paths.get(i);tempPath = tempPath.replace("\\", "/");String url = Message.prexBuildPdfUrl +currentPath.replace("D:\\csdn_pdf\\", "/")+"\\"+tempPath.substring(tempPath.lastIndexOf("/")+1,tempPath.length());url = url.replace("\\","/");String output = currentPath+tempPath.substring(tempPath.lastIndexOf("/"),tempPath.length()).replace("/", "\\").replace("html", "pdf");//output = output.replace("\\", "/");jt.doConversion(url,output);}} catch (Exception e) {e.printStackTrace();}}public void doConversion(String url, String outputPath) throws InvalidParameterException, MalformedURLException, IOException {File output = new File(outputPath);java.io.FileOutputStream fos = new java.io.FileOutputStream(output);PD4ML pd4ml = new PD4ML();pd4ml.setHtmlWidth(userSpaceWidth); // set frame width of// "virtual web browser"// choose target paper format and "rotate" it to landscape orientationpd4ml.setPageSize(pd4ml.changePageOrientation(PD4Constants.A4));// define PDF page 
marginspd4ml.setPageInsetsMM(new Insets(topValue, leftValue, bottomValue, rightValue));// source HTML document also may have margins, could be suppressed this// way// (PD4ML *Pro* feature):pd4ml.addStyle("BODY {margin: 0}", true);// If built-in basic PDF fonts are not sufficient or// if you need to output non-Latin texts,// TTF embedding feature should help (PD4ML *Pro*)pd4ml.useTTF("c:/windows/fonts", true);pd4ml.render(new URL(url), fos); // actual document conversion from URL// to filefos.close();System.out.println(outputPath + "\ndone.");}}

【7】能够生成带目录的pdf的类，参见本博客的poi的专门一篇关于可以生成目录的类，这块还没有写

下面还缺少把pdf合并的代码，时间有限，先写这么多吧

0 0