Java 将Word2003(doc)/Word2007(docx)转Html格式文件
来源:互联网 发布:网络电视apk源码 编辑:程序博客网 时间:2024/06/13 23:24
作为一个有素质的博主,首先附上所需 jar 包的下载地址: http://download.csdn.net/download/u010782875/10041502
代码实现:
import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.PicturesManager;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.usermodel.PictureType;import org.apache.poi.xwpf.converter.core.BasicURIResolver;import org.apache.poi.xwpf.converter.core.FileImageExtractor;import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.w3c.dom.Document;import javax.xml.parsers.DocumentBuilderFactory;import java.io.*;import java.nio.file.Path;import java.nio.file.Paths;/** * @ClassName:WordToString * @Description: * @author: * @data:2017/10/24 */public class WordToString { public static void main(String[] args) throws Throwable { //final String path = "D:\\Test\\xxx.doc"; final String filePath = "D:\\Test\\xxx.docx"; readWordToString(filePath); } public static String readWordToString(String filePath) throws Exception{ String str = ""; if (FileNameUtil.isWord2003(filePath)) {// docToHtml(filePath, "D:\\Test\\Word2003(doc).html"); str = docToHtml(filePath, "D:\\Test\\Word2003(doc).html");// System.out.print(doc);// FileNameUtil.StringToFile(doc, "D:\\Test\\xxx.txt");// FileNameUtil.txtToHtml("D:\\Test\\xxx.txt", "D:\\Test\\xxx.html"); } if (FileNameUtil.isWord2007(filePath)) { str = docxToHtml(filePath, "D:\\Test\\Word2007(docx).html");// System.out.print(docx);// FileNameUtil.StringToFile(docx, "D:\\Test\\xxx.txt");// FileNameUtil.txtToHtml("D:\\Test\\xxx.txt", "D:\\Test\\xxx.html"); } return str; } /* * doc转换为html * docFilename:源word文件路径 * htmlFilename:生成的html文件路径 */ public static String docToHtml(String docFilename, String targetFileName) throws Exception { final Path imagePath = Paths.get(targetFileName).getParent().resolve("image"); HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(docFilename)); Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); WordToHtmlConverter wordToHtmlConverter = new 
WordToHtmlConverter(document); // 保存图片,并返回图片的相对路径 wordToHtmlConverter.setPicturesManager(new PicturesManager() { @Override public String savePicture(byte[] content, PictureType pictureType, String name, float width, float height) { try (FileOutputStream out = new FileOutputStream(imagePath.resolve(name).toString())) { out.write(content); } catch (Exception e) { e.printStackTrace(); } return "../tmp/image/" + name; } }); wordToHtmlConverter.processDocument(wordDocument); try { String str = ""; FileInputStream in = new FileInputStream(targetFileName); // size 为字串的长度 ,这里一次性读完 int size = in.available(); byte[] buffer = new byte[size]; in.read(buffer); in.close(); str = new String(buffer, "UTF-8"); return str; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } } /* * docx转换为html * sourceFilePath:源word文件路径 * targetFileName:生成的html文件路径 */ public static String docxToHtml(String sourceFilePath, String targetFileName) throws Exception { String imagePathStr = Paths.get(targetFileName).getParent().resolve("../tmp/image/word/media").toString(); OutputStreamWriter outputStreamWriter = null; try { XWPFDocument document = new XWPFDocument(new FileInputStream(sourceFilePath)); XHTMLOptions options = XHTMLOptions.create(); // 存放图片的文件夹 options.setExtractor(new FileImageExtractor(new File(imagePathStr))); // html中图片的路径 options.URIResolver(new BasicURIResolver("../tmp/image/word/media")); String str = ""; FileInputStream in = new FileInputStream(targetFileName); // size 为字串的长度 ,这里一次性读完 int size = in.available(); byte[] buffer = new byte[size]; in.read(buffer); in.close(); str = new String(buffer, "UTF-8"); return str; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } }}这里将Word转为html格式的字符串返回给前台,可以通过我下面提供的方法使其转换成txt和html文件
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

/**
 * File-extension checks plus small helpers for saving text as .txt / .html files.
 * Created 2017/10/24.
 */
public class FileNameUtil {

    // Compiled once; the patterns are the same ones the checks have always used:
    // at least one character, a dot, then the extension matched case-insensitively.
    private static final Pattern WORD_2003 = Pattern.compile("^.+\\.(?i)(doc)$");
    private static final Pattern WORD_2007 = Pattern.compile("^.+\\.(?i)(docx)$");
    private static final Pattern EXCEL_2003 = Pattern.compile("^.+\\.(?i)(xls)$");
    private static final Pattern EXCEL_2007 = Pattern.compile("^.+\\.(?i)(xlsx)$");
    private static final Pattern PDF = Pattern.compile("^.+\\.(?i)(pdf)$");

    /** Returns {@code true} if {@code filePath} ends with ".doc" (any case). */
    public static boolean isWord2003(String filePath) {
        return matches(WORD_2003, filePath);
    }

    /** Returns {@code true} if {@code filePath} ends with ".docx" (any case). */
    public static boolean isWord2007(String filePath) {
        return matches(WORD_2007, filePath);
    }

    /** Returns {@code true} if {@code filePath} ends with ".xls" (any case). */
    public static boolean isExcel2003(String filePath) {
        return matches(EXCEL_2003, filePath);
    }

    /** Returns {@code true} if {@code filePath} ends with ".xlsx" (any case). */
    public static boolean isExcel2007(String filePath) {
        return matches(EXCEL_2007, filePath);
    }

    /** Returns {@code true} if {@code filePath} ends with ".pdf" (any case). */
    public static boolean isPDF(String filePath) {
        return matches(PDF, filePath);
    }

    /** Null-safe full match of {@code filePath} against {@code pattern}. */
    private static boolean matches(Pattern pattern, String filePath) {
        return filePath != null && pattern.matcher(filePath).matches();
    }

    /**
     * Saves a string to a text file, encoded as UTF-8 (the encoding
     * {@link #txtToHtml(String, String)} reads).
     *
     * <p>Best effort: an I/O failure is reported on standard error and not rethrown.
     *
     * @param str      text to write
     * @param filename path of the file to create or overwrite
     */
    public static void StringToFile(String str, String filename) {
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8)) {
            writer.write(str);
        } catch (IOException e) {
            // Report the failure instead of dropping it silently.
            System.err.println("Failed to write file: " + filename);
            e.printStackTrace();
        }
    }

    /**
     * Copies a UTF-8 text file to an HTML file, appending a line break tag to every line.
     *
     * <p>Prints a message and writes nothing when {@code filePath} is not an existing
     * regular file. Any failure while copying is printed and not rethrown.
     *
     * @param filePath     path of the source text file
     * @param htmlPosition path of the HTML file to create or overwrite
     */
    public static void txtToHtml(String filePath, String htmlPosition) {
        try {
            File file = new File(filePath);
            // isFile() is false for a missing path, so no separate exists() check is needed.
            if (!file.isFile()) {
                System.out.println("找不到指定的文件");
                return;
            }
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
                 BufferedWriter writer = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(htmlPosition), StandardCharsets.UTF_8))) {
                String lineTxt;
                while ((lineTxt = reader.readLine()) != null) {
                    writer.write(lineTxt);
                    // "</br>" is not a valid tag; emit a proper line break element.
                    writer.write("<br/>");
                }
            }
        } catch (Exception e) {
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
    }
}
需要说明的一点:我导入 POI 的 jar 包后,转换 docx 文件时会报错,最后通过额外导入 ooxml-schemas-1.1.jar 得以解决。
阅读全文
0 0
- Java 将Word2003(doc)/Word2007(docx)转Html格式文件
- python 在linux中把doc转换为docx格式文件(支持word97和word2003)
- 使用POI将office(doc/docx/ppt/pptx/xls/xlsx)文件转html格式(附带源码)
- docx4j -- 使用Java处理word2007(.docx)文档
- poi操作word模板(word2003,word2007)
- 【word | html】word(doc | docx) 转 html
- 导入(doc,docx,pdf格式)简历,doc/docx,转pdf格式,并且检索doc,docx,pdf,字段
- 导入(doc,docx,pdf格式)简历,doc/docx,转pdf格式,并且检索doc,docx,pdf,字段
- docx转doc工具(软件)
- POI实现DOC/DOCX转HTML
- Doc、Docx转成HTML
- Java 使用jacob ppt文件转pptx,doc转docx;word 转html、pdf等
- 批量将doc转为docx
- java 读取 word2003 word2007 Excle 2003 Excle2007
- Java读取操作word2003 word2007 word2010文档
- 将PDF格式文件转为DOC格式文件
- java导出doc和docx
- 怎样在Android中解析doc、docx、xls、xlsx格式文件?
- 数据结构与算法C++描述(11)---树及二叉树
- UISplitVc的基本使用
- 联想宣布裁员1000人,摩托罗拉部门被掏空
- 德国监管机构责令Facebook停止通过WhatsApp获取用户信息
- 2016年度“邵逸夫奖” 新鲜出炉:由英美六名科学家囊括
- Java 将Word2003(doc)/Word2007(docx)转Html格式文件
- K12新兵——少儿编程教育剖析、破局
- 锤子科技2016新品发布会确定,10月18日上海见
- 快播接班人!老司机的VR看片神器
- 苹果清理App Store应用,不更新就移除
- guided Filter--引导滤波算法原理及实现
- 图片压缩,输出.png(有可能还是jpg类型,只是后缀为.png而已)
- Java开发代码规范之编程规约(九)
- 国产手机老是耍猴,是因为没当“孙子”?