使用poi将word转为html
来源:互联网 发布:淘宝好评20字以上 编辑:程序博客网 时间:2024/05/17 09:13
使用poi将word转为html
需求:将上传的word文档转为html并返回页面填充到富文本编辑器中
尝试过的几种方案及各自的问题:
1.使用OpenOffice转换:转换后的html中图片错位;
2.使用poi将word转为html:word中的图片格式多样,其中wmf、emf等格式的图片无法在页面上显示;
3.上传文件限定为docx,更改后缀为zip,解压可得到所有图片并且格式为png,但html需使用其他方法获得再修改img标签的图片路径;
本文使用poi将word转为html,图片格式问题之后发文解决。
使用Maven导入jar包
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>xdocreport</artifactId>
    <version>1.0.6</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.3</version>
</dependency>
PoiUtil.java
import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.usermodel.PictureType;import org.apache.poi.xwpf.converter.core.BasicURIResolver;import org.apache.poi.xwpf.converter.core.FileImageExtractor;import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.w3c.dom.Document;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.transform.OutputKeys;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import java.io.*;import java.util.*;/** * Created by will on 2017/6/9. * 使用poi将word转为html文件,并从文件中读取内容 */public class PoiUtil { // 在html中图片保存的相对路径 private static String imagePath; /** * @param source word文件的File对象 * @param sourceFileName word文件名 * @param savePath 图片保存路径 * @return 转成的html字符串 */ public static String getHtml(File source, String sourceFileName, String savePath) throws Exception { imagePath = "/upload/" + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")); String imagePathStr = savePath + File.separator + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + File.separator; String content; String imgEnd = ""; // 判断word文档类型,使用不同方法进行转换 if (sourceFileName.endsWith(".doc")) { content = docToStr(source, sourceFileName, imagePathStr); } else if (sourceFileName.endsWith(".docx")) { content = docxToStr(source, sourceFileName, imagePathStr); // 转换docx文件得到的图片路径 imgEnd = "word/media/"; } else { return "文件类型错误"; } // 利用正则表达式过滤无用标签和属性 content = RegexAnswerUtil.clear(content); return content; } // doc转换为html public static String docToStr(File source, String sourceFileName, String imagePathStr) throws Exception { String targetFileName = imagePathStr + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html"; File target = new 
File(targetFileName); target.getParentFile().mkdirs(); HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(source)); Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document); // 保存图片,并返回图片的相对路径 wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> { try (FileOutputStream out = new FileOutputStream(new File(imagePathStr + name))) { out.write(content); } catch (Exception e) { e.printStackTrace(); } return imagePath +"/" + name; }); wordToHtmlConverter.processDocument(wordDocument); Document htmlDocument = wordToHtmlConverter.getDocument(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(new File(targetFileName)); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); String content = splitContext(targetFileName); // 删除生成的html文件 File file = new File(targetFileName); file.delete(); return content; } // docx转换为html public static String docxToStr(File source, String sourceFileName, String imagePathStr) throws Exception { String targetFileName = imagePathStr + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html"; File target = new File(targetFileName); target.getParentFile().mkdirs(); OutputStreamWriter outputStreamWriter = null; try { XWPFDocument document = new XWPFDocument(new FileInputStream(source)); XHTMLOptions options = XHTMLOptions.create(); // 存放图片的文件夹 options.setExtractor(new FileImageExtractor(new File(imagePathStr))); // html中图片的路径 options.URIResolver(new BasicURIResolver(imagePath)); outputStreamWriter = new OutputStreamWriter(new FileOutputStream(target), "utf-8"); 
XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance(); xhtmlConverter.convert(document, outputStreamWriter, options); } finally { if (outputStreamWriter != null) { outputStreamWriter.close(); } } String content = splitContext(targetFileName); // 删除生成的html文件 File file = new File(targetFileName); file.delete(); return content; } /** * docx文件转html会生成html编码 * 该方法能转换大部分 * 富文本编辑器中可以不做处理 */ public static String htmlEncoding(String html) { String regExp = "&#\\d*;"; Matcher m = Pattern.compile(regExp).matcher(html); StringBuffer sb = new StringBuffer(); if (!m.find()) { sb.append(html); } else { while (m.find()) { String s = m.group(0); s = s.replaceAll("(&#)|;", ""); char c = (char) Integer.parseInt(s); m.appendReplacement(sb, Character.toString(c)); } } return sb.toString(); } /** * 读取转换得到的html文件,并过滤多余空行 */ public static String splitContext(String filePath) { File file = new File(filePath); BufferedReader reader = null; try { InputStreamReader isr = new InputStreamReader(new FileInputStream(file), "UTF-8"); reader = new BufferedReader(isr); StringBuilder sb = new StringBuilder(); String tempString = null; // 一次读入一行,直到读入null为文件结束 while ((tempString = reader.readLine()) != null) { sb.append(tempString); if(!tempString.equals("")){ sb.append("\n"); } } reader.close(); String content = sb.toString().replaceAll("\\n+", "\n"); return content; } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { } } } return ""; }}
RegexAnswerUtil.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by will on 2017/6/9.
 * Removes useless tags and attributes from converted HTML.
 */
public class RegexAnswerUtil {

    /**
     * An {@code <img>} tag carrying a double-quoted {@code src} attribute; group 1 is that
     * attribute. {@code [^>]} keeps the match inside a single tag, and the whitespace required
     * before {@code src} prevents matching attributes such as {@code data-src}.
     */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*?\\s(src=\"[^\"]+\")[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Any opening tag other than {@code <img>} that has attributes; group 1 is the tag name. */
    private static final Pattern ATTRIBUTED_TAG =
            Pattern.compile("<(?!img\\b)(\\w+)\\s[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Reduces an HTML document to the content of its body with all attributes removed, except
     * that every image keeps its {@code src}.
     *
     * <ul>
     *   <li>Only the text between {@code <body ...>} and {@code </body>} is kept; when either
     *       marker is missing, the corresponding end of the input is used instead.</li>
     *   <li>{@code <img ... src="x" ...>} becomes {@code <img src="x"/>}.</li>
     *   <li>Every other tag with attributes, {@code <tag a="b">}, becomes {@code <tag>}.</li>
     * </ul>
     *
     * @param returnString the HTML string, may be {@code null}
     * @return the filtered HTML string; {@code ""} for {@code null} input
     */
    public static String clear(String returnString) {
        if (returnString == null) {
            return "";
        }
        String body = extractBody(returnString);
        // Normalise images first so their src survives; the second pass skips <img> entirely.
        String withPlainImages = IMG_TAG.matcher(body).replaceAll("<img $1/>");
        return ATTRIBUTED_TAG.matcher(withPlainImages).replaceAll("<$1>");
    }

    /** Returns the text between the body tags, or as much of the input as those tags delimit. */
    private static String extractBody(String html) {
        int start = 0;
        int bodyOpen = html.indexOf("<body");
        if (bodyOpen != -1) {
            // indexOf yields -1 for an unterminated "<body", which makes start 0.
            start = html.indexOf(">", bodyOpen) + 1;
        }
        // Search from start so a stray "</body>" before the opening tag cannot produce end < start.
        int end = html.indexOf("</body>", start);
        if (end == -1) {
            end = html.length();
        }
        return html.substring(start, end);
    }
}
阅读全文
0 0
- 使用poi将word转为html
- word转为html(poi)
- 使用poi将word转换为html
- 使用Java将Word转为Html或txt!
- 使用Java将Word转为Html或txt!
- 使用Java将Word转为Html或txt!
- 使用Jcob将Word转为Html或txt
- 使用Java将Word转为Html或txt!
- 使用Jacob将Word转为Html或txt
- 使用Java将Word转为Html或txt等
- 使用java框架POI将word转换成html格式
- Java使用poi将word转换为html
- 使用Java的POI工具进行Word的DOC文档转为HTML页面技术简介
- javascript 将HTML转为 word,pdf
- poi将html导出到word
- poi 将word文档转HTML格式
- poi将word docx转化为html
- 使用Java将Word转为Html或txt等···
- 初学Spring Boot遇到的启动问题
- jquery中文档处理的学习
- cookie和session工作原理
- 顺序表应用8:最大子段和之动态规划法
- Linux多线程下的互斥锁
- 使用poi将word转为html
- React-Native Demo 工程 TransformError babel-preset-react-native
- Android笔记 SystemServer
- QT整理之HelloWorld测试例子
- 实现从oss(阿里云)服务器以附件形式下载文件(含批量下载)
- MySQL 外键约束的参照操作
- 高德云地图简单使用
- C头文件
- 【Java集合类】HashMap解析