使用poi将word转为html

来源:互联网 发布:淘宝好评20字以上 编辑:程序博客网 时间:2024/05/17 09:13

使用poi将word转为html

需求:将上传的word文档转为html并返回页面填充到富文本编辑器中
尝试过的方案:
1.使用openoffice转换:转换后图片会错位;
2.使用poi将word转为html:word中的图片格式多样,wmf、emf等格式的图片无法在页面上显示;
3.限定上传文件为docx,将后缀改为zip后解压,可得到全部图片且格式均为png,但html仍需用其他方法生成,之后再修改img标签中的图片路径;
本文使用poi将word转为html,图片格式问题之后发文解决。

使用Maven导入jar包

<dependency>  <groupId>org.apache.poi</groupId>  <artifactId>poi</artifactId>  <version>3.14</version></dependency><dependency>  <groupId>org.apache.poi</groupId>  <artifactId>poi-scratchpad</artifactId>  <version>3.14</version></dependency><dependency>  <groupId>org.apache.poi</groupId>  <artifactId>poi-ooxml</artifactId>  <version>3.14</version></dependency><dependency>  <groupId>fr.opensagres.xdocreport</groupId>  <artifactId>xdocreport</artifactId>  <version>1.0.6</version></dependency><dependency>  <groupId>org.apache.poi</groupId>  <artifactId>poi-ooxml-schemas</artifactId>  <version>3.14</version></dependency><dependency>  <groupId>org.apache.poi</groupId>  <artifactId>ooxml-schemas</artifactId>  <version>1.3</version></dependency>

PoiUtil.java

import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.usermodel.PictureType;import org.apache.poi.xwpf.converter.core.BasicURIResolver;import org.apache.poi.xwpf.converter.core.FileImageExtractor;import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.w3c.dom.Document;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.transform.OutputKeys;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import java.io.*;import java.util.*;/** * Created by will on 2017/6/9. * 使用poi将word转为html文件,并从文件中读取内容 */public class PoiUtil {    // 在html中图片保存的相对路径    private static String imagePath;    /**    * @param source word文件的File对象    * @param sourceFileName word文件名    * @param savePath 图片保存路径    * @return 转成的html字符串    */    public static String getHtml(File source, String sourceFileName, String savePath) throws Exception {        imagePath = "/upload/" + sourceFileName.substring(0, sourceFileName.lastIndexOf("."));        String imagePathStr = savePath + File.separator + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + File.separator;        String content;        String imgEnd = "";        // 判断word文档类型,使用不同方法进行转换        if (sourceFileName.endsWith(".doc")) {            content = docToStr(source, sourceFileName, imagePathStr);        } else if (sourceFileName.endsWith(".docx")) {            content = docxToStr(source, sourceFileName, imagePathStr);            // 转换docx文件得到的图片路径            imgEnd = "word/media/";        } else {            return "文件类型错误";        }        // 利用正则表达式过滤无用标签和属性        content = RegexAnswerUtil.clear(content);        return content;    }    // doc转换为html    public static String docToStr(File source, String sourceFileName, 
String imagePathStr) throws Exception {        String targetFileName = imagePathStr + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html";        File target = new File(targetFileName);        target.getParentFile().mkdirs();        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(source));        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);        // 保存图片,并返回图片的相对路径        wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {            try (FileOutputStream out = new FileOutputStream(new File(imagePathStr + name))) {                out.write(content);            } catch (Exception e) {                e.printStackTrace();            }            return imagePath +"/" + name;        });        wordToHtmlConverter.processDocument(wordDocument);        Document htmlDocument = wordToHtmlConverter.getDocument();        DOMSource domSource = new DOMSource(htmlDocument);        StreamResult streamResult = new StreamResult(new File(targetFileName));        TransformerFactory tf = TransformerFactory.newInstance();        Transformer serializer = tf.newTransformer();        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");        serializer.setOutputProperty(OutputKeys.INDENT, "yes");        serializer.setOutputProperty(OutputKeys.METHOD, "html");        serializer.transform(domSource, streamResult);        String content = splitContext(targetFileName);        // 删除生成的html文件        File file = new File(targetFileName);        file.delete();        return content;    }    // docx转换为html    public static String docxToStr(File source, String sourceFileName, String imagePathStr) throws Exception {        String targetFileName = imagePathStr + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html";        File target = new File(targetFileName);        
target.getParentFile().mkdirs();        OutputStreamWriter outputStreamWriter = null;        try {            XWPFDocument document = new XWPFDocument(new FileInputStream(source));            XHTMLOptions options = XHTMLOptions.create();            // 存放图片的文件夹            options.setExtractor(new FileImageExtractor(new File(imagePathStr)));            // html中图片的路径            options.URIResolver(new BasicURIResolver(imagePath));            outputStreamWriter = new OutputStreamWriter(new FileOutputStream(target), "utf-8");            XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();            xhtmlConverter.convert(document, outputStreamWriter, options);        } finally {            if (outputStreamWriter != null) {                outputStreamWriter.close();            }        }        String content = splitContext(targetFileName);        // 删除生成的html文件        File file = new File(targetFileName);        file.delete();        return content;    }    /**    * docx文件转html会生成html编码    * 该方法能转换大部分    * 富文本编辑器中可以不做处理    */    public static String htmlEncoding(String html) {        String regExp = "&#\\d*;";        Matcher m = Pattern.compile(regExp).matcher(html);        StringBuffer sb = new StringBuffer();        if (!m.find()) {            sb.append(html);        } else {            while (m.find()) {                String s = m.group(0);                s = s.replaceAll("(&#)|;", "");                char c = (char) Integer.parseInt(s);                m.appendReplacement(sb, Character.toString(c));            }        }        return sb.toString();    }    /**    * 读取转换得到的html文件,并过滤多余空行    */    public static String splitContext(String filePath) {        File file = new File(filePath);        BufferedReader reader = null;        try {            InputStreamReader isr = new InputStreamReader(new FileInputStream(file), "UTF-8");            reader = new BufferedReader(isr);            StringBuilder sb = new StringBuilder();            String 
tempString = null;            // 一次读入一行,直到读入null为文件结束            while ((tempString = reader.readLine()) != null) {                sb.append(tempString);                if(!tempString.equals("")){                    sb.append("\n");                }            }            reader.close();            String content = sb.toString().replaceAll("\\n+", "\n");            return content;        } catch (IOException e) {            e.printStackTrace();        } finally {            if (reader != null) {                try {                    reader.close();                } catch (IOException e1) {                }            }        }        return "";    }}

RegexAnswerUtil.java

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by will on 2017/6/9.
 * Removes unneeded tags and attributes from an html string.
 */
public class RegexAnswerUtil {

    // An opening tag that carries at least one attribute: <name attr...>
    private static final Pattern TAG_WITH_ATTRIBUTES = Pattern.compile("<(\\w+)\\s[^>]+>");

    // A double-quoted src attribute inside a single tag.
    private static final Pattern IMG_SRC = Pattern.compile("(?<=\\s)src=\"[^\"]+\"");

    /**
     * Keeps only the content of the body element (the whole string when there is no body
     * tag) and strips attributes from the tags in it.
     * <ul>
     *   <li>{@code <img ... src="x" ...>} becomes {@code <img src="x"/>}; an img without a
     *       double-quoted src is left unchanged.</li>
     *   <li>Every other tag with attributes becomes {@code <name>}.</li>
     * </ul>
     * Each tag is rewritten on its own in a single pass, so an img without src can no
     * longer swallow the text up to the next src attribute.
     *
     * @param returnString html string, may be {@code null}
     * @return the filtered html string; an empty string for {@code null} or empty input
     */
    public static String clear(String returnString) {
        if (returnString == null || returnString.isEmpty()) {
            return "";
        }
        String body = extractBody(returnString);

        Matcher tag = TAG_WITH_ATTRIBUTES.matcher(body);
        StringBuffer cleaned = new StringBuffer(body.length());
        while (tag.find()) {
            String replacement;
            if ("img".equalsIgnoreCase(tag.group(1))) {
                Matcher src = IMG_SRC.matcher(tag.group());
                replacement = src.find() ? "<img " + src.group() + "/>" : tag.group();
            } else {
                replacement = "<" + tag.group(1) + ">";
            }
            // The src value may contain '$' or '\', which are special in replacements.
            tag.appendReplacement(cleaned, Matcher.quoteReplacement(replacement));
        }
        tag.appendTail(cleaned);
        return cleaned.toString();
    }

    // Returns the text between the end of the <body ...> tag and </body>, falling back to
    // the start / end of the string when either marker is missing.
    private static String extractBody(String html) {
        int bodyOpen = html.indexOf("<body");
        int start = bodyOpen == -1 ? 0 : html.indexOf('>', bodyOpen) + 1;
        // Search from start so the end index can never precede it.
        int bodyClose = html.indexOf("</body>", start);
        int end = bodyClose == -1 ? html.length() : bodyClose;
        return html.substring(start, end);
    }
}
原创粉丝点击