Java 将Word2003（doc）/Word2007（docx）转Html格式文件

来源：互联网 | 发布：网络电视apk源码 | 编辑：程序博客网 | 时间：2024/06/13 23:24

作为一个有素质的博主，首先上jar包http://download.csdn.net/download/u010782875/10041502

代码实现：

import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.PicturesManager;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.usermodel.PictureType;import org.apache.poi.xwpf.converter.core.BasicURIResolver;import org.apache.poi.xwpf.converter.core.FileImageExtractor;import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.w3c.dom.Document;import javax.xml.parsers.DocumentBuilderFactory;import java.io.*;import java.nio.file.Path;import java.nio.file.Paths;/** * @ClassName:WordToString * @Description: * @author: * @data:2017/10/24 */public class WordToString {    public static void main(String[] args) throws Throwable {        //final String path = "D:\\Test\\xxx.doc";        final String filePath = "D:\\Test\\xxx.docx";        readWordToString(filePath);    }    public static String readWordToString(String filePath) throws Exception{        String str = "";        if (FileNameUtil.isWord2003(filePath)) {//            docToHtml(filePath, "D:\\Test\\Word2003(doc).html");            str = docToHtml(filePath, "D:\\Test\\Word2003(doc).html");//            System.out.print(doc);//            FileNameUtil.StringToFile(doc, "D:\\Test\\xxx.txt");//            FileNameUtil.txtToHtml("D:\\Test\\xxx.txt", "D:\\Test\\xxx.html");        }        if (FileNameUtil.isWord2007(filePath)) {            str = docxToHtml(filePath, "D:\\Test\\Word2007(docx).html");//            System.out.print(docx);//            FileNameUtil.StringToFile(docx, "D:\\Test\\xxx.txt");//            FileNameUtil.txtToHtml("D:\\Test\\xxx.txt", "D:\\Test\\xxx.html");        }        return str;    }    /*      * doc转换为html      * docFilename:源word文件路径      * htmlFilename:生成的html文件路径      */    public static String docToHtml(String docFilename, String targetFileName) throws Exception {        final Path imagePath = Paths.get(targetFileName).getParent().resolve("image");        HWPFDocument 
wordDocument = new HWPFDocument(new FileInputStream(docFilename));        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);        // 保存图片，并返回图片的相对路径        wordToHtmlConverter.setPicturesManager(new PicturesManager() {            @Override            public String savePicture(byte[] content, PictureType pictureType, String name, float width, float height) {                try (FileOutputStream out = new FileOutputStream(imagePath.resolve(name).toString())) {                    out.write(content);                } catch (Exception e) {                    e.printStackTrace();                }                return "../tmp/image/" + name;            }        });        wordToHtmlConverter.processDocument(wordDocument);        try {            String str = "";            FileInputStream in = new FileInputStream(targetFileName);            // size  为字串的长度 ，这里一次性读完            int size = in.available();            byte[] buffer = new byte[size];            in.read(buffer);            in.close();            str = new String(buffer, "UTF-8");            return str;        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();            return null;        }    }      /*     * docx转换为html     * sourceFilePath:源word文件路径     * targetFileName:生成的html文件路径     */    public static String docxToHtml(String sourceFilePath, String targetFileName) throws Exception {        String imagePathStr = Paths.get(targetFileName).getParent().resolve("../tmp/image/word/media").toString();        OutputStreamWriter outputStreamWriter = null;        try {            XWPFDocument document = new XWPFDocument(new FileInputStream(sourceFilePath));            XHTMLOptions options = XHTMLOptions.create();            // 存放图片的文件夹            options.setExtractor(new FileImageExtractor(new File(imagePathStr)));            // 
html中图片的路径            options.URIResolver(new BasicURIResolver("../tmp/image/word/media"));            String str = "";            FileInputStream in = new FileInputStream(targetFileName);            // size  为字串的长度 ，这里一次性读完            int size = in.available();            byte[] buffer = new byte[size];            in.read(buffer);            in.close();            str = new String(buffer, "UTF-8");            return str;        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();            return null;        }    }}

这里将Word转为html格式的字符串返回给前台，可以通过我下面提供的方法使其转换成txt和html文件

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

/**
 * File-name checks and small text-file helpers used by the Word-to-HTML demo.
 *
 * <p>All text files are read and written as UTF-8 so that {@link #StringToFile} and
 * {@link #txtToHtml} agree with each other regardless of the platform default charset.
 */
public class FileNameUtil {

    // Compiled once; each matches "<at least one character>.<extension>", case-insensitively.
    private static final Pattern WORD_2003 = extensionPattern("doc");
    private static final Pattern WORD_2007 = extensionPattern("docx");
    private static final Pattern EXCEL_2003 = extensionPattern("xls");
    private static final Pattern EXCEL_2007 = extensionPattern("xlsx");
    private static final Pattern PDF = extensionPattern("pdf");

    /** Builds the pattern for a file name ending in {@code .extension}, ignoring case. */
    private static Pattern extensionPattern(String extension) {
        return Pattern.compile("^.+\\.(" + extension + ")$", Pattern.CASE_INSENSITIVE);
    }

    /** Returns whether {@code filePath} is non-null and fully matches {@code pattern}. */
    private static boolean matches(Pattern pattern, String filePath) {
        return filePath != null && pattern.matcher(filePath).matches();
    }

    /**
     * @param filePath file name or path; may be {@code null}
     * @return {@code true} if the name ends in {@code .doc} (case-insensitive)
     */
    public static boolean isWord2003(String filePath) {
        return matches(WORD_2003, filePath);
    }

    /**
     * @param filePath file name or path; may be {@code null}
     * @return {@code true} if the name ends in {@code .docx} (case-insensitive)
     */
    public static boolean isWord2007(String filePath) {
        return matches(WORD_2007, filePath);
    }

    /**
     * @param filePath file name or path; may be {@code null}
     * @return {@code true} if the name ends in {@code .xls} (case-insensitive)
     */
    public static boolean isExcel2003(String filePath) {
        return matches(EXCEL_2003, filePath);
    }

    /**
     * @param filePath file name or path; may be {@code null}
     * @return {@code true} if the name ends in {@code .xlsx} (case-insensitive)
     */
    public static boolean isExcel2007(String filePath) {
        return matches(EXCEL_2007, filePath);
    }

    /**
     * @param filePath file name or path; may be {@code null}
     * @return {@code true} if the name ends in {@code .pdf} (case-insensitive)
     */
    public static boolean isPDF(String filePath) {
        return matches(PDF, filePath);
    }

    /**
     * Saves a string to a text file, encoded as UTF-8.
     *
     * <p>I/O failures do not propagate; they are reported on standard error.
     *
     * @param str      text to write
     * @param filename path of the file to create or overwrite
     */
    public static void StringToFile(String str, String filename) {
        // Explicit UTF-8: txtToHtml reads the file back as UTF-8.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8)) {
            writer.write(str);
        } catch (IOException e) {
            System.err.println("Failed to write file: " + filename);
            e.printStackTrace();
        }
    }

    /**
     * Copies a UTF-8 text file into an HTML file, appending a line-break tag after each line.
     *
     * <p>Failures do not propagate; a message is printed instead.
     *
     * @param filePath     path of the source text file
     * @param htmlPosition path of the HTML file to create or overwrite
     */
    public static void txtToHtml(String filePath, String htmlPosition) {
        try {
            File file = new File(filePath);
            if (!file.isFile()) { // isFile() already implies that the file exists
                System.out.println("找不到指定的文件");
                return;
            }
            // try-with-resources closes both streams even when reading or writing fails.
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
                 BufferedWriter writer = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(htmlPosition), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    writer.write(line);
                    writer.write("</br>");
                }
            }
        } catch (IOException | RuntimeException e) {
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
    }
}

需要说明的一点：我在导入poi包后转换docx文件时会报错，最后通过导入ooxml-schemas-1.1.jar得以解决。

阅读全文

0 0