将Doc或者Docx文档处理成html的代码逻辑;统计word中的字数,段数,句数,读取word中文档内容的代码逻辑

来源:互联网 发布:厦门淘宝运营培训 编辑:程序博客网 时间:2024/04/28 20:32

将Doc或者Docx文档处理成html的代码逻辑

下面是maven的配置代码:

<!-- 文档处理所需的jar的依赖 -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-examples</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.4</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.core</artifactId>
    <version>1.0.4</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.1</version>
</dependency>
<!-- 文档处理所需的jar的依赖 -->


将word处理成html的代码:

import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.parsers.ParserConfigurationException;import javax.xml.transform.OutputKeys;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerException;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import org.apache.commons.io.FileUtils;import org.apache.commons.io.IOUtils;import org.apache.commons.io.output.ByteArrayOutputStream;import org.apache.commons.lang.StringUtils;import org.apache.log4j.Logger;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.PicturesManager;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hwpf.usermodel.Picture;import org.apache.poi.hwpf.usermodel.PictureType;import org.apache.poi.xwpf.converter.core.BasicURIResolver;import org.apache.poi.xwpf.converter.core.FileImageExtractor;import org.apache.poi.xwpf.converter.core.FileURIResolver;import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.apache.poi.xwpf.usermodel.XWPFParagraph;import org.apache.poi.xwpf.usermodel.XWPFTable;import org.apache.poi.xwpf.usermodel.XWPFTableCell;import org.apache.poi.xwpf.usermodel.XWPFTableRow;import org.w3c.dom.Document;import com.sun.org.apache.xalan.internal.xsltc.compiler.Template;import cn.com.hbny.docdetection.entity.ResourcesWord;import cn.com.hbny.docdetection.server.ExtendedServerConfig;import 
cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType;/** * @brief ReadWordUtils.java 文档处理对应的工具类 * @attention * @author toto * @date 2017年3月3日 * @note begin modify by 涂作权  2017年3月3日  原始创建 */public final class ReadWordUtils {private static Logger logger = Logger.getLogger(ReadWordUtils.class);protected static final String CHARSET_UTF8 = "UTF-8";private static String  tempImagePath = "";/** * 读取docx * @throws Exception */public static ResourcesWord readDocx(String path) throws Exception {int paragNum = 0; // 段落的个数int sentenceNum = 0; // 句子个数int wordNum = 0; // 字体个数StringBuffer content = new StringBuffer();ResourcesWord resourcesWord = new ResourcesWord();InputStream is = new FileInputStream(path);XWPFDocument doc = new XWPFDocument(is);List<XWPFParagraph> paras = doc.getParagraphs();for (XWPFParagraph para : paras) {// 当前段落的属性if (!StringUtils.isEmpty(para.getText())) {paragNum++;sentenceNum += para.getText().replace("\r\n", "").trim().split("。").length;content.append(para.getText());}}// 获取文档中所有的表格List<XWPFTable> tables = doc.getTables();List<XWPFTableRow> rows;List<XWPFTableCell> cells;for (XWPFTable table : tables) {// 表格属性// 获取表格对应的行rows = table.getRows();for (XWPFTableRow row : rows) {// 获取行对应的单元格cells = row.getTableCells();for (XWPFTableCell cell : cells) {content.append(cell.getText());}}/* * MongoDBUtils mongoDb = new MongoDBUtils("javadb"); DBObject dbs = * new BasicDBObject(); dbs.put("name", "创新性"); //分类 * dbs.put("major", "医疗"); //专业 dbs.put("content", * content.toString().trim()); dbs.put("paragNum", paragNum); * dbs.put("sentenceNum", sentenceNum); dbs.put("wordNum", wordNum); * mongoDb.insert(dbs, "javadb"); */}// 得到全部内容的字数wordNum += content.toString().trim().length();resourcesWord.setContent(content.toString());resourcesWord.setParagNum(paragNum);resourcesWord.setSentenceNum(sentenceNum);resourcesWord.setWordNum(wordNum);close(is);return resourcesWord;}/** * 读取doc文件的内容 *  * @throws IOException */public static ResourcesWord readDoc(String path) throws 
IOException {int paragNum = 0; // 段落的个数int sentenceNum = 0; // 句子个数int wordNum = 0; // 字体个数ResourcesWord resourcesWord = new ResourcesWord();StringBuffer content = new StringBuffer();try {File f = new File(path);FileInputStream is = new FileInputStream(f);WordExtractor ex = new WordExtractor(is);// is是WORD文件的InputStreamString[] paragraph = ex.getParagraphText();for (int i = 0; i < paragraph.length; i++) {paragNum++;System.out.println("Paragraph " + (i + 1) + " : " + paragraph[i]);sentenceNum += paragraph[i].replace("\r\n", "").trim().split("。").length;wordNum += paragraph[i].trim().length();content.append(paragraph[i].trim());}System.out.println("段落:" + paragNum);System.out.println("句子:" + sentenceNum);System.out.println("字体:" + wordNum);resourcesWord.setContent(content.toString());resourcesWord.setParagNum(paragNum);resourcesWord.setSentenceNum(sentenceNum);resourcesWord.setWordNum(wordNum);/* * MongoDBUtils mongoDb = new MongoDBUtils("javadb"); DBObject dbs = * new BasicDBObject(); dbs.put("name", "创新性"); //分类 * dbs.put("major", "医疗"); //专业 dbs.put("content", * content.toString()); dbs.put("paragNum", paragNum); * dbs.put("sentenceNum", sentenceNum); dbs.put("wordNum", wordNum); * mongoDb.insert(dbs, "javadb"); */is.close();} catch (Exception e) {e.printStackTrace();}return resourcesWord;}/** * \brief doc转换成html,并返回输出的相对路径 * @param filePath                  :要转换的doc文档 * @param outPutFilePath                :文档输出的位置 * @attention * @author toto * @throws IOException  * @throws FileNotFoundException  * @throws ParserConfigurationException  * @date 2017年2月27日  * @note  begin modify by 涂作权  2017年2月27日   原始创建 */public static String doc2Html(String filePath,final String outPutFilePath)throws TransformerException, IOException, ParserConfigurationException {        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(filePath));        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(        DocumentBuilderFactory        .newInstance()        
.newDocumentBuilder()        .newDocument());                wordToHtmlConverter.setPicturesManager(new PicturesManager() {              public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {            //File file = new File(outPutFilePath);                //String name = file.getName();                tempImagePath = outPutFilePath.substring(0,outPutFilePath.indexOf(".html")) + File.separator;                                File imageFolder = new File(tempImagePath);                if (!imageFolder.exists()) {try {FileUtils.forceMkdir(imageFolder);} catch (IOException e) {e.printStackTrace();}}                String newTempImagePath = imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, "");            return newTempImagePath + File.separator + suggestedName;            }        });        wordToHtmlConverter.processDocument(wordDocument);        // 保存图片        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();        if (pics != null) {            for (int i = 0; i < pics.size(); i++) {                Picture pic = (Picture) pics.get(i);                try {                File picOutFolder = new File(tempImagePath + File.separator);                if (!picOutFolder.exists()) {picOutFolder.mkdirs();}                    pic.writeImageContent(new FileOutputStream(tempImagePath + File.separator + pic.suggestFullFileName()));                } catch (FileNotFoundException e) {                      e.printStackTrace();                }            }          }          Document htmlDocument = wordToHtmlConverter.getDocument();          ByteArrayOutputStream out = new ByteArrayOutputStream();          DOMSource domSource = new DOMSource(htmlDocument);          StreamResult streamResult = new StreamResult(out);            TransformerFactory tf = TransformerFactory.newInstance();          Transformer serializer = tf.newTransformer();          
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");          serializer.setOutputProperty(OutputKeys.INDENT, "yes");          serializer.setOutputProperty(OutputKeys.METHOD, "html");          serializer.transform(domSource, streamResult);          out.close();          writeFile(new String(out.toByteArray()), outPutFilePath);        return gainRelativePathByOutputPath(outPutFilePath);    }/**       * 将docx格式的word转换为html格式的文档     *        * @param filePath 原始的docx文件路径存储位置     * @param outPutFile html输出文件路径   * @return      * @throws TransformerException     * @throws IOException     * @throws ParserConfigurationException     */    public static String docx2Html(String filePath,final String outPutFilePath) throws TransformerException, IOException, ParserConfigurationException {    //String fileOutName = outPutFile;        XWPFDocument wordDocument = new XWPFDocument(new FileInputStream(filePath));        XHTMLOptions options = XHTMLOptions.create().indent(4);                // 导出图片        Map<String, String> imageInfoMap = gainTempImagePath(outPutFilePath);        File imageFolder = new File(imageInfoMap.get("imageStoredPath"));        options.setExtractor(new FileImageExtractor(imageFolder));        // URI resolver        //这种方式获得word中的图片地址是绝对地址        //options.URIResolver(new FileURIResolver(imageFolder));        //设置生成的html中的img src中的地址是相对路径        options.URIResolver(new BasicURIResolver(imageInfoMap.get("imageFolder")));                File outFile = new File(outPutFilePath);        outFile.getParentFile().mkdirs();        OutputStream out = new FileOutputStream(outFile);        XHTMLConverter.getInstance().convert(wordDocument, out, options);                return gainRelativePathByOutputPath(outPutFilePath);        //System.out.println("Generate " + fileOutName + " with " + (System.currentTimeMillis() - startTime) + " ms.");    }        /** * \brief 将内容写到path路径下面 * @param content            :文档内容 * @param path               :最终的文件存储路径 * @attention 
方法的使用注意事项  * @author toto * @date 2017年2月27日  * @note  begin modify by 涂作权 2017年2月27日   修改输出的文件名称 */public static void writeFile(String docContent, String path) {          FileOutputStream outDocFos = null;        try {        //判断文件是否为空的            if (StringUtils.isNotBlank(path)) {            File file = new File(path);            if (!file.exists()) {FileUtils.forceMkdir(file.getParentFile());}            outDocFos = new FileOutputStream(path);IOUtils.write(docContent, outDocFos,CHARSET_UTF8);}        } catch (FileNotFoundException fnfe) {              fnfe.printStackTrace();          } catch (IOException ioe) {              ioe.printStackTrace();          } finally {              try {                  if (outDocFos != null)                  outDocFos.close();              } catch (IOException ie) {              }        }      }/** * 关闭输入流 *  * @param is */private static void close(InputStream is) {if (is != null) {try {is.close();} catch (IOException e) {e.printStackTrace();}}}/** * \brief 通过文档输出路径获得图片存储路径 * @param outPutFile             :文档输出路径 * @return * @attention 方法的使用注意事项  * @author toto * @date 2017年2月28日  * @note  begin modify by 修改人 修改时间   修改内容摘要说明 */private static Map<String, String> gainTempImagePath(String outPutFilePath) {Map<String,String> imageInfoMap = new HashMap<String,String>();try {//File file = new File(outPutFilePath);        tempImagePath = outPutFilePath.substring(0,outPutFilePath.indexOf(".html")) + File.separator;                File imageFolder = new File(tempImagePath);        if (!imageFolder.exists()) {try {FileUtils.forceMkdir(imageFolder);} catch (IOException e) {e.printStackTrace();}}                //System.out.println(imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, ""));        //return imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, "");        imageInfoMap.put("imageStoredPath", imageFolder.getPath());        imageInfoMap.put("imageFolder", 
imageFolder.getPath().replace(imageFolder.getParentFile().getPath(), "").replace(File.separator, ""));                return imageInfoMap;} catch (Exception e) {e.printStackTrace();}        return null;}private static String gainRelativePathByOutputPath(String outPutFilePath) {//用于预览的存储路径String docsPreviewPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_PREVIEW_PREFIX");return  outPutFilePath.split(docsPreviewPath)[1];}/** * \brief    * @param orgStr            :表示要替换的就得字符串 * @param regEx             :表示的是正则表达式 * @param targetStr         :表示要替换的字符串 * @return * @attention 方法的使用注意事项  * @author toto * @date 2017年3月4日  * @note  begin modify by 涂作权  原始创建  2017年3月4日 */public static String replaceStr(String orgStr,String regEx,String targetStr){    if (null !=orgStr && !"".equals(orgStr.trim())) {        //String regEx="[\\s~·`!!@#¥$%^……&*(())\\-——\\-_=+【\\[\\]】{{}}\\|、\\\\;;::‘'“”\",,《<。.》>、/??]";        Pattern p = Pattern.compile(regEx);        Matcher m = p.matcher(orgStr);        return m.replaceAll(targetStr);    }    return null;}public static void main(String[] args) throws Exception {//String uploadFile = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH");//String docsTempPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_TEMP_PATH");//String docsOutputPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_OUTPUT_PATH");//System.out.println("uploadFile = " + uploadFile + "  " + docsTempPath + "  " + docsOutputPath);//// Testtest.readWord("E://111.doc");// Testtest.readDoc();// System.out.println(content);//ResourcesWord readDocx = ReadWordUtils.readDoc(uploadFile + "/大学生创新创业项目申报书.doc");//logger.info(readDocx.getContent());//logger.info(readDocx.getParagNum());////new ReadWordUtils().doc2Html(uploadFile + "/大学生创新创业项目申报书.doc" , docsOutputPath + "/大学生创新创业项目申报书.html");//new ReadWordUtils().docx2Html(uploadFile + "/大学生创新创业项目申报书副本.docx" , docsOutputPath + "/大学生创新创业项目申报书副本.html");    String newStr = 
replaceStr("afdas//\\as   dfasd     a//asd\\\\\\asd\\/", "[\\\\]","/");    newStr = replaceStr(newStr, "(/){1,}", "/");    newStr = replaceStr(newStr, "[ ]", "");        System.out.println(newStr);}}


下面是调用案例:

import java.io.File;import org.apache.log4j.Logger;import org.springframework.stereotype.Service;import cn.com.hbny.docdetection.mongodb.beans.DocInfo;import cn.com.hbny.docdetection.server.ExtendedServerConfig;import cn.com.hbny.docdetection.service.base.impl.BaseServiceImpl;import cn.com.hbny.docdetection.service.docInfoHandler.DocInfoHandlerService;import cn.com.hbny.docdetection.utils.Pinyin4jUtils;import cn.com.hbny.docdetection.utils.ReadWordUtils;import cn.com.hbny.docdetection.utils.UUIDGenerator;import cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType;/** * @brief DocInfoHandlerServiceImpl.java 文档检测对应的文档 * @attention * @author toto * @date 2017年3月2日 * @note begin modify by 涂作权   2017年3月2日  原始创建 */@Service(value = "docInfoHandlerService")public class DocInfoHandlerServiceImpl extends BaseServiceImpl implements DocInfoHandlerService {private static Logger logger = Logger.getLogger(DocInfoHandlerServiceImpl.class);/** * 文档处理对应的service * @param docLibrayId       :文档库对应的id * @param originalDocPath   :原始文档所在的位置 * @param uploadPath        :文档上传路径 * @param outPutFolderPath  :文档最终的输出文件夹 * @param docsPreviewPrefix :文档预览的前缀 */public DocInfo handlerSingleDocInfo(String docLibrayId,String originalDocPath,String uploadPath,String outPutFolderPath,String docsPreviewPrefix) {try {DocInfo docInfo = new DocInfo();docInfo.setId(UUIDGenerator.generate());docInfo.setDocLibrayId(docLibrayId);//处理传递过来的文件路径File file = new File(originalDocPath);//判断文件是否哦存在,如果不存在直接返回,如果存在继续下面的操作if (file.exists()) {//获取到文档的名称String fileName = file.getName();docInfo.setOriginalFileName(fileName.substring(0,fileName.toLowerCase().indexOf(".doc")));//截取上传文件的后面那一串路径String fileRelativePath = originalDocPath.substring(uploadPath.length());    docInfo.setOriginalDocPath(fileRelativePath);//判断文件后缀if (fileName.endsWith(".doc")) {//1、处理word文档,并将word文档存储在相应的位置上,将word存储成htmlString outPutFilePath = Pinyin4jUtils.toPinYin(outPutFolderPath + fileRelativePath.replace(".doc", 
".html"),PinyinType.LOWERCASE);outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", "");//下面是经过处理后的文件存储位置                    String filePathAfterHandled = ReadWordUtils.doc2Html(originalDocPath,outPutFilePath);                    docInfo.setHtmlDocPath(filePathAfterHandled);}  else {//1、处理word文档,并将word文档存储在相应的位置上,将word存储成html//1、处理word文档,并将word文档存储在相应的位置上,将word存储成htmlString outPutFilePath = Pinyin4jUtils.toPinYin(outPutFolderPath + fileRelativePath.replace(".docx", ".html"),PinyinType.LOWERCASE);outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", "");//下面是经过处理后的文件存储位置                    String filePathAfterHandled = ReadWordUtils.docx2Html(originalDocPath, outPutFilePath);                    docInfo.setHtmlDocPath(filePathAfterHandled);}return null;} else {return null;}} catch (Exception e) {e.printStackTrace();}return null;}public static void main(String[] args) {String uploadPath = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH");String outPutFolderPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_OUTPUT_PATH");String docsPreviewPrefix = ExtendedServerConfig.getInstance().getStringProperty("DOCS_PREVIEW_PREFIX");//new DocInfoHandlerServiceImpl().handlerSingleDocInfo(//UUIDGenerator.generate(), //uploadPath + "/双创项目申报书20170301/国家大学生创新训练计划项目申请书华师大.doc",//uploadPath,//outPutFolderPath);//new DocInfoHandlerServiceImpl().handlerSingleDocInfo(//UUIDGenerator.generate(), //uploadPath + "/双创项目申报书20170301/国家级大学生创新创业训练计划  立项申请书   上海电力学院.doc",//uploadPath,//outPutFolderPath,//docsPreviewPrefix);new DocInfoHandlerServiceImpl().handlerSingleDocInfo(UUIDGenerator.generate(), uploadPath + 
"/双创项目申报书20170301/专题产品需求规格说明书.docx",uploadPath,outPutFolderPath,docsPreviewPrefix);}}

下面是所有用到的参数配置:

#上传的文件的存储位置的配置,统一的最后面不要加斜杠
UPLOAD_PATH=D:/installed/apache-tomcat-7.0.47/webapps/upload
#
#处理后的文档输出位置,统一的最后面不要加斜杠
DOCS_OUTPUT_PATH=D:/installed/apache-tomcat-7.0.47/webapps/docs-output-path
#
#文档预览路径,注意最后面不要加斜杠
DOCS_PREVIEW_PREFIX=/docs-output-path
#
#处理文档时,生成的一些图片的临时存储路径,最后面不要加斜杠
DOCS_TEMP_PATH=D:/installed/apache-tomcat-7.0.47/webapps/temp


0 0
原创粉丝点击