将Doc或者Docx文档处理成html的代码逻辑;统计word中的字数,段数,句数,读取word中文档内容的代码逻辑
来源:互联网 发布:厦门淘宝运营培训 编辑:程序博客网 时间:2024/04/28 20:32
将Doc或者Docx文档处理成html的代码逻辑
下面是maven的配置代码:
<!-- 文档处理所需的jar的依赖 --><dependency><groupId>commons-io</groupId><artifactId>commons-io</artifactId><version>2.4</version></dependency><dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-examples</artifactId> <version>3.9</version></dependency><dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.9</version></dependency><dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> <version>1.0.4</version></dependency><dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.core</artifactId> <version>1.0.4</version></dependency><dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.9</version></dependency><dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.9</version></dependency><dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml-schemas</artifactId> <version>3.9</version></dependency><dependency> <groupId>org.apache.xmlbeans</groupId> <artifactId>xmlbeans</artifactId> <version>2.3.0</version></dependency><dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.1</version></dependency><!-- 文档处理所需的jar的依赖 -->
将word处理成html的代码:
import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.parsers.ParserConfigurationException;import javax.xml.transform.OutputKeys;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerException;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import org.apache.commons.io.FileUtils;import org.apache.commons.io.IOUtils;import org.apache.commons.io.output.ByteArrayOutputStream;import org.apache.commons.lang.StringUtils;import org.apache.log4j.Logger;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.converter.PicturesManager;import org.apache.poi.hwpf.converter.WordToHtmlConverter;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hwpf.usermodel.Picture;import org.apache.poi.hwpf.usermodel.PictureType;import org.apache.poi.xwpf.converter.core.BasicURIResolver;import org.apache.poi.xwpf.converter.core.FileImageExtractor;import org.apache.poi.xwpf.converter.core.FileURIResolver;import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.apache.poi.xwpf.usermodel.XWPFParagraph;import org.apache.poi.xwpf.usermodel.XWPFTable;import org.apache.poi.xwpf.usermodel.XWPFTableCell;import org.apache.poi.xwpf.usermodel.XWPFTableRow;import org.w3c.dom.Document;import com.sun.org.apache.xalan.internal.xsltc.compiler.Template;import cn.com.hbny.docdetection.entity.ResourcesWord;import cn.com.hbny.docdetection.server.ExtendedServerConfig;import 
cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType;/** * @brief ReadWordUtils.java 文档处理对应的工具类 * @attention * @author toto * @date 2017年3月3日 * @note begin modify by 涂作权 2017年3月3日 原始创建 */public final class ReadWordUtils {private static Logger logger = Logger.getLogger(ReadWordUtils.class);protected static final String CHARSET_UTF8 = "UTF-8";private static String tempImagePath = "";/** * 读取docx * @throws Exception */public static ResourcesWord readDocx(String path) throws Exception {int paragNum = 0; // 段落的个数int sentenceNum = 0; // 句子个数int wordNum = 0; // 字体个数StringBuffer content = new StringBuffer();ResourcesWord resourcesWord = new ResourcesWord();InputStream is = new FileInputStream(path);XWPFDocument doc = new XWPFDocument(is);List<XWPFParagraph> paras = doc.getParagraphs();for (XWPFParagraph para : paras) {// 当前段落的属性if (!StringUtils.isEmpty(para.getText())) {paragNum++;sentenceNum += para.getText().replace("\r\n", "").trim().split("。").length;content.append(para.getText());}}// 获取文档中所有的表格List<XWPFTable> tables = doc.getTables();List<XWPFTableRow> rows;List<XWPFTableCell> cells;for (XWPFTable table : tables) {// 表格属性// 获取表格对应的行rows = table.getRows();for (XWPFTableRow row : rows) {// 获取行对应的单元格cells = row.getTableCells();for (XWPFTableCell cell : cells) {content.append(cell.getText());}}/* * MongoDBUtils mongoDb = new MongoDBUtils("javadb"); DBObject dbs = * new BasicDBObject(); dbs.put("name", "创新性"); //分类 * dbs.put("major", "医疗"); //专业 dbs.put("content", * content.toString().trim()); dbs.put("paragNum", paragNum); * dbs.put("sentenceNum", sentenceNum); dbs.put("wordNum", wordNum); * mongoDb.insert(dbs, "javadb"); */}// 得到全部内容的字数wordNum += content.toString().trim().length();resourcesWord.setContent(content.toString());resourcesWord.setParagNum(paragNum);resourcesWord.setSentenceNum(sentenceNum);resourcesWord.setWordNum(wordNum);close(is);return resourcesWord;}/** * 读取doc文件的内容 * * @throws IOException */public static ResourcesWord readDoc(String path) throws 
IOException {int paragNum = 0; // 段落的个数int sentenceNum = 0; // 句子个数int wordNum = 0; // 字体个数ResourcesWord resourcesWord = new ResourcesWord();StringBuffer content = new StringBuffer();try {File f = new File(path);FileInputStream is = new FileInputStream(f);WordExtractor ex = new WordExtractor(is);// is是WORD文件的InputStreamString[] paragraph = ex.getParagraphText();for (int i = 0; i < paragraph.length; i++) {paragNum++;System.out.println("Paragraph " + (i + 1) + " : " + paragraph[i]);sentenceNum += paragraph[i].replace("\r\n", "").trim().split("。").length;wordNum += paragraph[i].trim().length();content.append(paragraph[i].trim());}System.out.println("段落:" + paragNum);System.out.println("句子:" + sentenceNum);System.out.println("字体:" + wordNum);resourcesWord.setContent(content.toString());resourcesWord.setParagNum(paragNum);resourcesWord.setSentenceNum(sentenceNum);resourcesWord.setWordNum(wordNum);/* * MongoDBUtils mongoDb = new MongoDBUtils("javadb"); DBObject dbs = * new BasicDBObject(); dbs.put("name", "创新性"); //分类 * dbs.put("major", "医疗"); //专业 dbs.put("content", * content.toString()); dbs.put("paragNum", paragNum); * dbs.put("sentenceNum", sentenceNum); dbs.put("wordNum", wordNum); * mongoDb.insert(dbs, "javadb"); */is.close();} catch (Exception e) {e.printStackTrace();}return resourcesWord;}/** * \brief doc转换成html,并返回输出的相对路径 * @param filePath :要转换的doc文档 * @param outPutFilePath :文档输出的位置 * @attention * @author toto * @throws IOException * @throws FileNotFoundException * @throws ParserConfigurationException * @date 2017年2月27日 * @note begin modify by 涂作权 2017年2月27日 原始创建 */public static String doc2Html(String filePath,final String outPutFilePath)throws TransformerException, IOException, ParserConfigurationException { HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(filePath)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory .newInstance() .newDocumentBuilder() .newDocument()); 
wordToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { //File file = new File(outPutFilePath); //String name = file.getName(); tempImagePath = outPutFilePath.substring(0,outPutFilePath.indexOf(".html")) + File.separator; File imageFolder = new File(tempImagePath); if (!imageFolder.exists()) {try {FileUtils.forceMkdir(imageFolder);} catch (IOException e) {e.printStackTrace();}} String newTempImagePath = imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, ""); return newTempImagePath + File.separator + suggestedName; } }); wordToHtmlConverter.processDocument(wordDocument); // 保存图片 List<Picture> pics = wordDocument.getPicturesTable().getAllPictures(); if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); try { File picOutFolder = new File(tempImagePath + File.separator); if (!picOutFolder.exists()) {picOutFolder.mkdirs();} pic.writeImageContent(new FileOutputStream(tempImagePath + File.separator + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); writeFile(new String(out.toByteArray()), outPutFilePath); return gainRelativePathByOutputPath(outPutFilePath); }/** * 将docx格式的word转换为html格式的文档 * * @param filePath 原始的docx文件路径存储位置 * @param outPutFile html输出文件路径 * @return * @throws 
TransformerException * @throws IOException * @throws ParserConfigurationException */ public static String docx2Html(String filePath,final String outPutFilePath) throws TransformerException, IOException, ParserConfigurationException { //String fileOutName = outPutFile; XWPFDocument wordDocument = new XWPFDocument(new FileInputStream(filePath)); XHTMLOptions options = XHTMLOptions.create().indent(4); // 导出图片 Map<String, String> imageInfoMap = gainTempImagePath(outPutFilePath); File imageFolder = new File(imageInfoMap.get("imageStoredPath")); options.setExtractor(new FileImageExtractor(imageFolder)); // URI resolver //这种方式获得word中的图片地址是绝对地址 //options.URIResolver(new FileURIResolver(imageFolder)); //设置生成的html中的img src中的地址是相对路径 options.URIResolver(new BasicURIResolver(imageInfoMap.get("imageFolder"))); File outFile = new File(outPutFilePath); outFile.getParentFile().mkdirs(); OutputStream out = new FileOutputStream(outFile); XHTMLConverter.getInstance().convert(wordDocument, out, options); return gainRelativePathByOutputPath(outPutFilePath); //System.out.println("Generate " + fileOutName + " with " + (System.currentTimeMillis() - startTime) + " ms."); } /** * \brief 将内容写到path路径下面 * @param content :文档内容 * @param path :最终的文件存储路径 * @attention 方法的使用注意事项 * @author toto * @date 2017年2月27日 * @note begin modify by 涂作权 2017年2月27日 修改输出的文件名称 */public static void writeFile(String docContent, String path) { FileOutputStream outDocFos = null; try { //判断文件是否为空的 if (StringUtils.isNotBlank(path)) { File file = new File(path); if (!file.exists()) {FileUtils.forceMkdir(file.getParentFile());} outDocFos = new FileOutputStream(path);IOUtils.write(docContent, outDocFos,CHARSET_UTF8);} } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (outDocFos != null) outDocFos.close(); } catch (IOException ie) { } } }/** * 关闭输入流 * * @param is */private static void close(InputStream is) {if (is != null) {try {is.close();} 
catch (IOException e) {e.printStackTrace();}}}/** * \brief 通过文档输出路径获得图片存储路径 * @param outPutFile :文档输出路径 * @return * @attention 方法的使用注意事项 * @author toto * @date 2017年2月28日 * @note begin modify by 修改人 修改时间 修改内容摘要说明 */private static Map<String, String> gainTempImagePath(String outPutFilePath) {Map<String,String> imageInfoMap = new HashMap<String,String>();try {//File file = new File(outPutFilePath); tempImagePath = outPutFilePath.substring(0,outPutFilePath.indexOf(".html")) + File.separator; File imageFolder = new File(tempImagePath); if (!imageFolder.exists()) {try {FileUtils.forceMkdir(imageFolder);} catch (IOException e) {e.printStackTrace();}} //System.out.println(imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, "")); //return imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, ""); imageInfoMap.put("imageStoredPath", imageFolder.getPath()); imageInfoMap.put("imageFolder", imageFolder.getPath().replace(imageFolder.getParentFile().getPath(), "").replace(File.separator, "")); return imageInfoMap;} catch (Exception e) {e.printStackTrace();} return null;}private static String gainRelativePathByOutputPath(String outPutFilePath) {//用于预览的存储路径String docsPreviewPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_PREVIEW_PREFIX");return outPutFilePath.split(docsPreviewPath)[1];}/** * \brief * @param orgStr :表示要替换的就得字符串 * @param regEx :表示的是正则表达式 * @param targetStr :表示要替换的字符串 * @return * @attention 方法的使用注意事项 * @author toto * @date 2017年3月4日 * @note begin modify by 涂作权 原始创建 2017年3月4日 */public static String replaceStr(String orgStr,String regEx,String targetStr){ if (null !=orgStr && !"".equals(orgStr.trim())) { //String regEx="[\\s~·`!!@#¥$%^……&*(())\\-——\\-_=+【\\[\\]】{{}}\\|、\\\\;;::‘'“”\",,《<。.》>、/??]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(orgStr); return m.replaceAll(targetStr); } return null;}public static void main(String[] args) throws Exception {//String uploadFile = 
ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH");//String docsTempPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_TEMP_PATH");//String docsOutputPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_OUTPUT_PATH");//System.out.println("uploadFile = " + uploadFile + " " + docsTempPath + " " + docsOutputPath);//// Testtest.readWord("E://111.doc");// Testtest.readDoc();// System.out.println(content);//ResourcesWord readDocx = ReadWordUtils.readDoc(uploadFile + "/大学生创新创业项目申报书.doc");//logger.info(readDocx.getContent());//logger.info(readDocx.getParagNum());////new ReadWordUtils().doc2Html(uploadFile + "/大学生创新创业项目申报书.doc" , docsOutputPath + "/大学生创新创业项目申报书.html");//new ReadWordUtils().docx2Html(uploadFile + "/大学生创新创业项目申报书副本.docx" , docsOutputPath + "/大学生创新创业项目申报书副本.html"); String newStr = replaceStr("afdas//\\as dfasd a//asd\\\\\\asd\\/", "[\\\\]","/"); newStr = replaceStr(newStr, "(/){1,}", "/"); newStr = replaceStr(newStr, "[ ]", ""); System.out.println(newStr);}}
下面是调用案例:
import java.io.File;import org.apache.log4j.Logger;import org.springframework.stereotype.Service;import cn.com.hbny.docdetection.mongodb.beans.DocInfo;import cn.com.hbny.docdetection.server.ExtendedServerConfig;import cn.com.hbny.docdetection.service.base.impl.BaseServiceImpl;import cn.com.hbny.docdetection.service.docInfoHandler.DocInfoHandlerService;import cn.com.hbny.docdetection.utils.Pinyin4jUtils;import cn.com.hbny.docdetection.utils.ReadWordUtils;import cn.com.hbny.docdetection.utils.UUIDGenerator;import cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType;/** * @brief DocInfoHandlerServiceImpl.java 文档检测对应的文档 * @attention * @author toto * @date 2017年3月2日 * @note begin modify by 涂作权 2017年3月2日 原始创建 */@Service(value = "docInfoHandlerService")public class DocInfoHandlerServiceImpl extends BaseServiceImpl implements DocInfoHandlerService {private static Logger logger = Logger.getLogger(DocInfoHandlerServiceImpl.class);/** * 文档处理对应的service * @param docLibrayId :文档库对应的id * @param originalDocPath :原始文档所在的位置 * @param uploadPath :文档上传路径 * @param outPutFolderPath :文档最终的输出文件夹 * @param docsPreviewPrefix :文档预览的前缀 */public DocInfo handlerSingleDocInfo(String docLibrayId,String originalDocPath,String uploadPath,String outPutFolderPath,String docsPreviewPrefix) {try {DocInfo docInfo = new DocInfo();docInfo.setId(UUIDGenerator.generate());docInfo.setDocLibrayId(docLibrayId);//处理传递过来的文件路径File file = new File(originalDocPath);//判断文件是否哦存在,如果不存在直接返回,如果存在继续下面的操作if (file.exists()) {//获取到文档的名称String fileName = file.getName();docInfo.setOriginalFileName(fileName.substring(0,fileName.toLowerCase().indexOf(".doc")));//截取上传文件的后面那一串路径String fileRelativePath = originalDocPath.substring(uploadPath.length()); docInfo.setOriginalDocPath(fileRelativePath);//判断文件后缀if (fileName.endsWith(".doc")) {//1、处理word文档,并将word文档存储在相应的位置上,将word存储成htmlString outPutFilePath = Pinyin4jUtils.toPinYin(outPutFolderPath + fileRelativePath.replace(".doc", ".html"),PinyinType.LOWERCASE);outPutFilePath = 
ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", "");//下面是经过处理后的文件存储位置 String filePathAfterHandled = ReadWordUtils.doc2Html(originalDocPath,outPutFilePath); docInfo.setHtmlDocPath(filePathAfterHandled);} else {//1、处理word文档,并将word文档存储在相应的位置上,将word存储成html//1、处理word文档,并将word文档存储在相应的位置上,将word存储成htmlString outPutFilePath = Pinyin4jUtils.toPinYin(outPutFolderPath + fileRelativePath.replace(".docx", ".html"),PinyinType.LOWERCASE);outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/");outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", "");//下面是经过处理后的文件存储位置 String filePathAfterHandled = ReadWordUtils.docx2Html(originalDocPath, outPutFilePath); docInfo.setHtmlDocPath(filePathAfterHandled);}return null;} else {return null;}} catch (Exception e) {e.printStackTrace();}return null;}public static void main(String[] args) {String uploadPath = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH");String outPutFolderPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_OUTPUT_PATH");String docsPreviewPrefix = ExtendedServerConfig.getInstance().getStringProperty("DOCS_PREVIEW_PREFIX");//new DocInfoHandlerServiceImpl().handlerSingleDocInfo(//UUIDGenerator.generate(), //uploadPath + "/双创项目申报书20170301/国家大学生创新训练计划项目申请书华师大.doc",//uploadPath,//outPutFolderPath);//new DocInfoHandlerServiceImpl().handlerSingleDocInfo(//UUIDGenerator.generate(), //uploadPath + "/双创项目申报书20170301/国家级大学生创新创业训练计划 立项申请书 上海电力学院.doc",//uploadPath,//outPutFolderPath,//docsPreviewPrefix);new DocInfoHandlerServiceImpl().handlerSingleDocInfo(UUIDGenerator.generate(), uploadPath + "/双创项目申报书20170301/专题产品需求规格说明书.docx",uploadPath,outPutFolderPath,docsPreviewPrefix);}}
下面是所有用到的参数配置:
# 上传的文件的存储位置的配置,统一的最后面不要加斜杠
UPLOAD_PATH=D:/installed/apache-tomcat-7.0.47/webapps/upload
#
# 处理后的文档输出位置,统一的最后面不要加斜杠
DOCS_OUTPUT_PATH=D:/installed/apache-tomcat-7.0.47/webapps/docs-output-path
#
# 文档预览路径,注意最后面不要加斜杠
DOCS_PREVIEW_PREFIX=/docs-output-path
#
# 处理文档时,生成的一些图片的临时存储路径,最后面不要加斜杠
DOCS_TEMP_PATH=D:/installed/apache-tomcat-7.0.47/webapps/temp
0 0
- 将Doc或者Docx文档处理成html的代码逻辑;统计word中的字数,段数,句数,读取word中文档内容的代码逻辑
- 将word文档转化为html(代码)
- 编程读取文档Doc,Docx,Pdf的内容
- 通过代码将Word 2007 template (dotx)文档转换Word 2007 (docx)文档
- java读取word格式.doc或者.docx中的内容(APACHE POI)
- 一个统计Word文档中行数的VBA例程
- ASP.NET将word文档转换成pdf的代码
- asp.net将word文档转换成pdf的代码
- ASP.NET将word文档转换成pdf的代码
- 不启动Word查看文档字数统计的方法
- php读取word\pdf等文档的内容,并将其保存到网页中
- 将数据库的内容生成WORD文档
- java POI word的docx文档中的文字替换,并把docx转成pdf文档
- 将WORD文档转换成为HTML网页文件的C#代码
- 通过ruby代码,将指定的WORD文档转换为HTML:
- python如何处理解析word文档doc docx , python-docx,python-docx2txt,zipfile
- 关于js的callback回调函数的理解----回调函数的处理逻辑理解:所谓的回调函数处理逻辑,其实就是先将回调函数的代码 冻结(或者理解为闲置),接着将这个回调函数的代码放到回调函数管理器的
- 将html内容写入word文档
- C++全局和局部随机洗牌算法
- 最右技术问答的一点个人见解
- error LNK2019: 无法解析的外部符号 _WinMain@16,该符号在函数 ___tmainCRTStartup 中被引用
- docker常用指令简介(干货)
- php设计模式之装饰器
- 将Doc或者Docx文档处理成html的代码逻辑;统计word中的字数,段数,句数,读取word中文档内容的代码逻辑
- 2月英语总结
- rabbitmq学习之路(一)安装以及简单使用
- C语言-婚礼上的谎言
- perl学习笔记——数据类型
- Kubernetes学习笔记(二):网络原理
- hdu 2036 改革春风吹满地【多边形面积】
- ubuntu 查看uuid
- 腾讯面试题-0到9999这1万个数中有多少个数字7