JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例

来源:互联网 发布:一个月工资3000 知乎 编辑:程序博客网 时间:2024/05/30 04:30
以下是Java对几种文本文件内容读取代码。其中,OFFICE文档(WORD,EXCEL)使用了POI控件,PDF使用了PDFBOX控件。  点击这里 查看相关控件的下载地址和配置方法。

WORD
Java代码  收藏代码

package textReader;

import java.io.*;
import org.apache.poi.hwpf.extractor.WordExtractor;

/**
 * Extracts the plain text of a Word document using POI's {@code WordExtractor}.
 */
public class WordReader {
    public WordReader() {
    }

    /**
     * Reads the text content of a Word document.
     *
     * @param filePath path of the file to read
     * @return the text extracted from the document, or {@code null} if the
     *         file could not be opened or read (the exception is printed)
     */
    public String getTextFromWord(String filePath) {
        String result = null;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(filePath));
            WordExtractor wordExtractor = new WordExtractor(fis);
            result = wordExtractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the file handle, even when extraction fails.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }
}

EXCEL
Java代码  收藏代码

package textReader;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Extracts the cell text of an Excel (.xls) workbook using POI's HSSF API.
 */
public class ExcelReader {

    /**
     * Reads the numeric and string cells of every sheet in a workbook.
     * Cells of one row are separated by a tab and each non-null row ends with
     * a newline. Formula cells, and cells of any other type, are skipped.
     *
     * @param filePath path of the file to read
     * @return the collected cell text; whatever was collected before a
     *         failure (possibly the empty string) if the file could not be
     *         opened or read (the exception is printed)
     */
    @SuppressWarnings("deprecation")
    public String getTextFromExcel(String filePath) {
        StringBuilder buff = new StringBuilder();
        FileInputStream in = null;
        try {
            in = new FileInputStream(filePath);
            // Open the workbook.
            HSSFWorkbook wb = new HSSFWorkbook(in);
            // Walk every sheet of the workbook.
            for (int numSheets = 0; numSheets < wb.getNumberOfSheets(); numSheets++) {
                HSSFSheet aSheet = wb.getSheetAt(numSheets);
                if (aSheet == null) {
                    continue;
                }
                for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                    HSSFRow aRow = aSheet.getRow(rowNumOfSheet);
                    if (aRow == null) {
                        continue;
                    }
                    // getLastCellNum() is the index of the last cell PLUS ONE,
                    // so it is an exclusive upper bound.
                    for (int cellNumOfRow = 0; cellNumOfRow < aRow.getLastCellNum(); cellNumOfRow++) {
                        HSSFCell aCell = aRow.getCell(cellNumOfRow);
                        if (aCell == null) {
                            continue;
                        }
                        switch (aCell.getCellType()) {
                        case HSSFCell.CELL_TYPE_FORMULA:
                            // Formula cells are intentionally not exported.
                            break;
                        case HSSFCell.CELL_TYPE_NUMERIC:
                            buff.append(aCell.getNumericCellValue()).append('\t');
                            break;
                        case HSSFCell.CELL_TYPE_STRING:
                            buff.append(aCell.getStringCellValue()).append('\t');
                            break;
                        }
                    }
                    buff.append('\n');
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the file handle, even when parsing fails.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return buff.toString();
    }
}
<span style="white-space: normal;"> </span>

PDF
Java代码  收藏代码

package textReader;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.pdfbox.pdfparser.PDFParser;      import org.pdfbox.pdmodel.PDDocument;      import org.pdfbox.util.PDFTextStripper;                  public class PdfReader {          public PdfReader(){          }          /**          * @param filePath 文件路径          * @return 读出的pdf的内容          */          public String getTextFromPdf(String filePath) {              String result = null;              FileInputStream is = null;              PDDocument document = null;              try {                  is = new FileInputStream(filePath);                  PDFParser parser = new PDFParser(is);                  parser.parse();                  document = parser.getPDDocument();                  PDFTextStripper stripper = new PDFTextStripper();                  result = stripper.getText(document);              } catch (FileNotFoundException e) {                  e.printStackTrace();              } catch (IOException e) {                  e.printStackTrace();              } finally {                  if (is != null) {                      try {is.close();}catch(IOException e){e.printStackTrace();}                  }                  if (document != null) {                      try{document.close();}catch (IOException e){e.printStackTrace();}                  }              }              return result;          }            }     TXTJava代码  收藏代码    package textReader;      import java.io.*;                  public class TxtReader {          public TxtReader() {                  }          /**          * @param filePath 文件路径          * @return 读出的txt的内容          */          public String getTextFromTxt(String filePath) throws Exception {                            FileReader fr = new FileReader(filePath);              BufferedReader br = new BufferedReader(fr);              StringBuffer buff = new StringBuffer();              String temp = null;              while((temp = br.readLine()) != null){                  buff.append(temp + "\r\n");              }              
br.close();                   return buff.toString();               }      }  RTFJava代码  收藏代码    package textReader;      import java.io.File;      import java.io.FileInputStream;      import java.io.IOException;      import java.io.InputStream;            import javax.swing.text.BadLocationException;      import javax.swing.text.DefaultStyledDocument;      import javax.swing.text.rtf.RTFEditorKit;                  public class RtfReader {          public RtfReader(){          }          /**          * @param filePath 文件路径          * @return 读出的rtf的内容          */          public String getTextFromRtf(String filePath) {              String result = null;              File file = new File(filePath);              try {                         DefaultStyledDocument styledDoc = new DefaultStyledDocument();                  InputStream is = new FileInputStream(file);                  new RTFEditorKit().read(is, styledDoc, 0);                  result = new String(styledDoc.getText(0,styledDoc.getLength()).                                          
getBytes("ISO8859_1"));                  //提取文本,读取中文需要使用ISO8859_1编码,否则会出现乱码              } catch (IOException e) {                  e.printStackTrace();              } catch (BadLocationException e) {                  e.printStackTrace();              }              return result;          }         }  HTMLJava代码  收藏代码    package textReader;      import java.io.*;            public class HtmlReader {          public HtmlReader() {          }          /**          * @param filePath 文件路径          * @return 获得html的全部内容          */          public String readHtml(String filePath) {              BufferedReader br=null;              StringBuffer sb = new  StringBuffer();              try {                  br=new BufferedReader(new InputStreamReader(                                           new FileInputStream(filePath),  "GB2312"));                              String temp=null;                         while((temp=br.readLine())!=null){                      sb.append(temp);                  }                         } catch (FileNotFoundException e) {                  e.printStackTrace();              } catch (IOException e) {                  e.printStackTrace();              }              return sb.toString();          }          /**          * @param filePath 文件路径          * @return 获得的html文本内容          */          public String getTextFromHtml(String filePath) {              //得到body标签中的内容              String str= readHtml(filePath);              StringBuffer buff = new StringBuffer();              int maxindex = str.length() - 1;              int begin = 0;              int end;                          //截取>和<之间的内容              while((begin = str.indexOf('>',begin)) < maxindex){                             end = str.indexOf('<',begin);                  if(end - begin > 1){                      buff.append(str.substring(++begin, end));                                 }                             begin = end+1;              };                    return 
buff.toString();          }            }   注意 :若使用WPS编辑相关文档,会有错误提示,应避免。错误文本提示如下:WORD Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!EXCEL java.lang.RuntimeException: Expected an EXTERNSHEET record but got (org.apache.poi.hssf.record.SSTRecord)at org.apache.poi.hssf.model.LinkTable.readExtSheetRecord(LinkTable.java:187)at org.apache.poi.hssf.model.LinkTable.<init>(LinkTable.java:163)at org.apache.poi.hssf.model.Workbook.createWorkbook(Workbook.java:199)at org.apache.poi.hssf.usermodel.HSSFWorkbook.<init>(HSSFWorkbook.java:273)at org.apache.poi.hssf.usermodel.HSSFWorkbook.<init>(HSSFWorkbook.java:196)at org.apache.poi.hssf.usermodel.HSSFWorkbook.<init>(HSSFWorkbook.java:312)at org.apache.poi.hssf.usermodel.HSSFWorkbook.<init>(HSSFWorkbook.java:293)at textReader.ExcelReader.getTextFromExcel(ExcelReader.java:23)at DocumentInfo.getContent(DocumentInfo.java:86)at MainFunction.main(MainFunction.java:19)RTF java.io.IOException: Too many close-groups in RTF textat javax.swing.text.rtf.RTFParser.write(Unknown Source)at javax.swing.text.rtf.RTFParser.writeSpecial(Unknown Source)at javax.swing.text.rtf.AbstractFilter.write(Unknown Source)at javax.swing.text.rtf.AbstractFilter.readFromStream(Unknown Source)at javax.swing.text.rtf.RTFEditorKit.read(Unknown Source)at textReader.RtfReader.getTextFromRtf(RtfReader.java:25)at DocumentInfo.getContent(DocumentInfo.java:74)at MainFunction.main(MainFunction.java:19) 顺便说一下,这里为什么会在write出错呢?因为 level是根据{和}来进行自增和自减的,当括号不匹配的时候就会提示该错误。 wps编辑rtf文件在格式上出了问题,{和}不匹配。 但用word或写字板下新建文件,编辑后另存为rtf文件(wps下不支持),用记事本打开可以发现添加了很多格式说明,但是{和}是匹配的,这样才不会报错。下面是具体说明:http://www.chinaitpower.com/source/jdk122/javax/swing/text/rtf/RTFParser.java.html (完)


0 0
原创粉丝点击