poi读取doc、ppt、pptx、xls、xlsx文件的内容，pdfbox读取pdf内容，读取txt文件内容

来源：互联网　发布：linux下增加mysql密码　编辑：程序博客网　时间：2024/05/17 07:03

poi

<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.10-FINAL</version>

</dependency>

pdfbox

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.4</version>
</dependency>

public class FileUtils {    private static POIFSFileSystem fs;    private static HSSFWorkbook wb;    private static HSSFSheet sheet;    private static HSSFRow row;    /**     * 读取Excel表格表头的内容     * @param     * @return String 表头内容的数组     */    public static String[] readExcelTitle(String filePath) {        InputStream is = null;        try {            is = new FileInputStream(filePath);            fs = new POIFSFileSystem(is);            wb = new HSSFWorkbook(fs);        } catch (IOException e) {            e.printStackTrace();        }        sheet = wb.getSheetAt(0);        row = sheet.getRow(0);        // 标题总列数        int colNum = row.getPhysicalNumberOfCells();        System.out.println("colNum:" + colNum);        String[] title = new String[colNum];        for (int i = 0; i < colNum; i++) {            //title[i] = getStringCellValue(row.getCell((short) i));            title[i] = getCellFormatValue(row.getCell((short) i));        }        return title;    }    /**     * 读取Excel数据内容     * @param     * @return Map 包含单元格数据内容的Map对象     */    public static Map readExcelContent(String filePath) {        InputStream is = null;        Map content = new HashMap();        String str = "";        try {            is = new FileInputStream(filePath);            fs = new POIFSFileSystem(is);            wb = new HSSFWorkbook(fs);        } catch (IOException e) {            e.printStackTrace();        }        sheet = wb.getSheetAt(0);        // 得到总行数        int rowNum = sheet.getLastRowNum();        row = sheet.getRow(0);        int colNum = row.getPhysicalNumberOfCells();        // 正文内容应该从第二行开始,第一行为表头的标题        for (int i = 1; i <= rowNum; i++) {            row = sheet.getRow(i);            int j = 0;            while (j < colNum) {                // 每个单元格的数据内容用"-"分割开，以后需要时用String类的replace()方法还原数据                // 也可以将每个单元格的数据设置到一个javabean的属性中，此时需要新建一个javabean                // str += getStringCellValue(row.getCell((short) j)).trim() +                // "-";                
str += getCellFormatValue(row.getCell((short) j)).trim() + "    ";                j++;            }            content.put(i, str);            str = "";        }        return content;    }    /**     * 根据HSSFCell类型设置数据     * @param cell     * @return     */    private static String getCellFormatValue(HSSFCell cell) {        String cellvalue = "";        if (cell != null) {            // 判断当前Cell的Type            switch (cell.getCellType()) {                // 如果当前Cell的Type为NUMERIC                case HSSFCell.CELL_TYPE_NUMERIC:                case HSSFCell.CELL_TYPE_FORMULA: {                    // 判断当前的cell是否为Date                    if (HSSFDateUtil.isCellDateFormatted(cell)) {                        // 如果是Date类型则，转化为Data格式                        //方法1：这样子的data格式是带时分秒的：2011-10-12 0:00:00                        //cellvalue = cell.getDateCellValue().toLocaleString();                        //方法2：这样子的data格式是不带带时分秒的：2011-10-12                        Date date = cell.getDateCellValue();                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");                        cellvalue = sdf.format(date);                    }                    // 如果是纯数字                    else {                        // 取得当前Cell的数值                        cellvalue = String.valueOf(cell.getNumericCellValue());                    }                    break;                }                // 如果当前Cell的Type为STRIN                case HSSFCell.CELL_TYPE_STRING:                    // 取得当前的Cell字符串                    cellvalue = cell.getRichStringCellValue().getString();                    break;                // 默认的Cell值                default:                    cellvalue = " ";            }        } else {            cellvalue = "";        }        return cellvalue;    }    public static String readPPT(String filePath) {            String str = "";            InputStream is = null;//            PowerPointExtractor extractor = null;            try {                is = new 
FileInputStream(filePath);                SlideShow ss=new SlideShow(new HSLFSlideShow(is));                Slide[] slides=ss.getSlides();                for(int i=0;i paras = doc.getParagraphs();            for (XWPFParagraph para : paras) {                //当前段落的属性               str = str+para.getText();            }            //获取文档中所有的表格            List tables = doc.getTables();            List rows;            List cells;            for (XWPFTable table : tables) {                //获取表格对应的行                rows = table.getRows();                for (XWPFTableRow row : rows) {                    //获取行对应的单元格                    cells = row.getTableCells();                    for (XWPFTableCell cell : cells) {                        str = str+cell.getText();                    }                }            }            fis.close();        } catch (Exception e) {            e.printStackTrace();        }        return str;    }    public static String readXls(String filePath){        Map content = readExcelContent(filePath);        String[] title  = readExcelTitle(filePath);        String str = "";        for (int index = 0; index < title.length; index++)        {            str += title[index];        }        Iterator> iterator = content.entrySet().iterator();        while(iterator.hasNext()){            Map.Entry map = iterator.next();            String value = map.getValue();            str = str + value;        }        return str;    }    public static String readXlsx(String filePath ) {        String str = "";        try{            InputStream is = new FileInputStream(filePath);            // 构造 XSSFWorkbook 对象，strPath 传入文件路径            XSSFWorkbook xwb = new XSSFWorkbook(is);            // 读取第一章表格内容            XSSFSheet sheet = xwb.getSheetAt(0);            // 定义 row、cell            XSSFRow row;            String cell;            // 循环输出表格中的内容            for (int i = sheet.getFirstRowNum()+1; i < sheet.getPhysicalNumberOfRows(); i++) {                row = 
sheet.getRow(i);                for (int j = row.getFirstCellNum(); j < row.getPhysicalNumberOfCells(); j++) {                    // 通过 row.getCell(j).toString() 获取单元格内容，                    cell = row.getCell(j).toString();                    str = str+cell;                }            }        }catch(Exception e) {            e.printStackTrace();            System.out.println("已运行xlRead() : " + e );        }finally {            return str;        }    }

在使用过程中发现：读取ppt内容时，原先被注释掉的代码（PowerPointExtractor）是一次性读取整个ppt的内容，在读取某些文件时会报错；改为逐页读取则不会报错，具体原因尚不清楚。

阅读全文

0 0