解析大数据量文本

来源：互联网 | 发布：金蝶软件标准版破解 | 编辑：程序博客网 | 时间：2024/05/04 12:47

--------------------------------------------------------------------我不懂什么坚持，只是死撑而已

解析大数据量文本

POI解析xls

public abstract class HxlsAbstract implements HSSFListener {      private int minColumns;      private POIFSFileSystem fs;      private PrintStream output;        private int lastRowNumber;      private int lastColumnNumber;        /** Should we output the formula, or the value it has? */      private boolean outputFormulaValues = true;        /** For parsing Formulas */      private SheetRecordCollectingListener workbookBuildingListener;      private HSSFWorkbook stubWorkbook;        // Records we pick up as we process      private SSTRecord sstRecord;      private FormatTrackingHSSFListener formatListener;        /** So we known which sheet we're on */      private int sheetIndex = -1;      private BoundSheetRecord[] orderedBSRs;      @SuppressWarnings("rawtypes")private ArrayList boundSheetRecords = new ArrayList();        // For handling formulas with string results      private int nextRow;      private int nextColumn;      private boolean outputNextStringRecord;        private int curRow;      private List<String> rowlist;      @SuppressWarnings( "unused")      private String sheetName;          public HxlsAbstract(){    super();     this.output = System.out;           this.minColumns = -1;           this.curRow = 0;           this.rowlist = new ArrayList<String>();      }             //excel记录行操作方法，以行索引和行元素列表为参数，对一行元素进行操作，元素为String类型  //  public abstract void optRows(int curRow, List<String> rowlist) throws SQLException ;            //excel记录行操作方法，以sheet索引，行索引和行元素列表为参数，对sheet的一行元素进行操作，元素为String类型      public abstract void optRows(int sheetIndex,int curRow, List<String> rowlist) throws SQLException;            /**      * 遍历 excel 文件      */      public void process(String filename) throws IOException {      this.fs = new POIFSFileSystem(new FileInputStream(filename));        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);          formatListener = new FormatTrackingHSSFListener(listener);            HSSFEventFactory factory 
= new HSSFEventFactory();          HSSFRequest request = new HSSFRequest();            if (outputFormulaValues) {              request.addListenerForAllRecords(formatListener);          } else {              workbookBuildingListener = new SheetRecordCollectingListener(formatListener);              request.addListenerForAllRecords(workbookBuildingListener);          }            factory.processWorkbookEvents(request, fs);     }            /**      * HSSFListener 监听方法，处理 Record      */      @SuppressWarnings("unchecked")      public void processRecord(Record record) {          int thisRow = -1;          int thisColumn = -1;          String thisStr = null;          String value = null;                    switch (record.getSid()) {          case BoundSheetRecord.sid:              boundSheetRecords.add(record);              break;          case BOFRecord.sid:              BOFRecord br = (BOFRecord) record;              if (br.getType() == BOFRecord.TYPE_WORKSHEET) {                  // Create sub workbook if required                  if (workbookBuildingListener != null && stubWorkbook == null) {                      stubWorkbook = workbookBuildingListener                              .getStubHSSFWorkbook();                  }                    // Works by ordering the BSRs by the location of                  // their BOFRecords, and then knowing that we                  // process BOFRecords in byte offset order                  sheetIndex++;                  if (orderedBSRs == null) {                      orderedBSRs = BoundSheetRecord                              .orderByBofPosition(boundSheetRecords);                  }                  sheetName = orderedBSRs[sheetIndex].getSheetname();              }              break;            case SSTRecord.sid:              sstRecord = (SSTRecord) record;              break;            case BlankRecord.sid:              BlankRecord brec = (BlankRecord) record;                thisRow = brec.getRow();              thisColumn 
= brec.getColumn();              thisStr = "";              break;          case BoolErrRecord.sid:              BoolErrRecord berec = (BoolErrRecord) record;                thisRow = berec.getRow();              thisColumn = berec.getColumn();              thisStr = "";              break;            case FormulaRecord.sid:              FormulaRecord frec = (FormulaRecord) record;                thisRow = frec.getRow();              thisColumn = frec.getColumn();                if (outputFormulaValues) {                  if (Double.isNaN(frec.getValue())) {                      // Formula result is a string                      // This is stored in the next record                      outputNextStringRecord = true;                      nextRow = frec.getRow();                      nextColumn = frec.getColumn();                  } else {                      thisStr = formatListener.formatNumberDateCell(frec);                  }              } else {                  thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,                          frec.getParsedExpression()) + '"';              }              break;          case StringRecord.sid:              if (outputNextStringRecord) {                  // String for formula                  StringRecord srec = (StringRecord) record;                  thisStr = srec.getString();                  thisRow = nextRow;                  thisColumn = nextColumn;                  outputNextStringRecord = false;              }              break;            case LabelRecord.sid:              LabelRecord lrec = (LabelRecord) record;                curRow = thisRow = lrec.getRow();              thisColumn = lrec.getColumn();              value = lrec.getValue().trim();              value = value.equals("")?" 
":value;              this.rowlist.add(thisColumn, value);              break;          case LabelSSTRecord.sid:              LabelSSTRecord lsrec = (LabelSSTRecord) record;                curRow = thisRow = lsrec.getRow();              thisColumn = lsrec.getColumn();              if (sstRecord == null) {                  rowlist.add(thisColumn, " ");              } else {                  value =  sstRecord                  .getString(lsrec.getSSTIndex()).toString().trim();                  value = value.equals("")?" ":value;                  rowlist.add(thisColumn,value);              }              break;          case NoteRecord.sid:              NoteRecord nrec = (NoteRecord) record;                thisRow = nrec.getRow();              thisColumn = nrec.getColumn();              // TODO: Find object to match nrec.getShapeId()              thisStr = '"' + "(TODO)" + '"';              break;          case NumberRecord.sid:              NumberRecord numrec = (NumberRecord) record;                curRow = thisRow = numrec.getRow();              thisColumn = numrec.getColumn();              value = formatListener.formatNumberDateCell(numrec).trim();              value = value.equals("")?" 
":value;              // Format              rowlist.add(thisColumn, value);              break;          case RKRecord.sid:              RKRecord rkrec = (RKRecord) record;                thisRow = rkrec.getRow();              thisColumn = rkrec.getColumn();              thisStr = '"' + "(TODO)" + '"';              break;          default:              break;          }            // 遇到新行的操作          if (thisRow != -1 && thisRow != lastRowNumber) {              lastColumnNumber = -1;          }            // 空值的操作          if (record instanceof MissingCellDummyRecord) {              MissingCellDummyRecord mc = (MissingCellDummyRecord) record;              curRow = thisRow = mc.getRow();              thisColumn = mc.getColumn();              rowlist.add(thisColumn," ");          }            // 如果遇到能打印的东西，在这里打印          if (thisStr != null) {              if (thisColumn > 0) {                  output.print(',');              }              output.print(thisStr);          }            // 更新行和列的值          if (thisRow > -1)              lastRowNumber = thisRow;          if (thisColumn > -1)              lastColumnNumber = thisColumn;            // 行结束时的操作          if (record instanceof LastCellOfRowDummyRecord) {              if (minColumns > 0) {                  // 列值重新置空                  if (lastColumnNumber == -1) {                      lastColumnNumber = 0;                  }              }              // 行结束时， 调用 optRows() 方法              lastColumnNumber = -1;              try {                  optRows(sheetIndex,curRow, rowlist);              } catch (SQLException e) {                  e.printStackTrace();              }              rowlist.clear();          }      }  }

POI解析xlsx

public abstract class HxlsxAbstract extends DefaultHandler {            private SharedStringsTable sst;      private String lastContents;      private boolean nextIsString;       private int sheetIndex = -1;                 private  List<String> rowlist = new ArrayList<String>();      private int curRow = 0;      private int curCol = 0;     /**     * 读取第一个工作簿的入口方法     * @param path     */      public void readOneSheet(String path) throws Exception {          OPCPackage pkg = OPCPackage.open(path);               XSSFReader r = new XSSFReader(pkg);          SharedStringsTable sst = r.getSharedStringsTable();                        XMLReader parser = fetchSheetParser(sst);                        InputStream sheet = r.getSheet("rId1");           InputSource sheetSource = new InputSource(sheet);          parser.parse(sheetSource);                        sheet.close();               }                  /**     * 读取所有工作簿的入口方法     * @param path     * @throws Exception     */      public void process(String path) throws Exception {              OPCPackage pkg = OPCPackage.open(path);          XSSFReader r = new XSSFReader(pkg);          SharedStringsTable sst = r.getSharedStringsTable();           XMLReader parser = fetchSheetParser(sst);           Iterator<InputStream> sheets = r.getSheetsData();          while (sheets.hasNext()) {                       curRow = 0;              sheetIndex++;              InputStream sheet = sheets.next();                         InputSource sheetSource = new InputSource(sheet);              parser.parse(sheetSource);              sheet.close();                     }                     }            /**     * 该方法自动被调用，每读一行调用一次，在方法中写自己的业务逻辑即可     * @param sheetIndex 工作簿序号     * @param curRow 处理到第几行     * @param rowList 当前数据行的数据集合     */      public abstract void optRow(int sheetIndex, int curRow, List<String> rowList);                public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {              XMLReader parser 
= XMLReaderFactory                  .createXMLReader();          this.sst = sst;          parser.setContentHandler(this);          return parser;      }            public void startElement(String uri, String localName, String name,              Attributes attributes) throws SAXException {          // c => 单元格          if (name.equals("c")) {              // 如果下一个元素是 SST 的索引，则将nextIsString标记为true              String cellType = attributes.getValue("t");              if (cellType != null && cellType.equals("s")) {                  nextIsString = true;              } else {                  nextIsString = false;              }          }          // 置空          lastContents = "";      }                  public void endElement(String uri, String localName, String name)              throws SAXException {          // 根据SST的索引值的到单元格的真正要存储的字符串          // 这时characters()方法可能会被调用多次          if (nextIsString) {              try {                  int idx = Integer.parseInt(lastContents);                  lastContents = new XSSFRichTextString(sst.getEntryAt(idx))                          .toString();              } catch (Exception e) {               }          }           // v => 单元格的值，如果单元格是字符串则v标签的值为该字符串在SST中的索引          // 将单元格内容加入rowlist中，在这之前先去掉字符串前后的空白符          if (name.equals("v")) {              String value = lastContents.trim();              value = value.equals("") ? " " : value;              rowlist.add(curCol, value);              curCol++;          } else {              // 如果标签名称为 row ，这说明已到行尾，调用 optRows() 方法              if (name.equals("row")) {                  optRow(sheetIndex, curRow, rowlist);                  rowlist.clear();                  curRow++;                  curCol = 0;              }          }      }       public void characters(char[] ch, int start, int length)              throws SAXException {          // 得到单元格内容的值          lastContents += new String(ch, start, length);      }   }

解析csv

/**
 * Line-by-line reader for simple comma-separated files.
 *
 * <p>Subclasses implement {@link #optRows(int, List)}, which is invoked once per line.
 * Lines are split on every comma; quoted fields and embedded commas are not supported.
 */
public abstract class HcsvAbstract {

    /**
     * Zero-based index of the most recently read line; -1 before the first line.
     * It is not reset by {@code process}, so it keeps counting across calls.
     */
    public int lineIndex = -1;

    /**
     * Reads the file as GBK-encoded text and passes every line to
     * {@link #optRows(int, List)}. The first line is not skipped; ignore
     * {@code lineIndex == 0} in the callback if it is a header.
     *
     * @param filename path of the CSV file
     * @throws IOException if the file cannot be opened or read
     */
    public void process(String filename) throws IOException {
        process(filename, "gbk");
    }

    /**
     * Reads the file with the given charset and passes every line to
     * {@link #optRows(int, List)}.
     *
     * @param filename    path of the CSV file
     * @param charsetName name of the charset used to decode the file, e.g. {@code "UTF-8"}
     * @throws IOException if the file cannot be opened or read, or the charset is not supported
     */
    public void process(String filename, String charsetName) throws IOException {
        InputStream is = new FileInputStream(new File(filename));
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, charsetName));
            String line;
            while ((line = reader.readLine()) != null) {
                lineIndex++;
                // Limit -1 keeps trailing empty fields, so "1,," yields three columns;
                // the default limit would silently drop them.
                String[] items = line.split(",", -1);
                List<String> lineList = new ArrayList<String>(items.length);
                for (int i = 0; i < items.length; i++) {
                    lineList.add(items[i]);
                }
                optRows(lineIndex, lineList);
            }
        } finally {
            // The original never closed the file, leaking a handle per call.
            is.close();
        }
    }

    /**
     * Line callback.
     *
     * @param lineIndex zero-based index of the line
     * @param lineList  fields of the line in order; a new list for every call
     */
    public abstract void optRows(int lineIndex, List<String> lineList);
}

以上为 3 个抽象类。需要解析文件时，继承对应的抽象类并重写其中的抽象方法（HxlsAbstract 和 HcsvAbstract 中为 optRows，HxlsxAbstract 中为 optRow）。解析过程中每读完一行会自动调用一次该方法，参数为行号（xls/xlsx 还包括 sheet 序号）以及该行各单元格内容组成的列表，可在方法中编写业务逻辑。读取文件时，调用父类的 process(String filePath) 方法，参数为文件路径。

0 0