解析大数据量文本
来源:互联网 发布:金蝶软件标准版破解 编辑:程序博客网 时间:2024/05/04 12:47
--------------------------------------------------------------------我不懂什么坚持,只是死撑而已
解析大数据量文本
POI解析xls
public abstract class HxlsAbstract implements HSSFListener { private int minColumns; private POIFSFileSystem fs; private PrintStream output; private int lastRowNumber; private int lastColumnNumber; /** Should we output the formula, or the value it has? */ private boolean outputFormulaValues = true; /** For parsing Formulas */ private SheetRecordCollectingListener workbookBuildingListener; private HSSFWorkbook stubWorkbook; // Records we pick up as we process private SSTRecord sstRecord; private FormatTrackingHSSFListener formatListener; /** So we known which sheet we're on */ private int sheetIndex = -1; private BoundSheetRecord[] orderedBSRs; @SuppressWarnings("rawtypes")private ArrayList boundSheetRecords = new ArrayList(); // For handling formulas with string results private int nextRow; private int nextColumn; private boolean outputNextStringRecord; private int curRow; private List<String> rowlist; @SuppressWarnings( "unused") private String sheetName; public HxlsAbstract(){ super(); this.output = System.out; this.minColumns = -1; this.curRow = 0; this.rowlist = new ArrayList<String>(); } //excel记录行操作方法,以行索引和行元素列表为参数,对一行元素进行操作,元素为String类型 // public abstract void optRows(int curRow, List<String> rowlist) throws SQLException ; //excel记录行操作方法,以sheet索引,行索引和行元素列表为参数,对sheet的一行元素进行操作,元素为String类型 public abstract void optRows(int sheetIndex,int curRow, List<String> rowlist) throws SQLException; /** * 遍历 excel 文件 */ public void process(String filename) throws IOException { this.fs = new POIFSFileSystem(new FileInputStream(filename)); MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); formatListener = new FormatTrackingHSSFListener(listener); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); if (outputFormulaValues) { request.addListenerForAllRecords(formatListener); } else { workbookBuildingListener = new SheetRecordCollectingListener(formatListener); 
request.addListenerForAllRecords(workbookBuildingListener); } factory.processWorkbookEvents(request, fs); } /** * HSSFListener 监听方法,处理 Record */ @SuppressWarnings("unchecked") public void processRecord(Record record) { int thisRow = -1; int thisColumn = -1; String thisStr = null; String value = null; switch (record.getSid()) { case BoundSheetRecord.sid: boundSheetRecords.add(record); break; case BOFRecord.sid: BOFRecord br = (BOFRecord) record; if (br.getType() == BOFRecord.TYPE_WORKSHEET) { // Create sub workbook if required if (workbookBuildingListener != null && stubWorkbook == null) { stubWorkbook = workbookBuildingListener .getStubHSSFWorkbook(); } // Works by ordering the BSRs by the location of // their BOFRecords, and then knowing that we // process BOFRecords in byte offset order sheetIndex++; if (orderedBSRs == null) { orderedBSRs = BoundSheetRecord .orderByBofPosition(boundSheetRecords); } sheetName = orderedBSRs[sheetIndex].getSheetname(); } break; case SSTRecord.sid: sstRecord = (SSTRecord) record; break; case BlankRecord.sid: BlankRecord brec = (BlankRecord) record; thisRow = brec.getRow(); thisColumn = brec.getColumn(); thisStr = ""; break; case BoolErrRecord.sid: BoolErrRecord berec = (BoolErrRecord) record; thisRow = berec.getRow(); thisColumn = berec.getColumn(); thisStr = ""; break; case FormulaRecord.sid: FormulaRecord frec = (FormulaRecord) record; thisRow = frec.getRow(); thisColumn = frec.getColumn(); if (outputFormulaValues) { if (Double.isNaN(frec.getValue())) { // Formula result is a string // This is stored in the next record outputNextStringRecord = true; nextRow = frec.getRow(); nextColumn = frec.getColumn(); } else { thisStr = formatListener.formatNumberDateCell(frec); } } else { thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"'; } break; case StringRecord.sid: if (outputNextStringRecord) { // String for formula StringRecord srec = (StringRecord) record; thisStr = srec.getString(); 
thisRow = nextRow; thisColumn = nextColumn; outputNextStringRecord = false; } break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; curRow = thisRow = lrec.getRow(); thisColumn = lrec.getColumn(); value = lrec.getValue().trim(); value = value.equals("")?" ":value; this.rowlist.add(thisColumn, value); break; case LabelSSTRecord.sid: LabelSSTRecord lsrec = (LabelSSTRecord) record; curRow = thisRow = lsrec.getRow(); thisColumn = lsrec.getColumn(); if (sstRecord == null) { rowlist.add(thisColumn, " "); } else { value = sstRecord .getString(lsrec.getSSTIndex()).toString().trim(); value = value.equals("")?" ":value; rowlist.add(thisColumn,value); } break; case NoteRecord.sid: NoteRecord nrec = (NoteRecord) record; thisRow = nrec.getRow(); thisColumn = nrec.getColumn(); // TODO: Find object to match nrec.getShapeId() thisStr = '"' + "(TODO)" + '"'; break; case NumberRecord.sid: NumberRecord numrec = (NumberRecord) record; curRow = thisRow = numrec.getRow(); thisColumn = numrec.getColumn(); value = formatListener.formatNumberDateCell(numrec).trim(); value = value.equals("")?" 
":value; // Format rowlist.add(thisColumn, value); break; case RKRecord.sid: RKRecord rkrec = (RKRecord) record; thisRow = rkrec.getRow(); thisColumn = rkrec.getColumn(); thisStr = '"' + "(TODO)" + '"'; break; default: break; } // 遇到新行的操作 if (thisRow != -1 && thisRow != lastRowNumber) { lastColumnNumber = -1; } // 空值的操作 if (record instanceof MissingCellDummyRecord) { MissingCellDummyRecord mc = (MissingCellDummyRecord) record; curRow = thisRow = mc.getRow(); thisColumn = mc.getColumn(); rowlist.add(thisColumn," "); } // 如果遇到能打印的东西,在这里打印 if (thisStr != null) { if (thisColumn > 0) { output.print(','); } output.print(thisStr); } // 更新行和列的值 if (thisRow > -1) lastRowNumber = thisRow; if (thisColumn > -1) lastColumnNumber = thisColumn; // 行结束时的操作 if (record instanceof LastCellOfRowDummyRecord) { if (minColumns > 0) { // 列值重新置空 if (lastColumnNumber == -1) { lastColumnNumber = 0; } } // 行结束时, 调用 optRows() 方法 lastColumnNumber = -1; try { optRows(sheetIndex,curRow, rowlist); } catch (SQLException e) { e.printStackTrace(); } rowlist.clear(); } } }
POI解析xlsx
public abstract class HxlsxAbstract extends DefaultHandler { private SharedStringsTable sst; private String lastContents; private boolean nextIsString; private int sheetIndex = -1; private List<String> rowlist = new ArrayList<String>(); private int curRow = 0; private int curCol = 0; /** * 读取第一个工作簿的入口方法 * @param path */ public void readOneSheet(String path) throws Exception { OPCPackage pkg = OPCPackage.open(path); XSSFReader r = new XSSFReader(pkg); SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = fetchSheetParser(sst); InputStream sheet = r.getSheet("rId1"); InputSource sheetSource = new InputSource(sheet); parser.parse(sheetSource); sheet.close(); } /** * 读取所有工作簿的入口方法 * @param path * @throws Exception */ public void process(String path) throws Exception { OPCPackage pkg = OPCPackage.open(path); XSSFReader r = new XSSFReader(pkg); SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = fetchSheetParser(sst); Iterator<InputStream> sheets = r.getSheetsData(); while (sheets.hasNext()) { curRow = 0; sheetIndex++; InputStream sheet = sheets.next(); InputSource sheetSource = new InputSource(sheet); parser.parse(sheetSource); sheet.close(); } } /** * 该方法自动被调用,每读一行调用一次,在方法中写自己的业务逻辑即可 * @param sheetIndex 工作簿序号 * @param curRow 处理到第几行 * @param rowList 当前数据行的数据集合 */ public abstract void optRow(int sheetIndex, int curRow, List<String> rowList); public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException { XMLReader parser = XMLReaderFactory .createXMLReader(); this.sst = sst; parser.setContentHandler(this); return parser; } public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { // c => 单元格 if (name.equals("c")) { // 如果下一个元素是 SST 的索引,则将nextIsString标记为true String cellType = attributes.getValue("t"); if (cellType != null && cellType.equals("s")) { nextIsString = true; } else { nextIsString = false; } } // 置空 lastContents = ""; } public void endElement(String uri, 
String localName, String name) throws SAXException { // 根据SST的索引值的到单元格的真正要存储的字符串 // 这时characters()方法可能会被调用多次 if (nextIsString) { try { int idx = Integer.parseInt(lastContents); lastContents = new XSSFRichTextString(sst.getEntryAt(idx)) .toString(); } catch (Exception e) { } } // v => 单元格的值,如果单元格是字符串则v标签的值为该字符串在SST中的索引 // 将单元格内容加入rowlist中,在这之前先去掉字符串前后的空白符 if (name.equals("v")) { String value = lastContents.trim(); value = value.equals("") ? " " : value; rowlist.add(curCol, value); curCol++; } else { // 如果标签名称为 row ,这说明已到行尾,调用 optRows() 方法 if (name.equals("row")) { optRow(sheetIndex, curRow, rowlist); rowlist.clear(); curRow++; curCol = 0; } } } public void characters(char[] ch, int start, int length) throws SAXException { // 得到单元格内容的值 lastContents += new String(ch, start, length); } }
解析csv
/**
 * Line-by-line reader for simple comma-separated text files.
 *
 * <p>Subclasses implement {@link #optRows(int, List)}, which is invoked once
 * per line, and start the read with {@link #process(String)}.
 *
 * <p>Lines are split on every comma; quoted fields containing commas are not
 * supported.
 */
public abstract class HcsvAbstract {

    /** Charset used by {@link #process(String)}. */
    private static final String DEFAULT_CHARSET = "gbk";

    /**
     * Zero-based index of the line most recently read, or -1 before the first
     * line. It is not reset between calls to {@code process}.
     */
    public int lineIndex = -1;

    /**
     * Reads the file as GBK text and passes every line to
     * {@link #optRows(int, List)}.
     *
     * @param filename path of the CSV file
     * @throws IOException if the file cannot be opened or read
     */
    public void process(String filename) throws IOException {
        process(filename, DEFAULT_CHARSET);
    }

    /**
     * Reads the file with the given charset and passes every line to
     * {@link #optRows(int, List)}. The first line is treated like any other;
     * skip a header inside the callback if needed.
     *
     * @param filename    path of the CSV file
     * @param charsetName name of the charset used to decode the file
     * @throws IOException if the file cannot be opened or read, or the charset
     *                     name is not supported
     */
    public void process(String filename, String charsetName) throws IOException {
        InputStream is = new FileInputStream(filename);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, charsetName));
            String line;
            while ((line = reader.readLine()) != null) {
                lineIndex++;
                // Limit -1 keeps trailing empty fields, so "a,b," yields three
                // columns instead of silently dropping the last one.
                String[] items = line.split(",", -1);
                List<String> lineList = new ArrayList<String>(items.length);
                for (int i = 0; i < items.length; i++) {
                    lineList.add(items[i]);
                }
                optRows(lineIndex, lineList);
            }
        } finally {
            // Closing the underlying stream releases the file even if the
            // callback or the decoder fails.
            is.close();
        }
    }

    /**
     * Row callback, invoked once for every line read.
     *
     * @param lineIndex zero-based index of the line
     * @param lineList  fields of the line, in order
     */
    public abstract void optRows(int lineIndex, List<String> lineList);
}
以上为3个抽象类。需要解析文件时,继承对应的抽象类并重写其中的抽象方法(HxlsAbstract 和 HcsvAbstract 中为 optRows,HxlsxAbstract 中为 optRow)。每读完一行,该方法会被自动调用一次,参数为当前行的位置信息(sheet 序号、行号)以及该行各单元格内容的列表,可在其中处理业务逻辑。读取文件时,调用父类的 process(String) 方法,参数为文件路径。
0 0
- 解析大数据量文本
- 对于大数据量的Json解析
- 大数据量10道面试题及解析
- Java 用sax解析大文本xml
- 利用fastjson解析大文本JSON
- 将txt文本读入C中(txt数据量大),将C中数据写入txt
- 最快,最具可扩展性的文本导入方法 –大数据量加载最佳实践
- C#实现大数据量TXT文本数据快速高效去重
- 大数据量处理
- 数据库大数据量处理
- 大数据量处理
- CC1100大数据量通过
- 大数据量inset处理
- 大数据量分页
- 大数据量分页显示
- 大数据量更新SQL
- jxl 处理大数据量
- DataFactory生成大数据量
- Building for UN
- H5游戏引擎为什么选择egret和怎么学习egret
- 创建Maven项目下的Dubbo+Zookeeper框架
- css3 clip-path
- android之Itent.ACTION_PICK Intent.ACTION_GET_CONTENT妙用
- 解析大数据量文本
- Java volatite 关键字的理解
- Dubbo与Zookeeper、SpringMVC整合和使用(负载均衡、容错)
- 349. Intersection of Two Arrays*
- Android事件分发机制完全解析,带你从源码的角度彻底理解(上)
- Linux Samba服务器配置
- 转-Spring Boot 快速入门
- angularjs 验证(w5cValidator 2.0 验证信息框架的封装)
- Hibernate3中持久化对象的状态