【tika】读取各个文件数据

来源:互联网 发布:手机建站cms 编辑:程序博客网 时间:2024/06/06 12:22

首先我们到官网下载tika的jar包

1.将jar包引入到项目中:



接下来我们就可以进行数据的读取

1.读取普通文本

package cn.qblank.tika;

import java.io.File;

import org.apache.tika.Tika;

/**
 * Extracts the text of a plain-text file through the Tika facade class
 * and prints it to standard output.
 *
 * @author evan_qb
 */
public class ReadText {

    /**
     * Parses {@code D:/test/test.txt} to a string, re-decodes it, and prints it.
     *
     * @param args unused
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        Tika facade = new Tika();
        String extracted = facade.parseToString(new File("D:/test/test.txt"));

        // Re-interpret the extracted characters: encode them as ISO-8859-1 bytes,
        // then decode those bytes as GBK.
        // NOTE(review): presumably a workaround for the file's GBK bytes having been
        // decoded as ISO-8859-1 - confirm against the actual input file.
        byte[] latin1Bytes = extracted.getBytes("ISO-8859-1");
        String content = new String(latin1Bytes, "gbk");

        System.out.println("文件内容为:\n" + content);
    }
}




2.读取xml的数据

package cn.qblank.tika;

import java.io.File;
import java.io.FileInputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Reads an XML file with Tika's {@link XMLParser} and prints the extracted
 * text followed by every metadata entry the parser recorded.
 *
 * @author Administrator
 */
public class ReaderXML {

    /**
     * Parses {@code d:/test/contact.xml} and prints its text content and metadata.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened, read, or parsed
     */
    public static void main(String[] args) throws Exception {
        // Collects the text content emitted by the parser.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();

        // Parse with the dedicated XML parser. try-with-resources closes the
        // stream even when parsing fails; the original never closed it.
        XMLParser xmlparser = new XMLParser();
        try (FileInputStream inputstream = new FileInputStream(new File("d:/test/contact.xml"))) {
            xmlparser.parse(inputstream, handler, metadata, pcontext);
        }

        System.out.println("XML文件内容:\n" + handler.toString());
        System.out.println("元数据内容:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            System.out.println(name + ": " + metadata.get(name));
        }
    }
}

xml文件:


读取结果:




3.读取html

package cn.qblank.tika;

import java.io.File;
import java.io.FileInputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Reads an HTML file with Tika's {@link HtmlParser} and prints the extracted
 * text followed by every metadata entry the parser recorded.
 *
 * @author Administrator
 */
public class ReaderXHTML {

    /**
     * Parses {@code D:/test/test1.html} and prints its text content and metadata.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened, read, or parsed
     */
    public static void main(String[] args) throws Exception {
        // Collects the text content emitted by the parser.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();

        // Parse with the dedicated HTML parser. try-with-resources closes the
        // stream even when parsing fails; the original never closed it.
        HtmlParser htmlparser = new HtmlParser();
        try (FileInputStream inputstream = new FileInputStream(new File("D:/test/test1.html"))) {
            htmlparser.parse(inputstream, handler, metadata, pcontext);
        }

        System.out.println("文档内容:\n" + handler.toString());
        System.out.println("元数据:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            System.out.println(name + ":   " + metadata.get(name));
        }
    }
}

html文件如下:


读取结果如下:



4.读取Excel文档:

package cn.qblank.tika;

import java.io.File;
import java.io.FileInputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Tika getting-started example: reads an Excel (.xlsx) workbook with
 * {@link OOXMLParser} and prints the extracted text followed by every
 * metadata entry the parser recorded.
 *
 * @author evan_qb
 */
public class ReaderSheet {

    /**
     * Parses {@code d:/Writesheet.xlsx} and prints its text content and metadata.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened, read, or parsed
     */
    public static void main(String[] args) throws Exception {
        // Collects the text content emitted by the parser.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();

        // Parse with the OOXML parser. try-with-resources closes the stream
        // even when parsing fails; the original never closed it.
        OOXMLParser msofficeparser = new OOXMLParser();
        try (FileInputStream inputstream = new FileInputStream(new File("d:/Writesheet.xlsx"))) {
            msofficeparser.parse(inputstream, handler, metadata, pcontext);
        }

        System.out.println("数据内容:\n" + handler.toString());
        System.out.println("元数据:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            System.out.println(name + ": " + metadata.get(name));
        }
    }
}

Excel数据:



读取数据:




参考文件: http://www.yiibai.com/tika/tika_architecture.html