使用 Tika 提取 PDF 文本内容,并用 IKAnalyzer 进行分词处理。

来源:互联网 发布:录音软件audition 编辑:程序博客网 时间:2024/04/28 12:25

package test;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * 此类用于提取pdf文件的文本内容
 *
 * @author gujie
 *
 */
public class TikaUtil {

   public String getBody(File file) throws Exception {


     Parser parser = new AutoDetectParser();


     InputStream input = new FileInputStream(file);


     Metadata meta = new Metadata();


     System.out.println(meta.get(Metadata.CONTENT_ENCODING));


     ContentHandler handler = new BodyContentHandler();


     parser.parse(input, handler, meta, new ParseContext());


     return handler.toString();
 }

 public static void main(String[] args) {


  try {


   System.out.println(new TikaUtil().getBody(new File("f:\\哈哈哈哈.pdf")));


  } catch (Exception e) {


   e.printStackTrace();


  }
 }

}

 

 

package test;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;

/**
 * 此类用于处理分词
 * @author gujie
 *
 */
public class IKAnalyzerTest {

 /**
  * @param args
  */
 public static void main(String[] args) throws Exception {

  long start = System.currentTimeMillis();


  IKSegmentation ikSeg = new IKSegmentation(new StringReader(new TikaUtil().getBody(new File("f:\\哈哈哈哈.pdf"))) ,true);


  long end = System.currentTimeMillis();


  try {


   Lexeme l = null;


   while( (l = ikSeg.next()) != null){


    System.out.println(l.getLexemeText());//循环打印出分词之后的结果


   }


  } catch (IOException e) {


   e.printStackTrace();


  }
  System.out.println("耗时:"+(end-start)+"毫秒");
 }

}


 

原创粉丝点击