利用lucene获取tf-idf
来源:互联网 发布:软件研发项目管理制度 编辑:程序博客网 时间:2024/05/16 01:27
http://www.tianyalinfeng.com/post/530
package baike;import java.io.File;import java.util.List;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldType;import org.apache.lucene.index.AtomicReader;import org.apache.lucene.index.AtomicReaderContext;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.DocsEnum;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Terms;import org.apache.lucene.index.TermsEnum;import org.apache.lucene.search.DocIdSetIterator;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.BytesRef;import org.apache.lucene.util.Version;/** * 利用lucene获取tf-idf * * @author yong.chen * */public class LuceneTfIdfUtil { public static final String INDEX_PATH = "c:/testindex"; public void index() { try { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); Directory directory = FSDirectory.open(new File(INDEX_PATH)); IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); IndexWriter iwriter = new IndexWriter(directory, config); FieldType ft = new FieldType(); ft.setIndexed(true);// 存储 ft.setStored(true);// 索引 ft.setStoreTermVectors(true); ft.setTokenized(true); ft.setStoreTermVectorPositions(true);// 存储位置 ft.setStoreTermVectorOffsets(true);// 存储偏移量 Document doc = new Document(); String text = "This is the text to be indexed."; doc.add(new Field("text", text, ft)); iwriter.addDocument(doc); doc = new Document(); text = "I am the text to be stored."; doc.add(new Field("text", text, ft)); iwriter.addDocument(doc); iwriter.forceMerge(1);// 最后一定要合并为一个segment,不然无法计算idf iwriter.close(); } catch (Exception e) { } } /** * 读取索引,显示词频 * * **/ public void getTF() { try { Directory directroy = FSDirectory.open(new File( INDEX_PATH)); IndexReader reader = DirectoryReader.open(directroy); for (int i = 0; i < reader.numDocs(); i++) { int docId = i; System.out.println("第" + (i + 1) + "篇文档:"); Terms terms = reader.getTermVector(docId, "text"); if (terms == null) continue; TermsEnum termsEnum = terms.iterator(null); BytesRef thisTerm = null; while ((thisTerm = termsEnum.next()) != null) { String termText = thisTerm.utf8ToString(); DocsEnum docsEnum = termsEnum.docs(null, null); while ((docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { System.out.println("termText:" + termText + " TF: " + 1.0 * docsEnum.freq() / terms.size()); } } } reader.close(); directroy.close(); } catch (Exception e) { e.printStackTrace(); } } /* * 计算IDF * * * */ public void getIDF() { try { Directory directroy = FSDirectory.open(new File(INDEX_PATH)); IndexReader reader = DirectoryReader.open(directroy); List<AtomicReaderContext> list = reader.leaves(); System.out.println("文档总数 : " + reader.maxDoc()); for (AtomicReaderContext ar : list) { String field = "text"; AtomicReader areader = ar.reader(); Terms terms = areader.terms(field); TermsEnum tn = terms.iterator(null); BytesRef text; while ((text = tn.next()) != null) { System.out.println("field=" + field + "; text=" + text.utf8ToString() + " IDF : " + Math.log10(reader.maxDoc() * 1.0 / tn.docFreq()) // + " 全局词频 : " + tn.totalTermFreq() ); } } reader.close(); directroy.close(); } catch (Exception e) { e.printStackTrace(); } } public static void main(String[] args) { LuceneTfIdfUtil luceneTfIdfUtil = new LuceneTfIdfUtil(); // luceneTfIdfUtil.index(); luceneTfIdfUtil.getTF(); luceneTfIdfUtil.getIDF(); }}
0 0
- 利用lucene获取tf-idf
- 利用lucene获取tf-idf
- Lucene获取TF、IDF等信息
- Lucene TF-IDF 相关性算分公式
- 利用TF-IDF 提取文章关键词
- Python 使用nltk获取TF-IDF
- TF/IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- TF-IDF
- 类方法与实例方法的区别
- Microsoft Azure Premium Storage SSD Test
- 大小端模式与网络字节序
- java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Integer
- android 替换fragment不能全屏问题.
- 利用lucene获取tf-idf
- jQuery插件jqplot的详细配置说明和渲染器
- UIScrollView控件介绍
- 软件项目版本号的命名规则及格式
- ubuntu crontab设置编辑器
- 微信公众平台java开发详解(工程代码+解析)
- 土豆春季实习试题之惨烈教训
- OC视频笔记5.1(类目的概念和使用)5.2(延展)
- SAT阅读文章解题步骤和技巧