Simple file indexing with Lucene

package com.mylucene;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.LucenePackage;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class MyLuceneTest {

    /**
     * Build the index from the given items.
     * @param analyzer
     * @param directory
     * @param items
     * @return true if indexing succeeded
     */
    private boolean buildIndexer(Analyzer analyzer, Directory directory, List<Item> items) {
        IndexWriter iwriter = null;
        try {
            // Configure the index writer
            iwriter = new IndexWriter(directory,
                    new IndexWriterConfig(Version.LUCENE_47, analyzer));
            // Delete all existing documents
            iwriter.deleteAll();
            // Store each item as a document; every bean property becomes a stored text field
            for (Item item : items) {
                Document doc = new Document();
                java.lang.reflect.Field[] fields = item.getClass().getDeclaredFields();
                for (java.lang.reflect.Field field : fields) {
                    String fieldName = field.getName();
                    String getMethodName = "get" + toFirstLetterUpperCase(fieldName);
                    Object obj = item.getClass().getMethod(getMethodName).invoke(item);
                    doc.add(new Field(fieldName, (String) obj, TextField.TYPE_STORED));
                }
                iwriter.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                if (iwriter != null) {
                    iwriter.close();
                }
            } catch (IOException e) {
            }
        }
        return true;
    }

    /**
     * Search the index for the given keyword.
     * @param analyzer
     * @param directory
     * @param keyword
     * @return the matching items, or null on error
     */
    public List<Item> searchIndexer(Analyzer analyzer, Directory directory, String keyword) {
        DirectoryReader ireader = null;
        List<Item> result = new ArrayList<Item>();
        try {
            // Open the index for searching
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);
            // Search across all fields of Item
            java.lang.reflect.Field[] fields = Item.class.getDeclaredFields();
            String[] multiFields = new String[fields.length];
            for (int i = 0; i < fields.length; i++) {
                multiFields[i] = fields[i].getName();
            }
            MultiFieldQueryParser parser = new MultiFieldQueryParser(
                    Version.LUCENE_47, multiFields, analyzer);
            // Parse the concrete search terms
            Query query = parser.parse(keyword);
            ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;
            // Map each hit back to an Item via its setters
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = isearcher.doc(hits[i].doc);
                Item item = new Item();
                for (String field : multiFields) {
                    String setMethodName = "set" + toFirstLetterUpperCase(field);
                    item.getClass().getMethod(setMethodName, String.class)
                            .invoke(item, hitDoc.get(field));
                }
                result.add(item);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            try {
                if (ireader != null) {
                    ireader.close();
                }
                directory.close();
            } catch (IOException e) {
            }
        }
        return result;
    }

    /**
     * Capitalize the first letter of a string.
     */
    public static String toFirstLetterUpperCase(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        return str.substring(0, 1).toUpperCase() + str.substring(1);
    }

    public static void main(String[] args) throws Exception {
        System.out.println(LucenePackage.get());
        MyLuceneTest demo = new MyLuceneTest();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        // Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);

        List<Item> items = new ArrayList<Item>();
        /*
        items.add(new Item("1", "中国", "This is the text to be greatly indexed."));
        items.add(new Item("2", "second", "This is great"));
        items.add(new Item("3", "third", "I love apple and pear. "));
        items.add(new Item("4", "four", "我是中国人"));
        items.add(new Item("5", "five", "中华人民共和国"));
        */

        // Read every file under the data directory into memory and wrap it in an Item
        File dataFile = new File("C:/mylucene");
        File[] dataFiles = dataFile.listFiles();
        for (int i = 0; i < dataFiles.length; i++) {
            Reader txtReader = new FileReader(dataFiles[i]);
            char[] buff = new char[10000];
            int len = txtReader.read(buff);
            txtReader.close();
            String str = new String(buff, 0, Math.max(len, 0));
            System.out.println(str);
            items.add(new Item(dataFiles[i].getCanonicalPath(), dataFiles[i].getName(), str));
        }

        // Keep the index in memory:
        // Directory directory = new RAMDirectory();
        // Store the index on disk:
        File file = new File("c:/lucene");
        Directory directory = FSDirectory.open(file);
        demo.buildIndexer(analyzer, directory, items);
        List<Item> result = demo.searchIndexer(analyzer, directory, "中国");

        for (Item item : result) {
            System.out.println(item.toString());
        }
    }
}

package com.mylucene;
public class Item {
   
    private String id;
    private String title;
    private String content;
   
    public Item() {
    }
   
    public Item(String id, String title, String content) {
        this.id = id;
        this.title = title;
        this.content = content;
    }
   
    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
   
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("[id=").append(id).append(",title=").append(title)
            .append(",content=").append(content).append("]");
        return sb.toString();
    }
}

Here the three attributes of a file (path, name, content) are abstracted out and represented by a separate class, Item. In earlier Lucene versions the file was read through a Reader, and that Reader was passed straight to the field when the document was added to the index, so there was no need to read the whole content out and wrap it up first. With the approach above, a very large file may not fit in memory, risking an out-of-memory error or problems with the fixed-size buffer. It should still be possible to index a file directly the way the older versions did (see the sketch below); I simply have not found a good solution yet, so the 4.8 API deserves more study.
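As far as I can tell, the Reader-based style is still available in the 4.x API: TextField has a constructor that takes a Reader, so the analyzer streams the file content during indexing instead of requiring it to be read into a 10000-char buffer first. The class and helper below (ReaderFieldExample, addFile) are made-up names for illustration, a minimal sketch assuming Lucene 4.7; the trade-off is that a Reader-backed field is indexed but not stored, so hitDoc.get("content") would return null for such a field.

package com.mylucene;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;

public class ReaderFieldExample {

    /**
     * Adds one file to the index without loading its whole content into memory.
     * The "content" field is backed by a Reader, so the text is streamed during
     * analysis; Reader-backed fields are indexed but not stored, so their text
     * cannot be read back from a search hit.
     */
    static void addFile(IndexWriter iwriter, File file) throws IOException {
        Document doc = new Document();
        doc.add(new Field("id", file.getCanonicalPath(), TextField.TYPE_STORED));
        doc.add(new Field("title", file.getName(), TextField.TYPE_STORED));
        doc.add(new TextField("content", new FileReader(file)));
        iwriter.addDocument(doc);
    }
}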

