直接查询索引,将想要的字段写入csv文件

来源:互联网 发布:淘宝店铺名片 编辑:程序博客网 时间:2024/04/28 12:02
package test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.junit.Test;

/**
 * Reads Lucene indexes directly (no query) and dumps the terms of selected
 * fields, together with a per-term counter, into CSV files.
 */
public class TermSearcher {

    /** Timestamp pattern embedded in every generated CSV file name. */
    public static final SimpleDateFormat SDF_CHILD_PATH = new SimpleDateFormat("yyyyMMdd_HHmmssSSS");

    /**
     * Term text mapped to the number of times that term has been enumerated so
     * far. The map is never cleared, so it accumulates across every field and
     * every index processed during the lifetime of the JVM.
     */
    public static Map<String, Integer> sizeMap = new HashMap<String, Integer>();

    /** Directory that receives the generated CSV files. */
    private static final String OUTPUT_DIR = "F:/Terms";

    /** A CSV chunk is flushed to disk once it holds this many lines (header included). */
    private static final int MAX_LINES_PER_FILE = 30000;

    /** {@link #showIndex()} stops once its term counter (which starts at 1) reaches this value. */
    private static final int SHOW_INDEX_TERM_LIMIT = 100;

    public static void main(String[] args) throws IOException {
        searchTerms();
    }

    /**
     * Opens each configured index directory in turn and exports its terms via
     * {@link #maptolist(Fields, String)}.
     *
     * <p>Any exception aborts the remaining indexes and is printed to stderr;
     * the reader and directory of the failing index are still closed.
     */
    public static void searchTerms() {
        List<String> pathList = new ArrayList<String>();
        pathList.add("D:/newindex/1");
        pathList.add("D:/newindex/2");
        try {
            for (String indexReadPath : pathList) {
                Directory directory = FSDirectory.open(new File(indexReadPath));
                try {
                    IndexReader reader = DirectoryReader.open(directory);
                    try {
                        Fields fields = MultiFields.getFields(reader);
                        // The last path segment labels the output files; this no longer
                        // depends on the path having exactly three segments.
                        maptolist(fields, new File(indexReadPath).getName());
                    } finally {
                        reader.close();
                    }
                } finally {
                    directory.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Enumerates the terms of the fields {@code brand_Name} and
     * {@code virtual_Name}, updates {@link #sizeMap}, and writes the map out as
     * CSV in chunks of at most {@value #MAX_LINES_PER_FILE} lines. Each chunk
     * starts with the field name as a header line, followed by
     * {@code term,count} lines.
     *
     * <p>NOTE(review): because {@link #sizeMap} is cumulative, every file lists
     * all terms seen so far, not only those of the field named in its header.
     * That matches the original behaviour; confirm it is intended.
     *
     * @param fields        fields of the index; {@code null} (index without any
     *                      postings) is tolerated and results in no output
     * @param indexReadPath label of the index, used as file name prefix
     * @throws IOException if enumerating the terms fails
     */
    public static void maptolist(Fields fields, String indexReadPath) throws IOException {
        if (fields == null) {
            System.err.println("Index " + indexReadPath + " has no fields; nothing to export");
            return;
        }

        List<String> fieldlist = new ArrayList<String>();
        fieldlist.add("brand_Name");
        fieldlist.add("virtual_Name");

        for (String field : fieldlist) {
            Terms terms = fields.terms(field);
            if (terms == null) {
                // The field does not exist in this index.
                System.err.println("Index " + indexReadPath + " has no field " + field + "; skipped");
                continue;
            }

            TermsEnum termsEnum = terms.iterator(null);
            BytesRef byteRef;
            while ((byteRef = termsEnum.next()) != null) {
                String term = byteRef.utf8ToString();
                // Per-term counter: start at 1, add 1 each time the term is seen again.
                Integer seen = sizeMap.get(term);
                sizeMap.put(term, seen == null ? 1 : seen + 1);
            }

            // A fresh list per field, so rows of one field never leak into the next file.
            List<String> list = new ArrayList<String>();
            list.add(field);
            for (Map.Entry<String, Integer> entry : sizeMap.entrySet()) {
                list.add(entry.getKey() + "," + entry.getValue());
                if (list.size() >= MAX_LINES_PER_FILE) {
                    write(list, OUTPUT_DIR, indexReadPath);
                    list.clear();
                    list.add(field);
                }
            }
            write(list, OUTPUT_DIR, indexReadPath);
        }
    }

    /**
     * Writes the given lines, each terminated by {@code \n}, to a new file named
     * {@code <path>/<indexReadPath>-<timestamp>.csv}. If a file with that name
     * already exists (two writes within the same millisecond), a numeric suffix
     * is appended instead of overwriting it. Missing parent directories are
     * created.
     *
     * <p>I/O failures do not propagate; they are reported on stderr.
     *
     * @param list          lines to write
     * @param path          target directory
     * @param indexReadPath label of the index, used as file name prefix
     */
    public static void write(List<String> list, String path, String indexReadPath) {
        StringBuilder content = new StringBuilder();
        for (String line : list) {
            content.append(line).append('\n');
        }

        String baseName = path + File.separator + indexReadPath + "-" + SDF_CHILD_PATH.format(new Date());
        File file = new File(baseName + ".csv");
        for (int suffix = 1; file.exists(); suffix++) {
            file = new File(baseName + "-" + suffix + ".csv");
        }

        File parent = file.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            System.err.println("Cannot create output directory " + parent.getAbsolutePath());
            return;
        }

        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(file));
            writer.write(content.toString());
            writer.flush();
        } catch (IOException e) {
            System.err.println("Failed to write " + file.getAbsolutePath());
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    // Closing the BufferedWriter also closes the wrapped FileWriter.
                    writer.close();
                } catch (IOException e) {
                    System.err.println("Failed to close " + file.getAbsolutePath());
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * JUnit entry point for {@link #showIndex()}. JUnit 4 test methods must be
     * instance methods, so the annotation lives here while the static method
     * stays callable as before.
     *
     * @throws IOException if the index cannot be read
     */
    @Test
    public void showIndexTest() throws IOException {
        showIndex();
    }

    /**
     * Prints, for the terms of field {@code brand_Name} in the index at
     * {@code D:/newindex/1}: the term text, its document frequency, its total
     * term frequency and, for every posting, the document id, the stored field
     * value, and each position with offsets and payload.
     *
     * <p>Output stops once the term counter, which starts at 1 and is only
     * advanced for terms that have position data, reaches
     * {@value #SHOW_INDEX_TERM_LIMIT}.
     *
     * @throws IOException if the index cannot be read
     */
    public static void showIndex() throws IOException {
        String indexReadPath = "D:/newindex/1";
        Directory directory = FSDirectory.open(new File(indexReadPath));
        try {
            IndexReader reader = DirectoryReader.open(directory);
            try {
                Fields fields = MultiFields.getFields(reader);
                Terms terms = fields == null ? null : fields.terms("brand_Name");
                if (terms == null) {
                    System.err.println("Index " + indexReadPath + " has no field brand_Name");
                    return;
                }

                TermsEnum termsEnum = terms.iterator(null);
                BytesRef term;
                int count = 1;
                while ((term = termsEnum.next()) != null) {
                    System.out.println("分词的内容>>>>>>>" + term.utf8ToString() + "\t"); // term text
                    System.out.println("出现该分词的有文档的数量>>>>>>>>>" + termsEnum.docFreq() + "\t"); // document frequency
                    System.out.println("分词的总数>>>>>>>" + termsEnum.totalTermFreq() + "\t"); // total term frequency

                    DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
                    // The enum is null when the field was indexed without positions; skip such terms.
                    if (docsAndPositionsEnum == null) {
                        continue;
                    }

                    int docId;
                    while ((docId = docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                        Document document = reader.document(docId);
                        System.out.println(docId + "\t"); // document id
                        System.out.println("可以获取document中field的值>>>>>>>>" + document.get("brand_Name") + "\t"); // stored field value

                        int freq = docsAndPositionsEnum.freq(); // occurrences of the term in this document
                        for (int i = 0; i < freq; i++) {
                            System.out.println("分词的位置>>>>>>>" + docsAndPositionsEnum.nextPosition() + ":"); // position
                            // Start and end offset are separated by a comma so the pair is unambiguous.
                            System.out.print("分词起始偏移量的位置>>>[" + docsAndPositionsEnum.startOffset() + ",");
                            System.out.println(docsAndPositionsEnum.endOffset() + "],>>>>分词结束偏移量的位置");
                            System.out.println(docsAndPositionsEnum.getPayload() + "\t");
                        }
                    }

                    count++;
                    if (count >= SHOW_INDEX_TERM_LIMIT) {
                        break;
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Prints every term of the given field, one per line, each preceded by a
     * marker line, using a {@link LuceneDictionary} over the reader.
     *
     * @param reader open index reader; not closed by this method
     * @param field  name of the field whose terms are listed
     * @throws IOException if reading the terms fails
     */
    public static void getTerms(IndexReader reader, String field) throws IOException {
        System.out.println("---------------getTerms----------------");
        LuceneDictionary ld = new LuceneDictionary(reader, field);
        BytesRefIterator iterator = ld.getEntryIterator();
        BytesRef byteRef;
        while ((byteRef = iterator.next()) != null) {
            System.out.println(">>>>>>>>>>>>>outputString");
            System.out.println(byteRef.utf8ToString());
        }
    }
}

0 0
原创粉丝点击