Lucene.Net 实现groupby并可以加入自定义过滤功能

来源：互联网发布：js 获取数组长度编辑：程序博客网时间：2024/06/05 06:50

背景:在一个项目中搜索产品库时,需要实现"同一公司只显示一个产品"的过滤功能。出于性能考虑,项目已经采用 Lucene.Net 全文搜索架构,因此这个"同一公司一个产品"的过滤功能只能在 Lucene 的搜索过程中实现。

本文是在Lucene.Net 2.9.2的源码里改的,下面开始通过源码来介绍,

一、首先修改 IndexSearcher.cs 文件:在此文件中增加用于 groupby 的字段属性 "FieldName",同时增加给该字段赋值的方法 GroupBy(string fieldName)。注意:使用时必须在调用 Search 方法之前先调用 GroupBy 方法。

IndexSearcher.cs添加如下代码

   /// <summary>
        /// Backing store for <see cref="FieldName"/>: the name of the group-by field.
        /// Stays null until <see cref="GroupBy"/> assigns it.
        /// </summary>
        private string groupByField;

        /// <summary>
        /// Gets the name of the group-by field. Read by the Collect method of
        /// TopFieldDocCollector to decide which field to de-duplicate on.
        /// </summary>
        public string FieldName
        {
            get
            {
                return this.groupByField;
            }
        }

        /// <summary>
        /// Selects the field to group by. Call this before Search; a search
        /// started without it sees no group-by field.
        /// </summary>
        /// <param name="fieldName">Name of the field whose value identifies a group.</param>
        public void GroupBy(string fieldName)
        {
            this.groupByField = fieldName;
        }
二、修改 TopFieldDocCollector.cs:为该类新增一个 TopFieldDocCollector 构造函数,在新构造函数的参数中增加 IndexSearcher,

然后修改 Collect 方法,这里是实现分组(排重)效果的关键。

先定义一个泛型字典(Dictionary)dict,用于保存已经出现过的 groupby 字段值;在 Collect 方法中判断当前文档的字段值是否已存在于 dict 中,如果已存在则直接返回(跳过该文档),继续处理下一条。

   /// <summary>
        /// The IndexSearcher handed in through the constructor below. Collect reads
        /// its FieldName property to find the group-by field.
        /// </summary>
        private IndexSearcher searcher;
        /// <summary>
        /// Creates a collector backed by a FieldSortedHitQueue and stores the given
        /// searcher for later use by Collect.
        /// </summary>
        /// <param name="reader">Index reader passed to the FieldSortedHitQueue.</param>
        /// <param name="sort">Sort whose fields are passed to the FieldSortedHitQueue.</param>
        /// <param name="numHits">Hit count passed to the FieldSortedHitQueue.</param>
        /// <param name="searcher">Searcher to store; it is not validated here.</param>
        public TopFieldDocCollector(IndexReader reader, Sort sort, int numHits,IndexSearcher searcher)
            : base(new FieldSortedHitQueue(reader, sort.fields, numHits))
        {
            this.searcher = searcher;
        }

/// <summary>
/// Group keys that have already been collected. A later hit whose group key is
/// present here is dropped, so each group contributes at most one hit.
/// Only the keys matter; the bool values are unused placeholders.
/// </summary>
private Dictionary<string, bool> dict = new Dictionary<string, bool>();

/// <summary>
/// Collects one hit. When a group-by field is configured on the searcher, a hit
/// is skipped if an earlier hit with the same group-by value was already collected.
/// Skipped hits are not counted in totalHits and never reach the hit queue.
/// </summary>
/// <remarks>
/// NOTE(review): the hit kept for each group is the first one passed to this
/// method, which is not necessarily the best one under the requested sort.
/// Confirm this is acceptable for the caller.
/// </remarks>
/// <param name="doc">Document number of the hit.</param>
/// <param name="score">Score of the hit; hits whose score is not greater than zero are ignored.</param>
public override void Collect(int doc, float score)
{
    // Negated "greater than" (rather than "<= 0") so that NaN is rejected
    // exactly as the positive-score test always rejected it.
    if (!(score > 0.0f))
    {
        return;
    }

    if (IsDuplicateGroupHit(doc))
    {
        return;
    }

    totalHits++;
    if (reusableFD == null)
    {
        reusableFD = new FieldDoc(doc, score);
    }
    else
    {
        // Whereas TopScoreDocCollector can skip this if the
        // score is not competitive, we cannot because the
        // comparators in the FieldSortedHitQueue.lessThan
        // aren't in general congruent with "higher score
        // wins"
        reusableFD.score = score;
        reusableFD.doc = doc;
    }
    reusableFD = (FieldDoc) hq.InsertWithOverflow(reusableFD);
}

/// <summary>
/// Decides whether the given document belongs to a group that is already
/// represented in the results, and records its group key if it does not.
/// </summary>
/// <param name="doc">Document number of the hit being collected.</param>
/// <returns>
/// true when grouping is active and the document's group key was seen before;
/// false when grouping is inactive, the document has no usable group value,
/// or this is the first hit of its group.
/// </returns>
private bool IsDuplicateGroupHit(int doc)
{
    // The searcher may be null if this collector was created through a
    // constructor that does not set it; grouping is then simply disabled.
    if (searcher == null)
    {
        return false;
    }

    string groupField = searcher.FieldName;
    if (string.IsNullOrEmpty(groupField))
    {
        return false;
    }

    Document storedDoc = searcher.GetIndexReader().Document(doc);
    string rawValue = storedDoc.Get(groupField);
    if (rawValue == null)
    {
        // No stored value for the group-by field: the hit cannot be assigned
        // to a group, so keep it instead of failing the whole search.
        return false;
    }

    string groupKey = rawValue.Trim();
    if (groupKey.Length == 0)
    {
        return false;
    }

    // Integer values are normalized so that values which differ only in
    // formatting (such as leading zeros) still fall into the same group.
    // Non-integer values are grouped by their trimmed text.
    int numericKey;
    if (int.TryParse(groupKey, out numericKey))
    {
        groupKey = numericKey.ToString(System.Globalization.CultureInfo.InvariantCulture);
    }

    if (dict.ContainsKey(groupKey))
    {
        return true;
    }

    dict.Add(groupKey, true);
    return false;
}

三、修改 IndexSearcher.cs:将原来调用 TopFieldDocCollector(IndexReader reader, Sort sort, int numHits) 构造函数的地方,改为调用

新的构造函数 TopFieldDocCollector(IndexReader reader, Sort sort, int numHits, IndexSearcher searcher),并把当前的 IndexSearcher 对象作为最后一个参数传入。

四、测试

 /// <summary>
 /// Demo: writes three product documents to an index, then runs a search for
 /// "LED" on pro_Name with GroupBy("pro_Mem_ID") set and results sorted by
 /// pro_price, and prints one line per hit.
 /// </summary>
 /// <param name="args">Command-line arguments (unused).</param>
 static void Main(string[] args)
 {
     // The PanGu analyzer is used for word segmentation.
     Lucene.Net.Analysis.Analyzer analyzer = new PanGuAnalyzer();
     PanGu.Segment.Init(@"E:\工作\LED\Project\demo\Lucene.Net_2_9\Lucene.Net_2_9_2\PanGu.xml");

     IndexWriter writer = new IndexWriter("e:\\index", analyzer, true);
     try
     {
         writer.AddDocument(CreateProductDocument("LED节能灯", "61", ",61:12,13:led,14:129,", "100"));
         writer.AddDocument(CreateProductDocument("LED日光灯", "62", ",61:12,13:led,14:128,", "200"));
         writer.AddDocument(CreateProductDocument("LED灯", "63", ",61:12,", "220"));
     }
     finally
     {
         // Close the writer even if adding a document fails.
         writer.Close();
     }

     IndexSearcher searcher = new IndexSearcher("e:\\index");
     try
     {
         BooleanQuery boolQuery = new BooleanQuery();
         string queryString = "LED";
         if (!string.IsNullOrEmpty(queryString))
         {
             boolQuery.Add(MultiFieldQueryParser.Parse(new string[] { queryString }, new string[] { "pro_Name" }, analyzer), BooleanClause.Occur.SHOULD);
         }

         // GroupBy has to be set before Search is called.
         searcher.GroupBy("pro_Mem_ID");

         Sort sort = new Sort();
         SortField priceSort = new SortField("pro_price", SortField.FLOAT, false);
         sort.SetSort(new SortField[] { priceSort });

         Hits docs = searcher.Search(boolQuery, null, sort);
         for (int i = 0; i < docs.Length(); i++)
         {
             Document hit = docs.Doc(i);
             Console.WriteLine(hit.Get("pro_Name") + "---" + hit.Get("pro_Mem_ID") + "--" + hit.Get("pro_Attr"));
         }
     }
     finally
     {
         // Close the searcher even if the query or the output loop fails.
         searcher.Close();
     }

     Console.ReadKey();
 }

 /// <summary>
 /// Builds one product document with the four stored fields used by the demo.
 /// </summary>
 /// <param name="name">Value of pro_Name (stored, analyzed).</param>
 /// <param name="memberId">Value of pro_Mem_ID (stored, not indexed).</param>
 /// <param name="attributes">Value of pro_Attr (stored, analyzed).</param>
 /// <param name="price">Value of pro_price (stored, indexed untokenized).</param>
 /// <returns>The populated document.</returns>
 private static Document CreateProductDocument(string name, string memberId, string attributes, string price)
 {
     Document product = new Document();
     product.Add(new Field("pro_Name", name, Field.Store.YES, Field.Index.ANALYZED));
     product.Add(new Field("pro_Mem_ID", memberId, Field.Store.YES, Field.Index.NO));
     product.Add(new Field("pro_Attr", attributes, Field.Store.YES, Field.Index.ANALYZED));
     product.Add(new Field("pro_price", price, Field.Store.YES, Field.Index.UN_TOKENIZED));
     return product;
 }

五、小结:关键在于 TopFieldDocCollector.cs 中的 Collect 方法。在这个方法中大家也可以加入自定义的过滤规则,规则所需的参数可以通过 IndexSearcher 类传入(例如像 GroupBy 那样在搜索前先设置好)。