Lucene与Tag图

来源:互联网 发布:中国人是需要管的 知乎 编辑:程序博客网 时间:2024/04/29 14:17

Tag图是采用了Tag作为文章管理工具的网站经常需要呈现的一种视图。利用Lucene的优异性能,可以出色地完成这一功能。

生成一个Tag图,首先需要知道用户一共使用了哪些Tag,其次需要知道每个Tag被使用的次数。

对于这两个功能,都可以使用Lucene.Net.Index.IndexReader.Terms方法。这个方法返回索引目录下的所有Term,以及它们在全部文档中被使用的次数(即docfreq,包含该Term的文档数)。这就为我们生成Tag图提供了必要的基础。但是Terms方法返回的TermEnum是按照FieldName、text的顺序排序的,而不是按照docfreq排序的,所以还需要实现一个排序算法。

首先是索引的结构。我设计了如下的索引结构:

docurl:文档的url

contents:文档的内容,以便全文索引

doctags:文档相关的所有tags.tag以空格或逗号作为分割,可以使用单独的Analyzer进行解析。可以参考Analyzer以及PerFieldAnalyzerWrapper两个类。
排序算法使用一个链表来保存Tag。它的两个方法GetList(int top)和Top(int freq)可以帮助我们设定Tag图中需要包含的Tag:前者返回使用次数最多的前top个Tag,后者返回使用次数不少于freq的全部Tag。TermFreq是每个Tag的数据内容:TermFreq.term是Tag的内容,TermFreq.freq是被使用的次数,据此就可以设定Tag的显示样式了。链表借助一个SortedList作为辅助索引,以便提高排序的效率。经过测试,这个排序算法对200M的TermFreq只需要11秒的时间。

  1 internal class TermFreq
  2         {
  3             public string term;
  4             public int freq = 0;
  5         }
  6         internal class TermFreqCompare : System.Collections.IComparer
  7         {
  8             #region IComparer 成员
  9 
 10             public int Compare(object x, object y)
 11             {
 12                 TermFreq f1 = x as TermFreq;
 13                 TermFreq f2 = y as TermFreq;
 14                 int compareResult = f1.freq.CompareTo(f2.freq);
 15                 //if(compareResult==0) return f2.term.CompareTo(f1.term);
 16                 return compareResult;
 17             }
 18 
 19             #endregion
 20 
 21         }
 22         internal class TermFreqSortedList
 23         {
 24             private Element root;
 25             private System.Collections.IComparer comparer;
 26             private System.Collections.SortedList list;
 27             internal class Element
 28             {
 29                 public Element prev;
 30                 public Element next;
 31                 public TermFreq current;
 32             }
 33             public TermFreqSortedList(System.Collections.IComparer comparer)
 34             {
 35                 root = new Element();
 36                 root.current = new TermFreq();
 37                 this.comparer = comparer;
 38                 list = new System.Collections.SortedList();
 39             }
 40             private Element GetStartElement(int freq)
 41             {
 42                 Element ele = null;
 43                 if(list.ContainsKey(freq))
 44                 {
 45                     ele = list[freq] as Element;
 46                 }
 47                 else
 48                 {
 49                     list.Add(freq,null);
 50                     int index = list.IndexOfKey(freq)-1;
 51                     if(index<0) ele = list[0as Element;
 52                     else ele = list[index] as Element;
 53                 }
 54                 return ele;
 55             }
 56             public void Add(TermFreq o)
 57             {
 58                 Element ele = GetStartElement(o.freq);
 59                 if(ele==null) ele = root;
 60                 Element oEle = new Element();
 61                 oEle.current = o;
 62                 list[oEle.current.freq] = oEle;
 63                 while(ele !=null)
 64                 {
 65                     int compareResult = comparer.Compare(ele.current,oEle.current);
 66                     if(compareResult>0)
 67                     {
 68                         if(ele.next==null)
 69                         {
 70                             ele.next = oEle;
 71                             oEle.prev = ele;
 72                             break;
 73                         }
 74                         else if(comparer.Compare(ele.next.current,oEle.current)<0)
 75                         {
 76                             ele.next.prev = oEle;
 77                             oEle.next = ele.next;
 78                             ele.next = oEle;
 79                             oEle.prev = ele;
 80                             break;
 81                         }
 82                         else
 83                         {
 84                             ele = ele.next;
 85                             continue;
 86                         }
 87                     }
 88                     else if(compareResult<0)
 89                     {
 90                         if(ele.prev==null)
 91                         {
 92                             ele.prev = oEle;
 93                             oEle.next = ele;
 94                             root = oEle;
 95                             break;
 96                         }
 97                         else if(comparer.Compare(ele.prev.current,oEle.current)>0)
 98                         {
 99                             ele.prev.next = oEle;
100                             oEle.prev = ele.prev;
101 
102                             ele.prev = oEle;
103                             oEle.next = ele;
104                             break;
105                         }
106                         else
107                         {
108                             ele = ele.prev;
109                             continue;
110                         }
111                     }
112                     if(ele.prev!=null)
113                     {
114                         ele.prev.next = oEle;
115                         oEle.prev = ele.prev;
116                     }
117                     else
118                     {
119                         root = oEle;
120                     }
121                     oEle.next = ele;
122                     ele.prev = oEle;
123                     break;
124                 }
125             }
126             public System.Collections.ArrayList GetList(int top)
127             {
128                 System.Collections.ArrayList list = new System.Collections.ArrayList();
129                 Element ele = root;
130                 int i=0;
131                 while((i++)<top)
132                 {
133                     list.Add(ele.current);
134                     if(ele.next == null)
135                     {
136                         return list;
137                     }
138                     ele = ele.next;
139                 }
140                 return list;
141             }
142             public System.Collections.ArrayList Top(int freq)
143             {
144                 System.Collections.ArrayList list = new System.Collections.ArrayList();
145                 Element ele = root;
146                 while(ele.current.freq >= freq)
147                 {
148                     list.Add(ele.current);;
149                     if(ele.next==null)
150                         return list;
151                     ele = ele.next;
152                 }
153                 return list;
154             }
155         }

文档生成的代码:

 

// Build the index document: URL, full-text contents, and the document's tags.
Document doc = new Document();

Field urlField = Field.Keyword("docurl", docurl);
doc.Add(urlField);

Field contentsField = Field.Text("contents", contents);
doc.Add(contentsField);

// storeTermVector == true, so that the TermFreqVector can later be used to
// read how many times each tag was applied within a single document, e.g. to
// build a per-document tag cloud.
Field tagsField = Field.Text("doctags", reader, true);
doc.Add(tagsField);

测试代码:

 1 Lucene.Net.Index.TermEnum enu = reader.Terms(new Term("contents","_"));
 2                 TermFreqSortedList list = new TermFreqSortedList(new TermFreqCompare());
 3                 
 4                 while(enu.Next())
 5                 {
 6                     Lucene.Net.Index.Term t = enu.Term();
 7                     
 8                     TermFreq f = new TermFreq();
 9                     f.freq = enu.DocFreq();
10                     f.term = t.Text();
11                     list.Add(f);
12                 }
13                 for(System.Collections.IEnumerator ienu = list.GetList(5).GetEnumerator();ienu.MoveNext();)
14                 {
15                     TermFreq ff = ienu.Current as TermFreq;
16                     
17                     Console.WriteLine(string.Format("Term:{0}./t/t/tDocFreq:{1}",
18                         ff.term,
19                         ff.freq));
20                 }
21                 for(System.Collections.IEnumerator ienu = list.Top(3).GetEnumerator();ienu.MoveNext();)
22                 {
23                     TermFreq ff = ienu.Current as TermFreq;
24                     
25                     Console.WriteLine(string.Format("Term:{0}./t/t/tDocFreq:{1}",
26                         ff.term,
27                         ff.freq));
28                 }


 

原创粉丝点击