Lucene.Net 全文索引笔记

来源：互联网 ｜ 发布：sql查询成绩最高分 ｜ 编辑：程序博客网 ｜ 时间：2024/05/16 11:52

Lucene.Net用了又忘...由于现在信息量爆炸，用过的东西用完就忘，只好自己写个笔记来记录一下了...

1：需要DLL

Lucene.Net.dll

PanGu.dll

PanGu.HighLight.dll

PanGu.Lucene.Analyzer.dll

没有的话，可以去我的资源包里面下，地址如下： http://download.csdn.net/download/kimizhou_blog/10016313

2：生成索引

// Root folder under App_Data where the index files are stored.
string indexPath = Context.Server.MapPath("~/App_Data/IndexData");

// Sub-folder that holds the points-mall product index.
string commonProductIndexPath = string.Format("{0}/{1}", indexPath, "commonProduct");

// Build the points-mall product index.
CreateCommonProductIndex(commonProductIndexPath);

然后看看CreateCommonProductIndex方法

      /// <summary>
      /// Rebuilds the points-mall product index: removes every previously indexed
      /// document and adds one document per product returned by
      /// <c>Product.GetProductListByIndex()</c>.
      /// </summary>
      /// <param name="indexPath">Directory that holds the product index files.</param>
      private void CreateCommonProductIndex(string indexPath)
      {
          // Bind the index directory.
          FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
          try
          {
              bool isExist = IndexReader.IndexExists(directory);
              if (isExist && IndexWriter.IsLocked(directory))
              {
                  // Force-release a write lock that is still held on an existing index.
                  IndexWriter.Unlock(directory);
              }

              // Create a new index only when none exists yet (third argument).
              IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isExist, IndexWriter.MaxFieldLength.UNLIMITED);
              try
              {
                  // Drop the previously indexed documents before re-adding everything.
                  writer.DeleteAll();

                  IList<ProductInfoByIndex> list = Product.GetProductListByIndex();
                  foreach (var item in list)
                  {
                      Document document = new Document();

                      // Every value is stored as a string. The id is an identifier that is also
                      // sorted on, so it is indexed as one un-analyzed term instead of being
                      // passed through the analyzer.
                      document.Add(new Field("id", item.ProductID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

                      // string.Format is kept from the original; it converts the name to a string.
                      string content = string.Format("{0}", item.ProductName);
                      document.Add(new Field("Content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

                      // Write the document into the index.
                      writer.AddDocument(document);
                  }

                  // Closing the writer releases the write lock.
                  writer.Close();
              }
              catch
              {
                  // Discard the uncommitted DeleteAll/AddDocument changes and close the writer,
                  // so a failed rebuild does not leave the writer open or a partial index behind.
                  writer.Rollback();
                  throw;
              }
          }
          finally
          {
              // The original note warns that search results are not found unless the directory is closed.
              directory.Close();
          }
      }

其中
IList<ProductInfoByIndex> list = Product.GetProductListByIndex(); 这一行是去数据库中读取这个 list 对象，这里代码就不贴出来了。到这里你的索引已经创建出来了，那么接下来需要查询和显示。

查询是最困难的，各种匹配

3：查询索引并且显示出来

GetProductIndex方法就是获取索引代码如下：

    /// <summary>
    /// Searches the points-mall product index for the words of the "SearchKey"
    /// request value, stores the matching products in <c>productResultList</c>
    /// and appends the hit count to <c>Message</c>.
    /// </summary>
    private void GetProductIndex()
    {
        // Root folder where the index files are stored.
        string indexPath = Context.Server.MapPath("~/App_Data/IndexData");
        // Sub-folder that holds the points-mall product index.
        string commonProductIndexPath = string.Format("{0}/{1}", indexPath, "commonProduct");

        FSDirectory directory = null;
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try
        {
            directory = FSDirectory.Open(new DirectoryInfo(commonProductIndexPath), new NoLockFactory());
            reader = IndexReader.Open(directory, true); // true = open read-only
            searcher = new IndexSearcher(reader);

            // Every word of the search key must occur somewhere in "Content"
            // (comparable to SQL: LIKE '%word%').
            BooleanQuery bQuery = new BooleanQuery();
            foreach (string word in SplitContent.SplitWords(Request["SearchKey"]))
            {
                Query wordQuery = new WildcardQuery(new Term("Content", "*" + word + "*"));
                bQuery.Add(wordQuery, BooleanClause.Occur.MUST); // MUST: the clause is required
            }

            // The id field holds integers, so sort it as INT (FLOAT loses precision on
            // large ids); the final `true` requests descending order.
            Sort sort = new Sort(new SortField("id", SortField.INT, true));
            TopDocs docs = searcher.Search(bQuery, (Filter)null, 9999999, sort);

            List<ProductInfoByIndex> proList = new List<ProductInfoByIndex>();
            // Iterate over the hits that were actually returned; totalHits may be larger
            // than scoreDocs when the result cap is reached.
            for (int i = 0; i < docs.scoreDocs.Length; i++)
            {
                Document doc = searcher.Doc(docs.scoreDocs[i].doc);
                ProductInfoByIndex product = new ProductInfoByIndex();
                product.ProductID = System.Convert.ToInt32(doc.Get("id"));
                product.ProductName = doc.Get("Content");
                //product.ProductName = SplitContent.HightLight(Request["SearchKey"], doc.Get("Content"));
                proList.Add(product);
            }

            productResultList = proList;
            this.Message += string.Format("|{0}条积分商城产品", docs.totalHits);
        }
        finally
        {
            // Release the index resources even when the search throws.
            if (searcher != null)
            {
                searcher.Close();
            }
            if (reader != null)
            {
                reader.Close();
            }
            if (directory != null)
            {
                directory.Close();
            }
        }

        // Alternative approach (kept commented out): a PhraseQuery with a large slop,
        // with PanGu keyword highlighting applied to the product name.
        //PhraseQuery query = new PhraseQuery();
        //foreach (string word in SplitContent.SplitWords(Request["SearchKey"]))
        //{
        //    query.Add(new Term("Content", word));
        //}
        //query.SetSlop(100);
        //TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
        //searcher.Search(query, null, collector);
        //ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
        //List<ProductInfoByIndex> proList = new List<ProductInfoByIndex>();
        //for (int i = 0; i < docs.Length; i++)
        //{
        //    int docId = docs[i].doc;            // id of the hit (assigned internally by Lucene)
        //    Document doc = searcher.Doc(docId); // load the Document object for that id
        //    ProductInfoByIndex product = new ProductInfoByIndex();
        //    product.ProductID = System.Convert.ToInt32(doc.Get("id"));
        //    //book.ContentDescription = doc.Get("content"); // without highlighting
        //    // Highlight the search keywords with the PanGu highlight plug-in.
        //    product.ProductName = SplitContent.HightLight(Request["SearchKey"], doc.Get("Content"));
        //    proList.Add(product);
        //}
        //productResultList = proList;
        //this.Message += string.Format("|{0}条积分商城产品", docs.Length);
    }

其中我注释掉的部分是另外一种查询方法（PhraseQuery）；这里我实际使用的是效率比较低的模糊查询（WildcardQuery）：
Query queryUseringNatrue = new WildcardQuery(new Term("Content", "*" + word + "*"));
这个类似数据库的like '%关键字%'
到这里就已经获取到了所有的索引资料了，是不是很简单，你get到了吗？最后我再给大家介绍索引的几种查询方式：
第1种：

  // Query style 1 (commented-out sample): parse the keyword with QueryParser and
  // PanGuAnalyzer, then add the parsed query to the BooleanQuery as a required clause.
  //string keyWordUseringNatrue = "营运";
  //if (!string.IsNullOrWhiteSpace(keyWordUseringNatrue))
  //{
  //    QueryParser parseUseringNatrue = new QueryParser("UseringNatrue", new PanGuAnalyzer());
  //    Query query = parseUseringNatrue.Parse(keyWordUseringNatrue);
  //    // NOTE(review): the default operator is set after Parse() has already run, so it
  //    // presumably does not affect `query` - confirm and move it before Parse() if AND is intended.
  //    parseUseringNatrue.SetDefaultOperator(QueryParser.Operator.AND);
  //    bQuery.Add(query, BooleanClause.Occur.MUST);
  //}

  // The same keyword ("营运") searched with a WildcardQuery instead.
  //Query queryUseringNatrue = new WildcardQuery(new Term("UseringNatrue", "营运"));
  //bQuery.Add(queryUseringNatrue, BooleanClause.Occur.MUST); // MUST: the clause is required

这个查询是什么呢？是一般的查询，会查询出“营运”相关的内容，但是它和 like 不一样，它跟分词有关，比如说，搜“爱”就查询不出“可爱”。PanGu 有自己的分词规则，不过这种查询方式比较常用。下面汇总一下其它的查询：
其它查询汇总：

// Overview of the common Query types.
//
// TermQuery: the most basic query. To find the documents whose "content" field
// contains "刘备", use a TermQuery:
//     Term t = new Term("content", "刘备");
//     Query query = new TermQuery(t);
//
// BooleanQuery: to find the documents that contain "刘备" in the "content" field and
// "三国" in the "title" field, build two TermQuery objects and join them with a BooleanQuery:
//     TermQuery termQuery1 = new TermQuery(new Term("content", "刘备"));
//     TermQuery termQuery2 = new TermQuery(new Term("title", "三国"));
//     BooleanQuery booleanQuery = new BooleanQuery();
//     booleanQuery.Add(termQuery1, BooleanClause.Occur.SHOULD);
//     booleanQuery.Add(termQuery2, BooleanClause.Occur.SHOULD);
// NOTE(review): SHOULD marks a clause as optional, which reads as "or"; the "and"
// described above presumably needs BooleanClause.Occur.MUST - confirm.
//
// WildcardQuery: runs a wildcard search on a term. '?' matches exactly one character
// and '*' matches zero or more characters. Searching for "三国*" may find
// "三国演义" or "三国志":
//     Query query = new WildcardQuery(new Term("content", "三国*"));
//
// PhraseQuery: suppose you are interested in China-Japan relations and want articles
// in which "中" and "日" occur close together (within a distance of 5), ignoring
// anything further apart:
//     PhraseQuery query = new PhraseQuery();
//     query.SetSlop(5);
//     query.Add(new Term("content", "中"));
//     query.Add(new Term("content", "日"));
// This may match "中日合作……" and "中方和日方……", but not
// "中国某高层领导说日本欠扁", where the two terms are too far apart.
//
// PrefixQuery: to search for terms that start with "中":
//     PrefixQuery query = new PrefixQuery(new Term("content", "中"));
//
// FuzzyQuery: searches for terms similar to the given one, using the Levenshtein
// algorithm. To search for terms similar to "wuzza":
//     Query query = new FuzzyQuery(new Term("content", "wuzza"));
// You may get "fuzzy" and "wuzzy".
//
// RangeQuery: another frequently used query. To find the documents whose "time"
// field lies between 20060101 and 20060130:
//     RangeQuery query = new RangeQuery(new Term("time", "20060101"), new Term("time", "20060130"), true);
// The final `true` makes the range inclusive.
// NOTE(review): RangeQuery is presumably replaced by TermRangeQuery in newer
// Lucene.Net versions - confirm against the version in use.

因为 Lucene.Net 各个版本的用法都不太一样，下面介绍一种常用的读取结果并显示的方式，其中 Sort 用于排序：

// Time the search. StartNew() creates the stopwatch and starts it immediately;
// a stopwatch that is only constructed never runs and always reports 0 ms.
Stopwatch stopwath = Stopwatch.StartNew();

// Sort by the "CarPrice" (price) field as a float; the final `true` requests descending order.
Sort sort = new Sort(new SortField("CarPrice", SortField.FLOAT, true));
TopDocs docs = searcher.Search(bQuery, (Filter)null, 9999999, sort);

stopwath.Stop();
long lSearchTime = stopwath.ElapsedMilliseconds; // elapsed search time in milliseconds

List<CarSourceInfoByIndex> carSourceResult = new List<CarSourceInfoByIndex>();
// Iterate over the hits that were actually returned; totalHits may be larger than
// scoreDocs when the result cap is reached.
for (int i = 0; i < docs.scoreDocs.Length; i++)
{
    Document doc = searcher.Doc(docs.scoreDocs[i].doc);
    CarSourceInfoByIndex carSource = new CarSourceInfoByIndex()
    {
        Id = int.Parse(doc.Get("Id")),
        CarPrice = System.Convert.ToDouble(doc.Get("CarPrice")),
        // Highlight the search keywords inside the stored content.
        Recommended = SplitContent.HightLight(Request["SearchKey"], doc.Get("Content"))
    };
    carSourceResult.Add(carSource);
}

carSourceResultList2 = carSourceResult;
this.Message += string.Format("{0}条测试", docs.totalHits);

就到这里了，不懂的可以加我QQ 10200454咨询

阅读全文

0 0