站内搜索 lucenenet 和 pangu4lucene分词

来源:互联网 发布:mac steam好玩的游戏 编辑:程序博客网 时间:2024/05/17 01:20

注意:Lucene.Net 的版本不要超过 3.0;下面的代码使用的是 3.0 之前的 API(如 TokenStream.Next()、Token.TermText()),在更新的版本中无法编译。


下载地址:Lucene.Net:http://lucenenet.apache.org/ ;PanGu4Lucene:http://pangusegment.codeplex.com/


分词代码:

/// <summary>
/// Splits the given text into terms using the PanGu analyzer.
/// </summary>
/// <param name="word">Text to segment. Null or empty text yields no terms.</param>
/// <returns>The term text of every token the analyzer produces, in the order produced.</returns>
private IEnumerable<string> SpliteWord(string word)
{
    List<string> terms = new List<string>();
    if (string.IsNullOrEmpty(word))
    {
        // StringReader rejects null, and there is nothing to tokenize in empty text.
        return terms;
    }

    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(word));
    try
    {
        // Next() returns null once the stream has no more tokens.
        Lucene.Net.Analysis.Token token;
        while ((token = tokenStream.Next()) != null)
        {
            // TermText() is the text of the current token.
            terms.Add(token.TermText());
        }
    }
    finally
    {
        // Release the stream even if tokenizing throws.
        tokenStream.Close();
    }

    return terms;
}

建立索引代码:

/// <summary>
/// Writes the given model into the on-disk index, replacing any document
/// already stored under the same id.
/// </summary>
/// <param name="model">Record to index; its Id, Title and Msg become the fields "id", "title" and "msg".</param>
/// <exception cref="System.ArgumentNullException"><paramref name="model"/> is null.</exception>
private void BuildIndex(Model model)
{
    if (model == null)
    {
        throw new System.ArgumentNullException("model");
    }

    // NOTE: the path must match the on-disk folder name's casing exactly, otherwise an error is raised.
    string indexPath = @"C:\Users\Administrator\Desktop\asp.net\articleManager\articleManagerWeb\article";
    string idText = model.Id.ToString();

    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        // Decides whether the writer creates a new index or appends to the existing one.
        bool indexExists = IndexReader.IndexExists(directory);

        // A stale lock can remain if a previous indexing run crashed; force-release it.
        // This is NOT safe with concurrent writers: it only recovers from a lock that
        // would otherwise stay held forever.
        if (indexExists && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !indexExists, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            // One Document corresponds to one record; field names are free-form and values are strings.
            Document document = new Document();

            // The id is stored verbatim (not analyzed) so it can be matched exactly.
            document.Add(new Field("id", idText, Field.Store.YES, Field.Index.NOT_ANALYZED));

            // Title and message are analyzed (segmented) and keep term positions/offsets.
            // Field rejects null values, so missing text is indexed as empty.
            document.Add(new Field("title", model.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
            document.Add(new Field("msg", model.Msg ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));

            // Delete any existing document with this id, then add the new one, in a single call.
            // If no document has this id, nothing is deleted.
            writer.UpdateDocument(new Term("id", idText), document);
        }
        finally
        {
            // Closing the writer releases the index write lock, also on failure.
            writer.Close();
        }
    }
    finally
    {
        // Must be closed, otherwise the indexed data will not be found by searches.
        directory.Close();
    }
}

查找代码:

 /// <summary>
 /// Handles the search button: segments the keyword text, runs a phrase query
 /// against the "msg" field of the index, and binds the matching records to Repeater1.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Event data (unused).</param>
 protected void btnSearch_Click(object sender, EventArgs e)
 {
     string indexPath = @"C:\Users\Administrator\Desktop\asp.net\articleManager\articleManagerWeb\article";
     string kw = txtSearch.Text;
     List<Model> list = new List<Model>();

     // Segment the keywords before touching the index; with no terms there is nothing to search for.
     List<string> terms = new List<string>(SpliteWord(kw));
     if (terms.Count == 0)
     {
         Repeater1.DataSource = list;
         Repeater1.DataBind();
         return;
     }

     // Search condition: every term must occur in "msg".
     PhraseQuery query = new PhraseQuery();
     foreach (string term in terms)
     {
         query.Add(new Term("msg", term));
     }
     // Terms further than 100 positions apart (an empirical value) are treated as unrelated.
     query.SetSlop(100);

     FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
     IndexReader reader = null;
     IndexSearcher searcher = null;
     try
     {
         reader = IndexReader.Open(directory, true);
         searcher = new IndexSearcher(reader);

         // Container for the hits, capped at 1000.
         TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
         searcher.Search(query, null, collector);

         // GetTotalHits() is the total hit count; take every collected hit starting at 0.
         ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
         for (int i = 0; i < docs.Length; i++)
         {
             // The hit only carries the document number; the content needs a second lookup.
             int docId = docs[i].doc;
             Document doc = searcher.Doc(docId);

             // Only fields indexed with Field.Store.YES can be read back with Get().
             Model result = new Model();
             result.Id = Convert.ToInt64(doc.Get("id"));
             result.Title = doc.Get("title");
             result.Msg = doc.Get("msg");
             list.Add(result);
         }
     }
     finally
     {
         // Release the index files whether or not the search succeeded.
         if (searcher != null)
         {
             searcher.Close();
         }
         if (reader != null)
         {
             reader.Close();
         }
         directory.Close();
     }

     Repeater1.DataSource = list;
     Repeater1.DataBind();
 }


0 0
原创粉丝点击