盘古分词 lucene.net

来源:互联网 发布:打广告软件 编辑:程序博客网 时间:2024/04/19 23:26

-初始化

在进程启动时,我们需要对盘古分词进行初始化。初始化的调用代码如下(两种重载任选其一):

// Initialize PanGu at process startup. Parameterless overload; presumably it falls back to a
// default configuration location -- TODO confirm against the PanGu documentation.
PanGu.Segment.Init();

// Alternative overload: filename is the full path of pangu.xml, e.g. "c:\pangu.xml".
PanGu.Segment.Init(filename);

filename 为pangu.xml 的完整路径名,如 “c:\pangu.xml”

  

-开始记录索引

            // Open the folder that holds the index files.
            FSDirectory indexDir = FSDirectory.Open(new DirectoryInfo("E:/luceneTest"));

            // Index writer. This overload creates the index when it does not exist yet and appends
            // to it otherwise; passing create=false (as before) throws on the very first run because
            // there is no index to open. The using block releases the writer and its write lock
            // even when indexing fails half-way.
            using (IndexWriter writer = new IndexWriter(indexDir, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // Load the sample data.
                var listArticles = InitArticles();

                // Write every article into the index.
                foreach (var article in listArticles)
                {
                    IndexString(writer, article.Url, article.Title, article.Time, article.Content);
                }

                // Merge the index segments once everything has been added.
                writer.Optimize();
            }


  帮助方法:public static int IndexString(IndexWriter writer, string url, string title, DateTime time, string content)

        /// <summary>
        /// Builds one Lucene document from an article and adds it to the index.
        /// </summary>
        /// <param name="writer">Open index writer the document is added to.</param>
        /// <param name="url">Article URL; stored in field "url" but not indexed.</param>
        /// <param name="title">Article title; stored and analyzed in field "Title".</param>
        /// <param name="time">Article time; stored un-analyzed in field "Time" as a yyyyMMdd string.</param>
        /// <param name="content">Article body; stored and analyzed in field "Content".</param>
        /// <returns>
        /// The value of <c>writer.NumDocs()</c> after the document was added. This is the writer's
        /// document count, not the number of documents affected by this call.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
        public static int IndexString(IndexWriter writer, string url, string title, DateTime time, string content)
        {
            if (writer == null)
            {
                throw new ArgumentNullException("writer");
            }

            Document doc = new Document();

            // Field(name, value, whether the value is stored, how the value is indexed)
            doc.Add(new Field("url", url, Field.Store.YES, Field.Index.NO));
            doc.Add(new Field("Title", title, Field.Store.YES, Field.Index.ANALYZED));

            // Format with the invariant culture so the stored key is always Gregorian yyyyMMdd,
            // regardless of the calendar of the thread's current culture.
            string day = time.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);
            doc.Add(new Field("Time", day, Field.Store.YES, Field.Index.NOT_ANALYZED));

            doc.Add(new Field("Content", content, Field.Store.YES, Field.Index.ANALYZED));

            writer.AddDocument(doc);
            return writer.NumDocs();
        }



-开始搜索


 /// <summary>
 /// Segments the keywords and turns them into a query string of boosted terms,
 /// formatted as "word^boost.0" and separated by single spaces.
 /// The boost of each word is 3 raised to the power of the word's rank, truncated to an integer.
 /// </summary>
 /// <param name="keywords">Text to segment.</param>
 /// <param name="ktTokenizer">Tokenizer used to segment the text.</param>
 /// <returns>The space-separated boosted terms, without leading or trailing whitespace.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="ktTokenizer"/> is null.</exception>
 public string GetKeyWordsSplitBySpace(string keywords, PanGuTokenizer ktTokenizer)
 {
     if (ktTokenizer == null)
     {
         throw new ArgumentNullException("ktTokenizer");
     }

     StringBuilder result = new StringBuilder();
     ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keywords);

     foreach (WordInfo word in words)
     {
         if (word == null)
         {
             continue;
         }

         // The output is fed to QueryParser, so escape query-syntax characters (':', '(', '^', ...)
         // inside the word; otherwise such input makes the later Parse call fail.
         string term = QueryParser.Escape(word.Word);
         int boost = (int)Math.Pow(3, word.Rank);
         result.AppendFormat("{0}^{1}.0 ", term, boost);
     }

     return result.ToString().Trim();
 }

        /// <summary>
        /// Searches the index for the keywords in both the "Content" and "Title" fields and
        /// returns one page of matching articles with highlighted title and content.
        /// </summary>
        /// <param name="q">Keywords entered by the user.</param>
        /// <param name="pageLen">Page size; must be greater than zero.</param>
        /// <param name="pageNo">1-based page number; must be greater than zero.</param>
        /// <param name="recCount">Receives the total number of hits reported by the searcher.</param>
        /// <returns>The articles of the requested page; empty when nothing matches or the page is past the end.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="pageLen"/> or <paramref name="pageNo"/> is not positive.</exception>
        public  List<Article> SearchData(String q, int pageLen, int pageNo, out int recCount)
        {
            if (pageLen <= 0)
            {
                throw new ArgumentOutOfRangeException("pageLen");
            }
            if (pageNo <= 0)
            {
                throw new ArgumentOutOfRangeException("pageNo");
            }

            recCount = 0;
            List<Article> result = new List<Article>();

            // Keep the raw keywords: the highlighter works on the user's text, not on the query syntax.
            string keywords = q;

            // Segment the keywords into boosted query terms.
            q = GetKeyWordsSplitBySpace(q, new PanGuTokenizer());
            if (string.IsNullOrEmpty(q))
            {
                // Nothing to search for; avoid handing an empty string to the query parser.
                return result;
            }

            // Index location and searcher; both are released even when the search throws.
            using (FSDirectory indexDir = FSDirectory.Open(@"E:\luceneTest"))
            using (IndexSearcher search = new IndexSearcher(indexDir, true))
            {
                QueryParser queryParser = new QueryParser(Version.LUCENE_30, "Content", new PanGuAnalyzer(true));
                Query query = queryParser.Parse(q);

                // Field names are case-sensitive: the index stores the title under "Title".
                QueryParser titleQueryParser = new QueryParser(Version.LUCENE_30, "Title", new PanGuAnalyzer(true));
                Query titleQuery = titleQueryParser.Parse(q);

                // A document matches when either the content or the title matches.
                BooleanQuery bq = new BooleanQuery();
                bq.Add(query, Occur.SHOULD);
                bq.Add(titleQuery, Occur.SHOULD);

                // Fetch enough top hits to reach the end of the requested page; asking only for
                // pageLen hits would leave every page after the first outside ScoreDocs.
                int first = (pageNo - 1) * pageLen;
                var hits = search.Search(bq, first + pageLen);
                recCount = hits.TotalHits;

                // Highlighter setup does not depend on the hit, so build it once.
                PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                    new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
                PanGu.HighLight.Highlighter highlighter =
                    new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());
                // Maximum size of the highlighted fragment.
                highlighter.FragmentSize = 500;

                int last = Math.Min(hits.ScoreDocs.Length, first + pageLen);
                for (int i = first; i < last; i++)
                {
                    Document doc = search.Doc(hits.ScoreDocs[i].Doc);

                    Article news = new Article();
                    news.Title = doc.Get("Title");
                    news.Content = doc.Get("Content");
                    news.Url = doc.Get("url");

                    // Use the highlighted text when there is one, otherwise show the original text.
                    news.ContentLighter = HighlightOrOriginal(highlighter, keywords, news.Content);
                    news.TitleLighter = HighlightOrOriginal(highlighter, keywords, news.Title);

                    result.Add(news);
                }
            }

            return result;
        }

        // Returns the best highlighted fragment of text, or text itself when the highlighter yields nothing.
        private static string HighlightOrOriginal(PanGu.HighLight.Highlighter highlighter, string keywords, string text)
        {
            string fragment = highlighter.GetBestFragment(keywords, text);
            if (fragment == null || fragment.Trim().Length == 0)
            {
                return text;
            }
            return fragment;
        }


0 0
原创粉丝点击