基于Sphinx的实例解析:DISCUZ全文检索

来源:互联网 发布:淘宝主板可靠吗 编辑:程序博客网 时间:2024/06/09 14:34
这里只讲重点。首先是基于 Discuz 的 Sphinx 索引配置文件,这个配置文件比较灵活,可以根据不同的需求来调整:
    #
    # LinuxTone full index search configure file
    #
    # Layout: one main source/index pair (lt_posts), one delta source/index
    # pair (delta), followed by the indexer and searchd settings.
    # Each directive must sit on its own line: '#' starts a comment that runs
    # to the end of the line.
    #

    source lt_posts
    {
        type = mysql
        sql_host = 127.0.0.1
        sql_user = root
        sql_pass =
        sql_db = lt_bbs
        sql_port = 3306
        sql_query_pre = SET NAMES utf8
        # The index is built from the posts table (rows with first=1) so that
        # the subject, message and author fields can all be searched at once.
        sql_query = SELECT pid,tid,fid,dateline,subject,message,author FROM cdb_posts where first=1
        sql_attr_uint = fid
        sql_attr_timestamp = dateline
        sql_query_info = SELECT * FROM cdb_posts WHERE pid=$id
    }

    index lt_posts
    {
        source = lt_posts
        path = /data/sphinx/data/lt_posts
        docinfo = extern
        mlock = 0
        morphology = none
        min_word_len = 2
        html_strip = 1
        charset_dictpath = /usr/local/mmseg-3.2.13/etc/
        charset_type = zh_cn.utf-8
        ngram_len = 0
    }

    ########## delta index ##################

    source delta
    {
        type = mysql
        sql_host = 127.0.0.1
        sql_user = root
        sql_pass =
        sql_db = lt_bbs
        sql_port = 3306 # optional, default is 3306
        sql_query_pre = SET NAMES utf8
        # The delta index only picks up rows whose dateline is newer than the
        # current timestamp minus the chosen interval (3600*10 seconds here).
        sql_query = SELECT pid,tid,fid,dateline,subject,message,author FROM cdb_posts where first=1 and dateline > unix_timestamp()-3600*10
        sql_attr_uint = fid
        sql_attr_timestamp = dateline
        sql_query_info = SELECT * FROM cdb_posts WHERE pid=$id
    }

    index delta
    {
        source = delta
        path = /data/sphinx/data/lt_delta
        docinfo = extern
        mlock = 0
        morphology = none
        min_word_len = 2
        html_strip = 1
        charset_dictpath = /usr/local/mmseg-3.2.13/etc/
        charset_type = zh_cn.utf-8
        ngram_len = 0
    }

    indexer
    {
        mem_limit = 32M
    }

    searchd
    {
        port = 9312
        log = /data/sphinx/var/log/searchd.log
        query_log = /data/sphinx/var/log/query.log
        read_timeout = 5
        max_children = 30
        pid_file = /data/sphinx/var/log/searchd.pid
        max_matches = 10000
        seamless_rotate = 1
        preopen_indexes = 0
        unlink_old = 1
    }


Sphinx 最主要的就是这个配置文件。增量索引部分可以写成一个脚本放到 crontab 里定时执行(脚本见文末)。

下面介绍 Sphinx 的 PHP 调用部分。Sphinx 的接口使用 PHP 扩展(提供 SphinxClient 类),可以通过 pecl 命令安装,或者从 http://pecl.php.net/package/sphinx 下载后安装:
    <?php
    /**
     * LinuxTone full-text search service.
     *
     * Reads the search keywords (`q`) and page number (`page`) from the query
     * string, asks searchd for the matching post ids, loads those posts from
     * cdb_posts, builds highlighted excerpts for subject and message, and
     * renders the "lt_search" template.
     *
     * Variables left in scope for the template: $q, $page, $perNum, $data,
     * $query_totals, $query_time, $multipage.
     */
    define('IN_DISCUZ', TRUE);
    require_once './include/common.inc.php';

    // Accept only a string keyword: "?q[]=x" would otherwise hand an array to strip_tags().
    $q = isset($_GET['q']) && is_string($_GET['q']) ? $_GET['q'] : '';
    // NOTE(review): replacing ' ' with ' ' is a no-op as written; presumably the
    // original normalised a full-width or non-breaking space - confirm.
    $q = trim(str_replace(array('<','>',' ','\'',','), array('','',' ','',''), strip_tags($q)));

    $page = isset($_GET['page']) && intval($_GET['page']) > 0 ? intval($_GET['page']) : 1;
    $perNum = 20;
    $offset = ($page - 1) * $perNum;

    $search = new SphinxClient();
    $search->setServer('127.0.0.1', 9312);
    $search->setConnectTimeout(2);
    $search->setArrayResult(true);
    $search->setMatchMode(SPH_MATCH_ANY);
    $search->setRankingMode(SPH_RANK_PROXIMITY_BM25);
    $search->setSortMode(SPH_SORT_EXTENDED, '@relevance desc,@weight desc');
    $search->setLimits($offset, $perNum);
    $search->setFieldWeights(array('subject' => 2000, 'message' => 0));

    // Defaults so the template never sees undefined variables.
    $rs = array();
    $pages = 0;
    $query_totals = $query_time = 0;
    $multipage = '';

    if (!empty($q)) {
        $rs = $search->Query($q, "*");
        if (is_array($rs)) {
            $pages = ceil($rs['total'] / $perNum);
            $query_totals = $rs['total_found'];
            $query_time = $rs['time'];
        } else {
            // Query() returns false on failure (searchd down, offset out of range, ...):
            // treat it as "no results" instead of indexing into a boolean.
            $rs = array();
        }
    }

    $data = $title = $content = array();
    if (!empty($rs['matches']) && $page <= $pages) {
        // Document ids in the order searchd ranked them.
        $pids = array();
        foreach ($rs['matches'] as $v) {
            $pids[] = intval($v['id']);
        }
        $pid = implode(',', $pids);

        $sql = "select pid,tid,author,authorid,subject,message,dateline from cdb_posts where pid IN($pid) and status ='0' and invisible='0'";
        $query = $db->query($sql);

        // IN(...) returns rows in arbitrary order, so index them by pid first ...
        $rows = array();
        while ($row = $db->fetch_array($query)) {
            $rows[$row['pid']] = $row;
        }

        // ... then walk the ids in ranking order so the relevance sort survives.
        foreach ($pids as $id) {
            if (!isset($rows[$id])) {
                continue; // indexed, but filtered out by status/invisible or deleted since
            }
            $row = $rows[$id];
            $data[] = $row;
            $title[] = $row['subject'];
            // Strip BBCode tags one at a time: [^\]]* stops at the tag's own ']'
            // instead of swallowing everything up to the last ']' in the message.
            $content[] = preg_replace(
                '/\[\/?(?:b|img|url|color|s|hr|p|list|i|align|email|u|font|code|hide|table|tr|td|th|attach|indent|float)[^\]]*\]/',
                '',
                strip_tags($row['message'])
            );
        }

        if (!empty($data)) {
            // Highlight the search terms.
            // NOTE(review): the published listing had empty before/after markers
            // (most likely markup lost when the page was scraped), which disables
            // highlighting entirely; adjust the tags to whatever the template expects.
            $opts = array();
            $opts['before_match'] = '<em>';
            $opts['after_match'] = '</em>';

            // BuildExcerpts() returns false on failure; keep the plain text in that case.
            $excerpts = $search->BuildExcerpts($title, 'lt_posts', $q, $opts);
            if (is_array($excerpts)) {
                $title = $excerpts;
            }
            $excerpts = $search->BuildExcerpts($content, 'lt_posts', $q, $opts);
            if (is_array($excerpts)) {
                $content = $excerpts;
            }

            foreach ($data as $k => $v) {
                $data[$k]['subject'] = $title[$k];
                $data[$k]['message'] = $content[$k];
            }
        }

        $url = "s.php?q=" . urlencode($q);
        $multipage = multi($rs['total'], $perNum, $page, $url);
    }

    include template("lt_search");
    ?>


跑主索引的shell脚本search-index.sh
  #!/bin/bash
  #
  # The BBS search exec full index
  #
  # Rebuilds the lt_posts main index with --rotate and appends the indexer
  # output to a log file named after the current date and hour. stderr is
  # redirected as well, so indexing errors end up in the same log.
  /usr/local/csft-3.2.13/bin/indexer -c /usr/local/csft-3.2.13/etc/lt_posts.conf --rotate lt_posts >> "/data/sphinx/var/$(date "+%Y-%m-%d-%H").log" 2>&1

跑增量索引的shell脚本search-delta.sh
  #!/bin/bash
  #
  # The BBS search exec delta index
  #
  # Rebuild the delta index and rotate it in.
  /usr/local/csft-3.2.13/bin/indexer -c /usr/local/csft-3.2.13/etc/lt_posts.conf --rotate delta
  # Merge the delta index into the main index (left disabled; uncomment to enable).
  #/usr/local/csft-3.2.13/bin/indexer --config /usr/local/csft-3.2.13/etc/lt_posts.conf --rotate --merge lt_posts delta
0 0