Linux下Sphinx 安装和全文检索—1

来源:互联网 发布:淘宝店铺名片 编辑:程序博客网 时间:2024/04/28 14:13

注意:sphinx只能检索英语文章 要检索中文要用到带中文分词coreseek 的sphinx

Sphinx检索的原理不在此讲述。

环境准备

我的源代码编译的lamp环境:

        Centos 6.4                    Linux

       MySQL-5.0.41              mysql

       Apache-2.22                Apache

        Php-5.2.6                     php

软件准备

Coreseek-3.2.14.tar.gz  //支持中文全文检索的软件

      http://www.coreseek.cn/uploads/csft/3.2/coreseek-3.2.14.tar.gz

Sphinx-0.9.9.tar.gz    //sphinx 源代码软件

      http://www.sphinxsearch.com/downloads/sphinx-0.9.9.tar.gz

 Sphinx-1.1.0.tgz       //为php准备sphinx模块的源代码

      http://pecl.php.net/get/sphinx-1.1.0.tgz

sphinx的安装

下载到sphinx包后解压  将源码放到  /usr/local/ 目录下

tar -zxvf sphinx-0.9.9.tar.gz

cd  sphinx-0.9.9/

./configure --prefix=/usr/local/sphinx --with-mysql=/usr/local/mysql/

make

make install

以上几步就已经将sphinx安装成功!注意最主要的是配置sphinx

我开始配置sphinx

cd  /usr/local/sphinx/etc/

cp sphinx.conf.dist  sphinx.conf

vi  sphinx.conf //进行配置如下

## Sphinx configuration (trimmed from sphinx.conf.dist)
## NOTE(review): the original paste lost all of its line breaks, which made the
## file invalid; the active directives below were reconstructed from it.

#############################################################################
## data source definition
#############################################################################

source main
{
	# data source type; known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
	type		= mysql

	sql_host	= localhost
	sql_user	= root
	sql_pass	= root
	sql_db		= test
	sql_port	= 3306	# optional, default is 3306

	# pre-queries: force the connection to UTF-8 and bypass the query cache
	sql_query_pre	= SET NAMES utf8
	sql_query_pre	= SET SESSION query_cache_type=OFF

	# main document fetch query; the integer document ID MUST be the first column
	sql_query	= select id, title, content from post;

	# ranged-query throttling, milliseconds; 0 means no delay
	sql_ranged_throttle	= 0

	# document info query, used ONLY by the CLI "search" tool for testing;
	# must contain the $id macro
	sql_query_info	= SELECT * FROM post WHERE id=$id
}

#############################################################################
## index definition
#############################################################################

index main
{
	source		= main

	# index file path and base name; directory must be writable
	path		= /usr/local/sphinx/var/data/main

	docinfo		= extern
	mlock		= 0
	morphology	= none
	min_word_len	= 1
	charset_type	= utf-8

	# NOTE(review): the garbled paste is ambiguous about whether this line was
	# uncommented; it is the documented utf-8 default table — confirm on the host
	charset_table	= 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F

	html_strip	= 0
}

#############################################################################
## indexer settings
#############################################################################

indexer
{
	# memory limit; default is 32M, recommended is 256M to 1024M
	mem_limit	= 256M
}

#############################################################################
## searchd settings
#############################################################################

searchd
{
	log		= /usr/local/sphinx/var/log/searchd.log
	query_log	= /usr/local/sphinx/var/log/query.log
	read_timeout	= 5
	client_timeout	= 300
	max_children	= 30
	pid_file	= /usr/local/sphinx/var/log/searchd.pid
	max_matches	= 1000
	seamless_rotate	= 1
	preopen_indexes	= 0
	unlink_old	= 1
	mva_updates_pool	= 1M
	max_packet_size	= 8M
	max_filters	= 256
	max_filter_values	= 4096
}

# --eof--

创建一个测试表

-- Test table for Sphinx full-text indexing.
-- `id` must stay the first column selected in sql_query (Sphinx document ID).
CREATE TABLE IF NOT EXISTS `post` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(200) NOT NULL,
  `content` text NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='sphinx测试表' AUTO_INCREMENT=4;

插入几条语句

-- Sample rows for search testing; the text values are kept verbatim from the
-- article (English only, since plain Sphinx has no Chinese tokenizer).
INSERT INTO `post` (`id`, `title`, `content`) VALUES
  (1, 'Linux', 'Linux is centos 6.4 , this is for lamp '),
  (2, 'php ', 'php is web sricpt for user,this is smple script, server sricpt.'),
  (3, 'mysql', 'mysql is database .this istest for sphinx server.');

 

测试sphinx

cd /usr/local/sphinx/bin/

./indexer --all

./search  "要检索的字段"    如下图:



0 0
原创粉丝点击