Sphinx的一个应用实例

来源:互联网 发布:淘宝美工学校 编辑:程序博客网 时间:2024/05/17 04:22
我把我blog的数据(中英文混合)导出, 作为数据来源, 来说明sphinx的使用.

准备数据源

导入数据:
mysql -u root -p test < wp_posts.sql

配置Sphinx
配置data source:
  1. source blog
  2. {
  3.         type            =       mysql
  4.         sql_host        =       localhost
  5.         sql_user        =       root
  6.         sql_pass        =       xxxx
  7.         sql_db          =       test
  8.         sql_port        =       3306
  9.         sql_query       =       \
  10.                 SELECT ID, post_author, UNIX_TIMESTAMP(post_date) as date_added, post_content from wp_posts
  11.         sql_attr_uint   =       post_author
  12.         sql_attr_timestamp =    date_added
  13.         sql_query_info  =       SELECT * FROM wp_posts where ID=$id
  14. }
配置index:

  1. index blog
  2. {
  3.         source                                  = blog
  4.         path                                    = /usr/local/sphinx/var/data/blog
  5.         docinfo                                 = extern
  6.         charset_type                    = zh_cn.utf-8
  7.         charset_dictpath                = /usr/local/sphinx/dict
  8. }
注意: 必须通过 charset_dictpath 指定中文分词字典所在的目录, 否则无法对中文内容进行分词.


索引

./bin/indexer --config ./etc/sphinx.conf blog

如果没有错误, 会看到:

Coreseek Full Text Server 3.0(beta)
 Copyright (c) 2006-2008 coreseek.com
using config file './etc/sphinx.conf'...
indexing index 'blog'...
collected 165 docs, 0.2 MB
sorted 0.0 Mhits, 100.0% done
total 165 docs, 164834 bytes
total 0.099 sec, 1670067.52 bytes/sec, 1671.75 docs/sec


测试搜索

./bin/search -c ./etc/sphinx.conf -i blog 苹果

输出:

Coreseek Full Text Server 3.0(beta)
 Copyright (c) 2006-2008 coreseek.com
using config file './etc/sphinx.conf'...
0x815e1f8index 'blog': query '苹果 ': returned 1 matches of 1 total in 0.005 sec

displaying matches:
1. document=140, weight=1, post_author=1, date_added=Fri Nov 30 11:22:02 2007
ID=140
post_author=1
post_date=2007-11-30 11:22:02
post_date_gmt=2007-11-30 03:22:02
post_content=This is the text of the Commencement address by Steve Jobs, CEO of Apple Computer and of Pixar Animation Studios, delivered on June 12, 2005.

....
{中间的blog贴内容省略}
....

post_title=You've got to find what you love
post_category=0
post_excerpt=
post_status=publish
comment_status=open
ping_status=open
post_password=
post_name=youve-got-to-find-what-you-love
to_ping=
pinged=
post_modified=2008-06-20 11:58:53
post_modified_gmt=2008-06-20 03:58:53
post_content_filtered=
post_parent=0
guid=http://blog.funcat.cn/?p=164
menu_order=0
post_type=post
post_mime_type=
comment_count=0

words:
1. '苹果': 1 documents, 10 hits


启动searchd

./bin/searchd

Coreseek Full Text Server 3.0(beta)
 Copyright (c) 2006-2008 coreseek.com
using config file '/usr/local/sphinx/etc/sphinx.conf'...


使用Python API调用searchd


  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import sys
  4. if sys.getdefaultencoding() != 'utf-8':
  5.     reload(sys) 
  6.     sys.setdefaultencoding('utf-8')
  7. import web
  8. from web.contrib.template import render_mako
  9. import MySQLdb
  10. from MySQLdb import *
  11. from sphinxapi import *
  12. urls = (
  13.     '/', 'index',
  14. )
  15. render = render_mako(
  16.         directories=['templates'],
  17.         input_encoding='utf-8',
  18.         output_encoding='utf-8',
  19.         )
  20. app = web.application(urls, globals())
  21. con = MySQLdb.Connect(host="localhost", port=3306, user="root", passwd="xixihaha", db="blogdata")
  22. class index:
  23.     def GET(self):
  24.         r_info = ''
  25.         info = ''
  26.         s_result = ''
  27.         return render.index(r_info=r_info, e_info=info, s_result=s_result)
  28.     def POST(self):
  29.         i = web.input()
  30.         if i.keyword == '':
  31.             raise web.seeother('/')
  32.         
  33.         e_info = ''
  34.         r_info = ''
  35.         s_result = ''
  36.         
  37.         q = i.keyword
  38.         
  39.         cl = SphinxClient()
  40.         cl.SetServer ( 'localhost', 3312 )
  41.         res = cl.Query ( q, 'blog' )
  42.         if not res:
  43.             e_info = 'query failed: %s' % cl.GetLastError()
  44.             
  45.         if cl.GetLastWarning():
  46.             e_info = 'WARNING: %s\n' % cl.GetLastWarning()
  47.         if res.has_key('words'):
  48.             for info in res['words']:
  49.                 r_info += '\t\'%s\' found %d times in %d documents<br/>' % (info['word'], info['hits'], info['docs'])
  50.                 
  51.         if res.has_key('matches'):
  52.             n = 1
  53.             s_result = '\nMatches:<br/>'
  54.             
  55.             import time
  56.             
  57.             print res['matches']
  58.             
  59.             for match in res['matches']:
  60.                 attrsdump = ''
  61.                 for attr in res['attrs']:
  62.                     attrname = attr[0]
  63.                     attrtype = attr[1]
  64.                     value = match['attrs'][attrname]
  65.                     if attrtype==SPH_ATTR_TIMESTAMP:
  66.                         value = time.strftime ( '%Y-%m-%d %H:%M:%S', time.localtime(value) )
  67.                     attrsdump = '%s, %s=%s' % ( attrsdump, attrname, value )
  68.         
  69.                     s_result += '%d. doc_id=%s, weight=%d%s<br/>' % (n, match['id'], match['weight'], attrsdump)
  70.                 n += 1
  71.                 Cursor = con.cursor()
  72.                 
  73.                 Cursor.execute('select post_content from wp_posts where id = %s' % match['id'])
  74.                 re = Cursor.fetchall()
  75.                 s_result += re[0][0]
  76.                 s_result += '<hr/>'
  77.         
  78.         return render.index(r_info=r_info, e_info=info, s_result=s_result)
  79.         
  80. if __name__ == '__main__':
  81.     app.run()

原创粉丝点击