文章详情

  • 游戏榜单
  • 软件榜单
关闭导航
热搜榜
热门下载
热门标签
php爱好者> php文档>coreseek 英文搜索时将关键词切断的问题

coreseek 英文搜索时将关键词切断的问题

时间:2010-10-28  来源:atyu30

环境: RHEL 5.5 x64
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]


问题现象: 搜索英文关键词 category 时, 关键词被切断为 categori (见下方输出中的 words 部分):
[root@wiki mnt]# search category
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2010,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)

 using config file '/usr/local/coreseek/etc/csft.conf'...
index 'wiki_main': query 'category ': returned 1000 matches of 1735 total in 0.065 sec

displaying matches:
1. document=7368, weight=2, page_namespace=10, old_id=43577
        page_title=Category
        page_namespace=10
2. document=1, weight=1, page_namespace=0, old_id=29556
        page_title=??
        page_namespace=0
3. document=2, weight=1, page_namespace=0, old_id=12858
        page_title=Current_events
        page_namespace=0
4. document=4, weight=1, page_namespace=0, old_id=12943
        page_title=Kernel
        page_namespace=0
5. document=21, weight=1, page_namespace=0, old_id=9300
        page_title=PF
        page_namespace=0
6. document=23, weight=1, page_namespace=0, old_id=18011
        page_title=Packages
        page_namespace=0
7. document=38, weight=1, page_namespace=0, old_id=3333
        page_title=LDAP
        page_namespace=0
8. document=41, weight=1, page_namespace=4, old_id=24189
        page_title=??
        page_namespace=4
9. document=43, weight=1, page_namespace=0, old_id=44245
        page_title=Wik-backup
        page_namespace=0
10. document=70, weight=1, page_namespace=0, old_id=13103
        page_title=????
        page_namespace=0
11. document=246, weight=1, page_namespace=0, old_id=44269
        page_title=??
        page_namespace=0
12. document=252, weight=1, page_namespace=12, old_id=24174
        page_title=???????
        page_namespace=12
13. document=258, weight=1, page_namespace=12, old_id=24178
        page_title=??
        page_namespace=12
14. document=263, weight=1, page_namespace=12, old_id=24191
        page_title=????
        page_namespace=12
15. document=271, weight=1, page_namespace=12, old_id=24177
        page_title=????
        page_namespace=12
16. document=273, weight=1, page_namespace=12, old_id=24180
        page_title=???
        page_namespace=12
17. document=294, weight=1, page_namespace=0, old_id=41390
        page_title=Administrators
        page_namespace=0
18. document=300, weight=1, page_namespace=0, old_id=18219
        page_title=??
        page_namespace=0
19. document=308, weight=1, page_namespace=0, old_id=795
        page_title=???SSL_VPN
        page_namespace=0
20. document=331, weight=1, page_namespace=0, old_id=840
        page_title=??
        page_namespace=0

words:
1. 'categori': 1735 documents, 2645 hits

index 'wiki_incremental': query 'category ': returned 1 matches of 1 total in 0.007 sec

displaying matches:
1. document=409, weight=1, page_namespace=0, old_id=44398
        page_title=OpenBSD??OpenSSH??SSH???
        page_namespace=0

words:
1. 'categori': 1 documents, 3 hits

[root@wiki mnt]#

搜索中文正常, 关键词没有被切断 (见下方输出中的 words 部分):

[root@wiki mnt]# search 归档日志
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2010,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)

 using config file '/usr/local/coreseek/etc/csft.conf'...
index 'wiki_main': query '归档日志 ': returned 390 matches of 390 total in 0.011 sec

displaying matches:
1. document=9073, weight=2, page_namespace=0, old_id=27422
        page_title=RMAN????????RMAN-06059??????
        page_namespace=0
2. document=9163, weight=2, page_namespace=0, old_id=30658
        page_title=?????????????
        page_namespace=0
3. document=10660, weight=2, page_namespace=0, old_id=20002
        page_title=?????????????????????
        page_namespace=0
4. document=10872, weight=2, page_namespace=0, old_id=44284
        page_title=??????????????
        page_namespace=0
5. document=15395, weight=2, page_namespace=0, old_id=36425
        page_title=Oracle???????????
        page_namespace=0
6. document=16402, weight=2, page_namespace=0, old_id=31181
        page_title=????DG???????
        page_namespace=0
7. document=16785, weight=2, page_namespace=0, old_id=44270
        page_title=Oracle_10g_???????????
        page_namespace=0
8. document=16798, weight=2, page_namespace=0, old_id=32039
        page_title=Oracle_10g_?????Archive_Log_?
        page_namespace=0
9. document=17508, weight=2, page_namespace=0, old_id=33279
        page_title=?restore_archivelog???????????
        page_namespace=0
10. document=17746, weight=2, page_namespace=0, old_id=44273
        page_title=RMAN??????????
        page_namespace=0
11. document=18224, weight=2, page_namespace=0, old_id=34858
        page_title=RMAN???????????????
        page_namespace=0
12. document=324, weight=1, page_namespace=0, old_id=43429
        page_title=A??????????apache???cronolog??
        page_namespace=0
13. document=1876, weight=1, page_namespace=0, old_id=41767
        page_title=Linux_????????
        page_namespace=0
14. document=4062, weight=1, page_namespace=0, old_id=40270
        page_title=????????
        page_namespace=0
15. document=4460, weight=1, page_namespace=0, old_id=7986
        page_title=??_Subversion??
        page_namespace=0
16. document=4949, weight=1, page_namespace=0, old_id=8653
        page_title=Windows?Subversion???????
        page_namespace=0
17. document=5077, weight=1, page_namespace=0, old_id=44042
        page_title=Oracle???????
        page_namespace=0
18. document=6062, weight=1, page_namespace=0, old_id=38080
        page_title=Apache????/??/??
        page_namespace=0
19. document=6145, weight=1, page_namespace=0, old_id=17752
        page_title=Linux_????????????
        page_namespace=0
20. document=6721, weight=1, page_namespace=0, old_id=35340
        page_title=Windows_XP_+_VMWare_Server_1.0.6_+_CentOS_5.2_+_Oracle_10g_Rac_(10.2.0.1)??
        page_namespace=0

words:
1. '归档日志': 390 documents, 1533 hits

index 'wiki_incremental': query '归档日志 ': returned 0 matches of 0 total in 0.001 sec

words:
1. '归档日志': 0 documents, 0 hits

配置文件

source src_wiki_main
{
        # data source
        type            = mysql
        sql_host        = localhost
        sql_user        = atyu30
        sql_pass        = atyu30
        sql_db          = wikidb
        # these two are optional
        sql_port        = 3306
        sql_sock        = /var/lib/mysql/mysql.sock
        # pre-query, executed before the main fetch query
        sql_query_pre   = SET NAMES utf8
        sql_query_pre   = SET SESSION query_cache_type=OFF
        #phrase_boundary = ., ?, U+2026
        # main document fetch query - change the table names if you are using a prefix
        sql_query       = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id
        # attribute columns
        sql_attr_uint   = page_namespace
        sql_attr_uint   = old_id
        # uncomment next line to collect all category ids for a category filter
        #sql_attr_multi  = uint category from query; SELECT cl_from, page_id AS category FROM categorylinks, page WHERE page_title=cl_to AND page_namespace=14
        # optional - used by command-line search utility to display document information
        sql_query_info  = SELECT page_title, page_namespace FROM page WHERE page_id=$id
}
source src_wiki_incremental : src_wiki_main
{
        # adjust this query based on the time you run the full index
        # in this case, full index runs at 3 AM (server time) which translates to 7 AM UTC
        sql_query       = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id AND page_touched>=DATE_FORMAT(CURDATE(), '%Y%m%d070000')
        # all other parameters are copied from the parent source,
}
index wiki_main
{
        # which document source to index
        source          = src_wiki_main
        # this is path and index file name without extension
        # you may need to change this path or create this folder
        path            = /usr/local/coreseek/var/data/wiki_main
        # docinfo (ie. per-document attribute values) storage strategy
        docinfo         = extern
        # morphology
        morphology      = stem_en
        # stopwords file
        stopwords                               = /usr/local/coreseek/lib/stopwords.txt
        # charset dict
        charset_dictpath                        = /usr/local/coreseek/lib/
        # minimum word length
        min_word_len    = 2
        # the next 2 lines enable wildcard (*) searches
        min_infix_len = 1
        enable_star = 1
        # charset encoding type
        #charset_type   = utf-8
        charset_type    = zh_cn.utf-8
        ignore_chars = U+AD
        # charset definition and case folding rules "table"
}
index wiki_incremental : wiki_main
{
        path            = /usr/local/coreseek/var/data/wiki_incremental
        source          = src_wiki_incremental
}
indexer
{
        # memory limit (default is 32M)
        mem_limit       = 128M
}
searchd
{
        # IP address and/or port on which the search daemon will bind and accept connections
        # optional, default is to listen on all addresses
        #listen                                = 127.0.0.1
        #listen                                = 192.168.36.139
        listen                                = 3312
        #listen                                = /var/run/searchd.sock
        # PID file, searchd process ID file name
        # mandatory
        pid_file                        = /usr/local/coreseek/var/log/searchd.pid
        # searchd run info is logged here - create or change the folder
        log             = /usr/local/coreseek/var/log/coreseek/searchd.log
        # all the search queries are logged here
        query_log       = /usr/local/coreseek/var/log/coreseek/query.log
        # client read timeout, seconds
        read_timeout    = 5
        # maximum amount of children to fork
        max_children    = 30
        #phrase_boundary = ., ?, U+2026
        #html_strip = 1
        # maximum amount of matches this daemon would ever retrieve
        # from each index and serve to client
        max_matches     = 1000
}

处理办法:
乔楚开源搜索(5601680) 21:58:28:
morphology      = stem_en
min_infix_len = 1
enable_star = 1
ignore_chars = U+AD
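将以上四行从配置文件的 index wiki_main 段中全部删掉
(其中 morphology = stem_en 会对英文单词做词干提取, 上面 words 中的 categori 就是 category 提取词干后的结果)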

1. 关闭searchd
2. 重新索引
3. 重启searchd

恢复正常
感谢 乔楚开源搜索 的帮助

相关阅读 更多 +
排行榜 更多 +
辰域智控app

辰域智控app

系统工具 下载
网医联盟app

网医联盟app

运动健身 下载
汇丰汇选App

汇丰汇选App

金融理财 下载