coreseek 英文搜索时将关键词切断的问题
时间:2010-10-28 来源:atyu30
环境: RHEL 5.5 x64
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
问题现象: 搜索英文关键词 category 时, 实际参与匹配的词被截成了 categori (见输出末尾的 words 部分):
[root@wiki mnt]# search category
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2010,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)
using config file '/usr/local/coreseek/etc/csft.conf'...
index 'wiki_main': query 'category ': returned 1000 matches of 1735 total in 0.065 sec
displaying matches:
1. document=7368, weight=2, page_namespace=10, old_id=43577
page_title=Category
page_namespace=10
2. document=1, weight=1, page_namespace=0, old_id=29556
page_title=??
page_namespace=0
3. document=2, weight=1, page_namespace=0, old_id=12858
page_title=Current_events
page_namespace=0
4. document=4, weight=1, page_namespace=0, old_id=12943
page_title=Kernel
page_namespace=0
5. document=21, weight=1, page_namespace=0, old_id=9300
page_title=PF
page_namespace=0
6. document=23, weight=1, page_namespace=0, old_id=18011
page_title=Packages
page_namespace=0
7. document=38, weight=1, page_namespace=0, old_id=3333
page_title=LDAP
page_namespace=0
8. document=41, weight=1, page_namespace=4, old_id=24189
page_title=??
page_namespace=4
9. document=43, weight=1, page_namespace=0, old_id=44245
page_title=Wik-backup
page_namespace=0
10. document=70, weight=1, page_namespace=0, old_id=13103
page_title=????
page_namespace=0
11. document=246, weight=1, page_namespace=0, old_id=44269
page_title=??
page_namespace=0
12. document=252, weight=1, page_namespace=12, old_id=24174
page_title=???????
page_namespace=12
13. document=258, weight=1, page_namespace=12, old_id=24178
page_title=??
page_namespace=12
14. document=263, weight=1, page_namespace=12, old_id=24191
page_title=????
page_namespace=12
15. document=271, weight=1, page_namespace=12, old_id=24177
page_title=????
page_namespace=12
16. document=273, weight=1, page_namespace=12, old_id=24180
page_title=???
page_namespace=12
17. document=294, weight=1, page_namespace=0, old_id=41390
page_title=Administrators
page_namespace=0
18. document=300, weight=1, page_namespace=0, old_id=18219
page_title=??
page_namespace=0
19. document=308, weight=1, page_namespace=0, old_id=795
page_title=???SSL_VPN
page_namespace=0
20. document=331, weight=1, page_namespace=0, old_id=840
page_title=??
page_namespace=0
words:
1. 'categori': 1735 documents, 2645 hits
index 'wiki_incremental': query 'category ': returned 1 matches of 1 total in 0.007 sec
displaying matches:
1. document=409, weight=1, page_namespace=0, old_id=44398
page_title=OpenBSD??OpenSSH??SSH???
page_namespace=0
words:
1. 'categori': 1 documents, 3 hits
[root@wiki mnt]#
搜索中文正常
[root@wiki mnt]# search 归档日志
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2010,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)
using config file '/usr/local/coreseek/etc/csft.conf'...
index 'wiki_main': query '归档日志 ': returned 390 matches of 390 total in 0.011 sec
displaying matches:
1. document=9073, weight=2, page_namespace=0, old_id=27422
page_title=RMAN????????RMAN-06059??????
page_namespace=0
2. document=9163, weight=2, page_namespace=0, old_id=30658
page_title=?????????????
page_namespace=0
3. document=10660, weight=2, page_namespace=0, old_id=20002
page_title=?????????????????????
page_namespace=0
4. document=10872, weight=2, page_namespace=0, old_id=44284
page_title=??????????????
page_namespace=0
5. document=15395, weight=2, page_namespace=0, old_id=36425
page_title=Oracle???????????
page_namespace=0
6. document=16402, weight=2, page_namespace=0, old_id=31181
page_title=????DG???????
page_namespace=0
7. document=16785, weight=2, page_namespace=0, old_id=44270
page_title=Oracle_10g_???????????
page_namespace=0
8. document=16798, weight=2, page_namespace=0, old_id=32039
page_title=Oracle_10g_?????Archive_Log_?
page_namespace=0
9. document=17508, weight=2, page_namespace=0, old_id=33279
page_title=?restore_archivelog???????????
page_namespace=0
10. document=17746, weight=2, page_namespace=0, old_id=44273
page_title=RMAN??????????
page_namespace=0
11. document=18224, weight=2, page_namespace=0, old_id=34858
page_title=RMAN???????????????
page_namespace=0
12. document=324, weight=1, page_namespace=0, old_id=43429
page_title=A??????????apache???cronolog??
page_namespace=0
13. document=1876, weight=1, page_namespace=0, old_id=41767
page_title=Linux_????????
page_namespace=0
14. document=4062, weight=1, page_namespace=0, old_id=40270
page_title=????????
page_namespace=0
15. document=4460, weight=1, page_namespace=0, old_id=7986
page_title=??_Subversion??
page_namespace=0
16. document=4949, weight=1, page_namespace=0, old_id=8653
page_title=Windows?Subversion???????
page_namespace=0
17. document=5077, weight=1, page_namespace=0, old_id=44042
page_title=Oracle???????
page_namespace=0
18. document=6062, weight=1, page_namespace=0, old_id=38080
page_title=Apache????/??/??
page_namespace=0
19. document=6145, weight=1, page_namespace=0, old_id=17752
page_title=Linux_????????????
page_namespace=0
20. document=6721, weight=1, page_namespace=0, old_id=35340
page_title=Windows_XP_+_VMWare_Server_1.0.6_+_CentOS_5.2_+_Oracle_10g_Rac_(10.2.0.1)??
page_namespace=0
words:
1. '归档日志': 390 documents, 1533 hits
index 'wiki_incremental': query '归档日志 ': returned 0 matches of 0 total in 0.001 sec
words:
1. '归档日志': 0 documents, 0 hits
配置文件 (/usr/local/coreseek/etc/csft.conf):
source src_wiki_main
{
# data source
type = mysql
sql_host = localhost
sql_user = atyu30
sql_pass = atyu30
sql_db = wikidb
# these two are optional
sql_port = 3306
sql_sock = /var/lib/mysql/mysql.sock
# pre-query, executed before the main fetch query
sql_query_pre = SET NAMES utf8
sql_query_pre = SET SESSION query_cache_type=OFF
#phrase_boundary = ., ?, U+2026
# main document fetch query - change the table names if you are using a prefix
sql_query = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id
# attribute columns
sql_attr_uint = page_namespace
sql_attr_uint = old_id
# uncomment next line to collect all category ids for a category filter
#sql_attr_multi = uint category from query; SELECT cl_from, page_id AS category FROM categorylinks, page WHERE page_title=cl_to AND page_namespace=14
# optional - used by command-line search utility to display document information
sql_query_info = SELECT page_title, page_namespace FROM page WHERE page_id=$id
}
source src_wiki_incremental : src_wiki_main
{
# adjust this query based on the time you run the full index
# in this case, full index runs at 3 AM (server time) which translates to 7 AM UTC
sql_query = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id AND page_touched>=DATE_FORMAT(CURDATE(), '%Y%m%d070000')
# all other parameters are inherited from the parent source (src_wiki_main).
}
index wiki_main
{
# which document source to index
source = src_wiki_main
# this is path and index file name without extension
# you may need to change this path or create this folder
path = /usr/local/coreseek/var/data/wiki_main
# docinfo (ie. per-document attribute values) storage strategy
docinfo = extern
# morphology
morphology = stem_en
# stopwords file
stopwords = /usr/local/coreseek/lib/stopwords.txt
# charset dict
charset_dictpath = /usr/local/coreseek/lib/
# minimum word length
min_word_len = 2
# the next 2 lines enable wildcard (*) searches; comment them out to disable
min_infix_len = 1
enable_star = 1
# charset encoding type
#charset_type = utf-8
charset_type = zh_cn.utf-8
ignore_chars = U+AD
# charset definition and case folding rules "table"
}
index wiki_incremental : wiki_main
{
path = /usr/local/coreseek/var/data/wiki_incremental
source = src_wiki_incremental
}
indexer
{
# memory limit (default is 32M)
mem_limit = 128M
}
searchd
{
# address and/or port on which the search daemon will listen and accept connections;
# the IP address is optional (default is to listen on all addresses),
# so a bare port number is sufficient
#listen = 127.0.0.1
#listen = 192.168.36.139
listen = 3312
#listen = /var/run/searchd.sock
# PID file, searchd process ID file name
# mandatory
pid_file = /usr/local/coreseek/var/log/searchd.pid
# searchd run info is logged here - create or change the folder
log = /usr/local/coreseek/var/log/coreseek/searchd.log
# all the search queries are logged here
query_log = /usr/local/coreseek/var/log/coreseek/query.log
# client read timeout, seconds
read_timeout = 5
# maximum amount of children to fork
max_children = 30
#phrase_boundary = ., ?, U+2026
#html_strip = 1
# maximum amount of matches this daemon would ever retrieve
# from each index and serve to client
max_matches = 1000
}
处理办法:
乔楚开源搜索(5601680) 21:58:28:
morphology = stem_en
min_infix_len = 1
enable_star = 1
ignore_chars = U+AD
将以上四行配置从 index wiki_main 中全部删掉 (words 输出中的 categori 正是 stem_en 对 category 做词干提取后的结果), 然后:
1. 关闭searchd
2. 重新索引
3. 重启searchd
完成以上步骤后, 英文搜索恢复正常。
感谢 乔楚开源搜索 的帮助
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
问题现象: 搜索英文关键词 category 时, 实际参与匹配的词被截成了 categori (见输出末尾的 words 部分):
[root@wiki mnt]# search category
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2010,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)
using config file '/usr/local/coreseek/etc/csft.conf'...
index 'wiki_main': query 'category ': returned 1000 matches of 1735 total in 0.065 sec
displaying matches:
1. document=7368, weight=2, page_namespace=10, old_id=43577
page_title=Category
page_namespace=10
2. document=1, weight=1, page_namespace=0, old_id=29556
page_title=??
page_namespace=0
3. document=2, weight=1, page_namespace=0, old_id=12858
page_title=Current_events
page_namespace=0
4. document=4, weight=1, page_namespace=0, old_id=12943
page_title=Kernel
page_namespace=0
5. document=21, weight=1, page_namespace=0, old_id=9300
page_title=PF
page_namespace=0
6. document=23, weight=1, page_namespace=0, old_id=18011
page_title=Packages
page_namespace=0
7. document=38, weight=1, page_namespace=0, old_id=3333
page_title=LDAP
page_namespace=0
8. document=41, weight=1, page_namespace=4, old_id=24189
page_title=??
page_namespace=4
9. document=43, weight=1, page_namespace=0, old_id=44245
page_title=Wik-backup
page_namespace=0
10. document=70, weight=1, page_namespace=0, old_id=13103
page_title=????
page_namespace=0
11. document=246, weight=1, page_namespace=0, old_id=44269
page_title=??
page_namespace=0
12. document=252, weight=1, page_namespace=12, old_id=24174
page_title=???????
page_namespace=12
13. document=258, weight=1, page_namespace=12, old_id=24178
page_title=??
page_namespace=12
14. document=263, weight=1, page_namespace=12, old_id=24191
page_title=????
page_namespace=12
15. document=271, weight=1, page_namespace=12, old_id=24177
page_title=????
page_namespace=12
16. document=273, weight=1, page_namespace=12, old_id=24180
page_title=???
page_namespace=12
17. document=294, weight=1, page_namespace=0, old_id=41390
page_title=Administrators
page_namespace=0
18. document=300, weight=1, page_namespace=0, old_id=18219
page_title=??
page_namespace=0
19. document=308, weight=1, page_namespace=0, old_id=795
page_title=???SSL_VPN
page_namespace=0
20. document=331, weight=1, page_namespace=0, old_id=840
page_title=??
page_namespace=0
words:
1. 'categori': 1735 documents, 2645 hits
index 'wiki_incremental': query 'category ': returned 1 matches of 1 total in 0.007 sec
displaying matches:
1. document=409, weight=1, page_namespace=0, old_id=44398
page_title=OpenBSD??OpenSSH??SSH???
page_namespace=0
words:
1. 'categori': 1 documents, 3 hits
[root@wiki mnt]#
搜索中文正常
[root@wiki mnt]# search 归档日志
Coreseek Fulltext 3.2 [ Sphinx 0.9.9-release (r2117)]
Copyright (c) 2007-2010,
Beijing Choice Software Technologies Inc (http://www.coreseek.com)
using config file '/usr/local/coreseek/etc/csft.conf'...
index 'wiki_main': query '归档日志 ': returned 390 matches of 390 total in 0.011 sec
displaying matches:
1. document=9073, weight=2, page_namespace=0, old_id=27422
page_title=RMAN????????RMAN-06059??????
page_namespace=0
2. document=9163, weight=2, page_namespace=0, old_id=30658
page_title=?????????????
page_namespace=0
3. document=10660, weight=2, page_namespace=0, old_id=20002
page_title=?????????????????????
page_namespace=0
4. document=10872, weight=2, page_namespace=0, old_id=44284
page_title=??????????????
page_namespace=0
5. document=15395, weight=2, page_namespace=0, old_id=36425
page_title=Oracle???????????
page_namespace=0
6. document=16402, weight=2, page_namespace=0, old_id=31181
page_title=????DG???????
page_namespace=0
7. document=16785, weight=2, page_namespace=0, old_id=44270
page_title=Oracle_10g_???????????
page_namespace=0
8. document=16798, weight=2, page_namespace=0, old_id=32039
page_title=Oracle_10g_?????Archive_Log_?
page_namespace=0
9. document=17508, weight=2, page_namespace=0, old_id=33279
page_title=?restore_archivelog???????????
page_namespace=0
10. document=17746, weight=2, page_namespace=0, old_id=44273
page_title=RMAN??????????
page_namespace=0
11. document=18224, weight=2, page_namespace=0, old_id=34858
page_title=RMAN???????????????
page_namespace=0
12. document=324, weight=1, page_namespace=0, old_id=43429
page_title=A??????????apache???cronolog??
page_namespace=0
13. document=1876, weight=1, page_namespace=0, old_id=41767
page_title=Linux_????????
page_namespace=0
14. document=4062, weight=1, page_namespace=0, old_id=40270
page_title=????????
page_namespace=0
15. document=4460, weight=1, page_namespace=0, old_id=7986
page_title=??_Subversion??
page_namespace=0
16. document=4949, weight=1, page_namespace=0, old_id=8653
page_title=Windows?Subversion???????
page_namespace=0
17. document=5077, weight=1, page_namespace=0, old_id=44042
page_title=Oracle???????
page_namespace=0
18. document=6062, weight=1, page_namespace=0, old_id=38080
page_title=Apache????/??/??
page_namespace=0
19. document=6145, weight=1, page_namespace=0, old_id=17752
page_title=Linux_????????????
page_namespace=0
20. document=6721, weight=1, page_namespace=0, old_id=35340
page_title=Windows_XP_+_VMWare_Server_1.0.6_+_CentOS_5.2_+_Oracle_10g_Rac_(10.2.0.1)??
page_namespace=0
words:
1. '归档日志': 390 documents, 1533 hits
index 'wiki_incremental': query '归档日志 ': returned 0 matches of 0 total in 0.001 sec
words:
1. '归档日志': 0 documents, 0 hits
配置文件 (/usr/local/coreseek/etc/csft.conf):
source src_wiki_main
{
# data source
type = mysql
sql_host = localhost
sql_user = atyu30
sql_pass = atyu30
sql_db = wikidb
# these two are optional
sql_port = 3306
sql_sock = /var/lib/mysql/mysql.sock
# pre-query, executed before the main fetch query
sql_query_pre = SET NAMES utf8
sql_query_pre = SET SESSION query_cache_type=OFF
#phrase_boundary = ., ?, U+2026
# main document fetch query - change the table names if you are using a prefix
sql_query = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id
# attribute columns
sql_attr_uint = page_namespace
sql_attr_uint = old_id
# uncomment next line to collect all category ids for a category filter
#sql_attr_multi = uint category from query; SELECT cl_from, page_id AS category FROM categorylinks, page WHERE page_title=cl_to AND page_namespace=14
# optional - used by command-line search utility to display document information
sql_query_info = SELECT page_title, page_namespace FROM page WHERE page_id=$id
}
source src_wiki_incremental : src_wiki_main
{
# adjust this query based on the time you run the full index
# in this case, full index runs at 3 AM (server time) which translates to 7 AM UTC
sql_query = SELECT page_id, page_title, page_namespace, old_id, old_text FROM page, revision, text WHERE rev_id=page_latest AND old_id=rev_text_id AND page_touched>=DATE_FORMAT(CURDATE(), '%Y%m%d070000')
# all other parameters are inherited from the parent source (src_wiki_main).
}
index wiki_main
{
# which document source to index
source = src_wiki_main
# this is path and index file name without extension
# you may need to change this path or create this folder
path = /usr/local/coreseek/var/data/wiki_main
# docinfo (ie. per-document attribute values) storage strategy
docinfo = extern
# morphology
morphology = stem_en
# stopwords file
stopwords = /usr/local/coreseek/lib/stopwords.txt
# charset dict
charset_dictpath = /usr/local/coreseek/lib/
# minimum word length
min_word_len = 2
# the next 2 lines enable wildcard (*) searches; comment them out to disable
min_infix_len = 1
enable_star = 1
# charset encoding type
#charset_type = utf-8
charset_type = zh_cn.utf-8
ignore_chars = U+AD
# charset definition and case folding rules "table"
}
index wiki_incremental : wiki_main
{
path = /usr/local/coreseek/var/data/wiki_incremental
source = src_wiki_incremental
}
indexer
{
# memory limit (default is 32M)
mem_limit = 128M
}
searchd
{
# address and/or port on which the search daemon will listen and accept connections;
# the IP address is optional (default is to listen on all addresses),
# so a bare port number is sufficient
#listen = 127.0.0.1
#listen = 192.168.36.139
listen = 3312
#listen = /var/run/searchd.sock
# PID file, searchd process ID file name
# mandatory
pid_file = /usr/local/coreseek/var/log/searchd.pid
# searchd run info is logged here - create or change the folder
log = /usr/local/coreseek/var/log/coreseek/searchd.log
# all the search queries are logged here
query_log = /usr/local/coreseek/var/log/coreseek/query.log
# client read timeout, seconds
read_timeout = 5
# maximum amount of children to fork
max_children = 30
#phrase_boundary = ., ?, U+2026
#html_strip = 1
# maximum amount of matches this daemon would ever retrieve
# from each index and serve to client
max_matches = 1000
}
处理办法:
乔楚开源搜索(5601680) 21:58:28:
morphology = stem_en
min_infix_len = 1
enable_star = 1
ignore_chars = U+AD
将以上四行配置从 index wiki_main 中全部删掉 (words 输出中的 categori 正是 stem_en 对 category 做词干提取后的结果), 然后:
1. 关闭searchd
2. 重新索引
3. 重启searchd
完成以上步骤后, 英文搜索恢复正常。
感谢 乔楚开源搜索 的帮助
相关阅读 更多 +