使用zend Framework的lucene进行全文检索——中文分词
时间:2010-09-20 来源:Dufe王彬
zf本身没有提供中文分词算法,具体应用中要自己写。我这里使用简单的二元分词算法(只在utf-8下工作正常,对于其他字符集,请修改程序)。
第一步、如何测试分词算法的输出。
在zf 的手册中没有提到,我这里简单给个例子:
<?php
// Dump the tokens produced by the currently configured default analyzer,
// to inspect how a piece of text gets split.
$analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

$value = 'this is a test!';
$analyzer->setInput($value, 'utf-8');

// Start from an empty list so print_r() still gets a defined array when the
// analyzer yields no tokens at all.
$tokens = array();
while (($token = $analyzer->nextToken()) !== null) {
    $tokens[] = $token;
}

print_r($tokens);
?>
这里使用的是zf默认的分词算法Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive(它继承自Zend_Search_Lucene_Analysis_Analyzer_Common_Text,并额外把单词转为小写)。另外你可以加上一个过滤方法,比如说过滤掉一些单词,如"is"、"a"之类的。
第二步、自定义自己的分词算法,可以参考手册,或者自己看Zend_Search_Lucene_Analysis_Analyzer_Common_Text类的实现。
其中要注意的是过滤这点。由于我们的分词是二元分词,如果要过滤一些比如“的”、“啊”之类的单字,是无法使用内置的Tokens Filtering的(过滤器拿到的已经是两个字组成的词)。我们需要在分词前先把它们过滤掉。这个可以在reset()里面实现。
例子。
<?php
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

/**
 * Bigram analyzer for mixed ASCII / CJK text.
 *
 * Runs of ASCII letters and digits become one token each. Runs of non-ASCII
 * characters are split into overlapping two-character tokens (bigrams).
 * The input is expected to be UTF-8; other encodings are not supported.
 *
 * Token start/end positions are byte offsets into the input as rewritten by
 * reset(), not into the string originally passed to setInput().
 */
class Phpbean_Lucene_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Byte offset of the read cursor inside $this->_input.
     *
     * @var integer
     */
    private $_position = 0;

    /**
     * Strings replaced by a blank before tokenization.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the stop words that are removed from the text before it is split.
     *
     * Token filters only ever see finished bigrams, so single-character stop
     * words have to be removed from the raw input instead.
     *
     * @param array $cnStopWords list of strings to blank out
     */
    public function setCnStopWords($cnStopWords)
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds the cursor and replaces punctuation and stop words with single
     * blanks, so nextToken() only has to treat ' ' as a separator.
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input === null) {
            return;
        }

        $search = array(
            // ASCII punctuation, plus tab / line-feed / carriage-return.
            // The escape sequences matter: a plain "t", "n" or "r" would
            // strip those letters out of every word.
            ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')',
            '?', '-', "\t", "\n", "\r", '\'', '<', '>', '$', '&', '%', '#',
            '@', '+', '=', '{', '}', '[', ']',
            // Full-width / CJK punctuation.
            // NOTE(review): the published listing showed ASCII duplicates for
            // several of these; they are presumably full-width marks that got
            // flattened - confirm against the intended separator set.
            ':', ')', '(', '.', '。', ',', '!', ';',
            '“', '”', '‘', '’', '[', ']', '、', '—',
            "\xE3\x80\x80", // U+3000 ideographic space
            '《', '》', '-', '…', '【', '】',
        );

        $this->_input = str_replace($search, ' ', $this->_input);
        // Replace with a blank (not an empty string) so the characters on
        // either side of a stop word never get joined into a bigram.
        $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A non-ASCII character that has no non-ASCII neighbour to pair with
     * produces no token.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $length = strlen($this->_input);

        while ($this->_position < $length) {
            // Skip separators.
            while ($this->_position < $length
                && $this->_input[$this->_position] == ' ') {
                $this->_position++;
            }
            if ($this->_position >= $length) {
                // Only separators were left; do not read past the end.
                break;
            }

            $termStartPosition = $this->_position;
            $isCnWord          = false;
            $lastCharWidth     = 0;

            if (ord($this->_input[$this->_position]) > 127) {
                // Consume up to two multi-byte characters.
                $charCount = 0;
                while ($this->_position < $length
                    && ord($this->_input[$this->_position]) > 127) {
                    // Clamp so a truncated trailing sequence cannot push the
                    // cursor beyond the end of the input.
                    $lastCharWidth = min(
                        $this->_utf8CharWidth($this->_input[$this->_position]),
                        $length - $this->_position
                    );
                    $this->_position += $lastCharWidth;
                    $charCount++;
                    if ($charCount == 2) {
                        $isCnWord = true;
                        break;
                    }
                }
                if (!$isCnWord) {
                    // A single character cannot form a bigram.
                    continue;
                }
            } else {
                while ($this->_position < $length
                    && ctype_alnum($this->_input[$this->_position])) {
                    $this->_position++;
                }
                if ($this->_position == $termStartPosition) {
                    // An ASCII byte that is neither alphanumeric nor a known
                    // separator (e.g. "*", "_", "|"): step over it instead of
                    // ending the stream and losing the rest of the text.
                    $this->_position++;
                    continue;
                }
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                substr(
                    $this->_input,
                    $termStartPosition,
                    $this->_position - $termStartPosition
                ),
                $termStartPosition,
                $this->_position
            );
            $token = $this->normalize($token);

            if ($isCnWord) {
                // Step back one character so consecutive bigrams overlap:
                // "ABC" yields "AB" and then "BC".
                $this->_position -= $lastCharWidth;
            }

            if ($token !== null) {
                return $token;
            }
            // The token was rejected by a filter; keep scanning.
        }

        return null;
    }

    /**
     * Byte length of the UTF-8 sequence introduced by the given lead byte.
     *
     * Stray continuation bytes and invalid lead bytes count as one byte so
     * the cursor always moves forward.
     *
     * @param string $byte single byte taken from the input
     * @return integer number of bytes to advance (1 to 4)
     */
    private function _utf8CharWidth($byte)
    {
        $code = ord($byte);

        if ($code >= 0xF0 && $code <= 0xF7) {
            return 4;
        }
        if ($code >= 0xE0 && $code <= 0xEF) {
            return 3;
        }
        if ($code >= 0xC0 && $code <= 0xDF) {
            return 2;
        }

        return 1;
    }
}
?>
测试分词输出demo
<?php
// Demo: print the tokens Phpbean_Lucene_Analyzer produces for a mixed
// English / Chinese sentence.

// English stop words are dropped after tokenization by a regular token filter.
$stopWords       = array('a', 'an', 'at', 'the', 'and', 'or', 'is', 'am');
$stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($stopWords);

$analyzer = new Phpbean_Lucene_Analyzer();

// Chinese stop words are handed to the analyzer itself, which receives them
// through setCnStopWords() rather than through a token filter.
$cnStopWords = array('的');
$analyzer->setCnStopWords($cnStopWords);
$analyzer->addFilter($stopWordsFilter);

$value = 'this is " a test【中文】的测试';
$analyzer->setInput($value, 'utf-8');

// Start from an empty list so print_r() still gets a defined array when the
// analyzer yields no tokens at all.
$tokens = array();
while (($token = $analyzer->nextToken()) !== null) {
    $tokens[] = $token;
}

print_r($tokens);
?>
比如上面的输出就是“this”、“test”、“中文”、“测试”四个结果,符合我们的需要。
相关阅读 更多 +