lucene 构建个人搜索引擎

时间：2006-12-21 来源：njuguo

用 lucene 构建搜索引擎

从 http://jakarta.apache.org/lucene/docs/index.html 下载 lucene 开发包。我用的是 1.9 版，因为 1.x 和 2.0 的部分方法不同，而 2.0 版本目前还找不到详细的说明文档~

下面就是在 lucene 上构建搜索引擎的一些代码：

package search;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Command-line tool that builds a Lucene index over the .htm, .html and .txt
 * files found (recursively) under a data directory.
 */
public class Indexer{

    /**
     * Indexes the hard-coded data directory into the hard-coded index
     * directory and reports how many documents were indexed and how long it took.
     *
     * @param args unused
     * @throws Exception if the data directory is invalid or indexing fails
     */
    public static void main(String [] args) throws Exception{

        File indexDir = new File("C:\\index"); // directory where the index is stored
        File dataDir = new File("C:\\arale\\output"); // directory holding the files to index

        long start = System.currentTimeMillis();
        int numIndexed = index(indexDir, dataDir); // build the index and count documents
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Creates a fresh index in {@code indexDir} from the files under {@code dataDir}.
     *
     * @param indexDir directory the index is written to
     * @param dataDir  root directory whose files are indexed
     * @return number of documents in the index after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a directory
     * @throws Exception   if reading files or writing the index fails
     */
    public static int index(File indexDir, File dataDir) throws Exception{
        // Fail before the writer is opened: the writer is created with create=true,
        // so continuing with a bad data directory would replace a good index with an empty one.
        if(!dataDir.exists() || !dataDir.isDirectory()){
            throw new IOException(dataDir + " does not exist or is not a directory.");
        }
        IndexWriter writer = new IndexWriter(indexDir, new MMAnalyzer(), true);
        try{
            writer.setUseCompoundFile(false);
            indexDirectory(writer, dataDir);
            int numIndexed = writer.docCount();
            writer.optimize();
            return numIndexed;
        }finally{
            // Always close the writer, even when indexing fails part-way.
            writer.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every .htm, .html and .txt file.
     *
     * @param writer open index writer that receives the documents
     * @param dir    directory to walk
     * @throws IOException if a file cannot be read or added to the index
     */
    private static void indexDirectory(IndexWriter writer, File dir) throws IOException{
        File [] files = dir.listFiles();
        if(files == null){
            // listFiles() returns null when the directory cannot be read.
            System.out.println("Skipping unreadable directory " + dir);
            return;
        }
        for(int i = 0; i < files.length; i++){
            File f = files[i];
            if(f.isDirectory()){
                indexDirectory(writer, f);
                continue;
            }
            String name = f.getName();
            // Only these file types are indexed for now.
            if(name.endsWith(".htm") || name.endsWith(".html") || name.endsWith(".txt")){
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds one file to the index: its text under "content" and its canonical
     * path under "path".
     *
     * @param writer open index writer that receives the document
     * @param f      file to index; silently ignored if it no longer exists
     * @throws IOException if the file cannot be read or added to the index
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException{
        if(!f.exists()) return;
        String path = f.getCanonicalPath();
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm this matches the encoding of the crawled files.
        FileReader reader = new FileReader(f);
        try{
            Document doc = new Document();
            doc.add(Field.Text("content", reader)); // file body
            doc.add(Field.Keyword("path", path)); // file location
            writer.addDocument(doc);
        }finally{
            // Release the file handle whether or not addDocument succeeded.
            reader.close();
        }
        System.out.println("Indexing " + path + " OK");
    }
}

执行过这个程序后，lucene 就为相应目录下的文件建立了索引（索引存放在指定目录中）。下面是在该索引上进行搜索的程序：

package search;

import java.io.File;
import java.util.Date;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Command-line tool that runs a query against the "content" field of a Lucene
 * index and prints the "path" field of every matching document.
 */
public class Searcher{

    /**
     * Searches the hard-coded index directory for a hard-coded query string.
     *
     * @param args unused
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public static void main(String [] args) throws Exception{
        File indexDir = new File("C:\\index");
        String q = "钱学森";

        if(!indexDir.exists() || !indexDir.isDirectory()){
            System.out.println(indexDir + " does not exist or is not a directory.");
            return; // nothing to search
        }
        search(indexDir, q);
    }

    /**
     * Runs {@code q} against the index in {@code indexDir} and prints the hit
     * count, the elapsed search time, and the stored path of each hit.
     *
     * @param indexDir directory containing the index
     * @param q        query string, parsed against the "content" field
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public static void search(File indexDir, String q) throws Exception{
        // Parse first so a malformed query fails before any index resource is opened.
        Query query = QueryParser.parse(q, "content", new MMAnalyzer());

        Directory fsDir = FSDirectory.getDirectory(indexDir, false);
        try{
            IndexSearcher is = new IndexSearcher(fsDir);
            try{
                long start = System.currentTimeMillis();
                Hits hits = is.search(query);
                long end = System.currentTimeMillis();

                System.out.println("Found " + hits.length() + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
                // Hits must be read while the searcher is still open.
                for(int i = 0; i < hits.length(); i++){
                    Document doc = hits.doc(i);
                    System.out.println(doc.get("path"));
                }
            }finally{
                is.close();
            }
        }finally{
            fsDir.close();
        }
    }
}

执行过这个程序后，lucene 将会打印出一些搜索结果，非常有意思，呵呵，试试吧！
关于 lucene 还有很多知识，比如索引优化、搜索方式、搜索结果排序（评分）、搜索结果关键字高亮显示等等，有兴趣的可以看看《Lucene in Action》这本书，目前这本书的中文翻译也出版了，
可以去看看哦。