lucene 构建个人搜索引擎
时间:2006-12-21 来源:njuguo
用 lucene 构建搜索引擎
从 http://jakarta.apache.org/lucene/docs/index.html 下载 lucene 开发包。我用的是 1.9 版,因为 1.x 和 2.0 的部分方法有差异,而 2.0 版本目前还找不到详细的说明文档。
下面就是在 lucene 上构建搜索引擎的一些代码。注意:代码中用到的中文分词器 jeasy.analysis.MMAnalyzer 不属于 lucene 本身,需要另外把对应的分词包加入 classpath。首先是建立索引的程序:
package search;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Command-line tool that builds a Lucene index over the text/HTML files
 * found under a data directory (searched recursively).
 */
public class Indexer{
    /**
     * Indexes the hard-coded data directory into the hard-coded index
     * directory and prints how many documents were indexed and how long it took.
     *
     * @param args unused
     * @throws Exception if the data directory is invalid or indexing fails
     */
    public static void main(String [] args) throws Exception{
        File indexDir = new File("C:\\index"); // directory where the index is stored
        File dataDir = new File("C:\\arale\\output"); // directory containing the files to index
        long start = System.currentTimeMillis();
        int numIndexed = index(indexDir, dataDir); // build the index and get the document count
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Creates a fresh index in {@code indexDir} from the files under {@code dataDir}.
     *
     * @param indexDir directory the index is written to
     * @param dataDir  directory whose files are indexed
     * @return the number of documents in the index after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a directory
     * @throws Exception   if indexing fails
     */
    public static int index(File indexDir, File dataDir) throws Exception{
        // Validate BEFORE opening the writer: the writer is opened in "create"
        // mode, so continuing with a bad data directory would replace an
        // existing index and then fail anyway.
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory.");
        }
        IndexWriter writer = new IndexWriter(indexDir, new MMAnalyzer(), true);
        try {
            writer.setUseCompoundFile(false);
            indexDirectory(writer, dataDir);
            int numIndexed = writer.docCount();
            writer.optimize();
            return numIndexed;
        } finally {
            // Always close the writer, even when indexing fails part-way.
            writer.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every file accepted by
     * {@link #isIndexable(File)}.
     *
     * @param writer open index writer that receives the documents
     * @param dir    directory to walk
     * @throws IOException if reading or indexing a file fails
     */
    private static void indexDirectory(IndexWriter writer, File dir) throws IOException{
        File [] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be listed;
            // report it and skip this directory instead of failing with an NPE.
            System.err.println("Cannot list " + dir + ", skipping.");
            return;
        }
        for (int i = 0; i < files.length; i++) {
            File f = files[i];
            if (f.isDirectory()) {
                indexDirectory(writer, f);
            } else if (isIndexable(f)) {
                indexFile(writer, f);
            }
        }
    }

    /** Returns true for the file types indexed for now: .htm, .html and .txt (case-sensitive). */
    private static boolean isIndexable(File f){
        String name = f.getName();
        return name.endsWith(".htm") || name.endsWith(".html") || name.endsWith(".txt");
    }

    /**
     * Adds one file to the index as a document with a "content" field (the
     * file's text) and a "path" field (the file's canonical path).
     *
     * @param writer open index writer that receives the document
     * @param f      file to index; silently ignored if it no longer exists
     * @throws IOException if the file cannot be read or added to the index
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException{
        if (!f.exists()) {
            return;
        }
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that this matches the encoding of the indexed files.
        FileReader reader = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(Field.Text("content", reader)); // index the file contents
            doc.add(Field.Keyword("path", f.getCanonicalPath())); // record the file path
            writer.addDocument(doc);
        } finally {
            // Release the file handle even if addDocument throws; closing a
            // reader that is already closed has no effect.
            reader.close();
        }
        System.out.println("Indexing " + f.getCanonicalPath() + " OK");
    }
}
运行这个程序后,lucene 就会为指定目录下的文件建立索引(索引存放在指定的索引目录中)。下面是在该索引上进行搜索的程序:
package search;
import java.io.File;
import java.util.Date;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Command-line tool that runs a query against an existing Lucene index and
 * prints the "path" field of every matching document.
 */
public class Searcher{
    /**
     * Searches the hard-coded index directory for a hard-coded query string.
     *
     * @param args unused
     * @throws Exception if the search fails
     */
    public static void main(String [] args) throws Exception{
        File indexDir = new File("C:\\index");
        String q = "钱学森";
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            System.out.println(indexDir + " does not exist or is not a directory.");
            // Stop here: searching a missing index directory can only fail.
            return;
        }
        search(indexDir, q);
    }

    /**
     * Parses {@code q} against the "content" field, runs it on the index in
     * {@code indexDir}, and prints the hit count, the elapsed search time and
     * the "path" of each hit.
     *
     * @param indexDir directory containing the index
     * @param q        query string
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(File indexDir, String q) throws Exception{
        Directory fsDir = FSDirectory.getDirectory(indexDir, false);
        IndexSearcher is = new IndexSearcher(fsDir);
        try {
            Query query = QueryParser.parse(q, "content", new MMAnalyzer());
            long start = System.currentTimeMillis();
            Hits hits = is.search(query);
            long end = System.currentTimeMillis();
            System.out.println("Found " + hits.length() + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
            // Documents are read while the searcher is still open.
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println(doc.get("path"));
            }
        } finally {
            // Release the searcher even when parsing or searching throws.
            is.close();
        }
    }
}
执行过这个程序后,lucene 将会打印出一些搜索结果,非常有意思,呵呵,试试吧!
关于 lucene 还有很多知识,比如索引优化,搜索方式,搜索结果排序(评分),搜索结果关键字高亮显示等等,有兴趣的可以看看这本书《lucene in action》,目前这本书的中文翻译也出版了,
可以去看看哦。
从 http://jakarta.apache.org/lucene/docs/index.html 下载 lucene 开发包。我用的是 1.9 版,因为 1.x 和 2.0 的部分方法有差异,而 2.0 版本目前还找不到详细的说明文档。
下面就是在 lucene 上构建搜索引擎的一些代码。注意:代码中用到的中文分词器 jeasy.analysis.MMAnalyzer 不属于 lucene 本身,需要另外把对应的分词包加入 classpath。首先是建立索引的程序:
package search;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
/**
 * Command-line tool that builds a Lucene index over the text/HTML files
 * found under a data directory (searched recursively).
 */
public class Indexer{
    /**
     * Indexes the hard-coded data directory into the hard-coded index
     * directory and prints how many documents were indexed and how long it took.
     *
     * @param args unused
     * @throws Exception if the data directory is invalid or indexing fails
     */
    public static void main(String [] args) throws Exception{
        File indexDir = new File("C:\\index"); // directory where the index is stored
        File dataDir = new File("C:\\arale\\output"); // directory containing the files to index
        long start = System.currentTimeMillis();
        int numIndexed = index(indexDir, dataDir); // build the index and get the document count
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Creates a fresh index in {@code indexDir} from the files under {@code dataDir}.
     *
     * @param indexDir directory the index is written to
     * @param dataDir  directory whose files are indexed
     * @return the number of documents in the index after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a directory
     * @throws Exception   if indexing fails
     */
    public static int index(File indexDir, File dataDir) throws Exception{
        // Validate BEFORE opening the writer: the writer is opened in "create"
        // mode, so continuing with a bad data directory would replace an
        // existing index and then fail anyway.
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory.");
        }
        IndexWriter writer = new IndexWriter(indexDir, new MMAnalyzer(), true);
        try {
            writer.setUseCompoundFile(false);
            indexDirectory(writer, dataDir);
            int numIndexed = writer.docCount();
            writer.optimize();
            return numIndexed;
        } finally {
            // Always close the writer, even when indexing fails part-way.
            writer.close();
        }
    }

    /**
     * Recursively walks {@code dir} and indexes every file accepted by
     * {@link #isIndexable(File)}.
     *
     * @param writer open index writer that receives the documents
     * @param dir    directory to walk
     * @throws IOException if reading or indexing a file fails
     */
    private static void indexDirectory(IndexWriter writer, File dir) throws IOException{
        File [] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be listed;
            // report it and skip this directory instead of failing with an NPE.
            System.err.println("Cannot list " + dir + ", skipping.");
            return;
        }
        for (int i = 0; i < files.length; i++) {
            File f = files[i];
            if (f.isDirectory()) {
                indexDirectory(writer, f);
            } else if (isIndexable(f)) {
                indexFile(writer, f);
            }
        }
    }

    /** Returns true for the file types indexed for now: .htm, .html and .txt (case-sensitive). */
    private static boolean isIndexable(File f){
        String name = f.getName();
        return name.endsWith(".htm") || name.endsWith(".html") || name.endsWith(".txt");
    }

    /**
     * Adds one file to the index as a document with a "content" field (the
     * file's text) and a "path" field (the file's canonical path).
     *
     * @param writer open index writer that receives the document
     * @param f      file to index; silently ignored if it no longer exists
     * @throws IOException if the file cannot be read or added to the index
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException{
        if (!f.exists()) {
            return;
        }
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that this matches the encoding of the indexed files.
        FileReader reader = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(Field.Text("content", reader)); // index the file contents
            doc.add(Field.Keyword("path", f.getCanonicalPath())); // record the file path
            writer.addDocument(doc);
        } finally {
            // Release the file handle even if addDocument throws; closing a
            // reader that is already closed has no effect.
            reader.close();
        }
        System.out.println("Indexing " + f.getCanonicalPath() + " OK");
    }
}
运行这个程序后,lucene 就会为指定目录下的文件建立索引(索引存放在指定的索引目录中)。下面是在该索引上进行搜索的程序:
package search;
import java.io.File;
import java.util.Date;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Command-line tool that runs a query against an existing Lucene index and
 * prints the "path" field of every matching document.
 */
public class Searcher{
    /**
     * Searches the hard-coded index directory for a hard-coded query string.
     *
     * @param args unused
     * @throws Exception if the search fails
     */
    public static void main(String [] args) throws Exception{
        File indexDir = new File("C:\\index");
        String q = "钱学森";
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            System.out.println(indexDir + " does not exist or is not a directory.");
            // Stop here: searching a missing index directory can only fail.
            return;
        }
        search(indexDir, q);
    }

    /**
     * Parses {@code q} against the "content" field, runs it on the index in
     * {@code indexDir}, and prints the hit count, the elapsed search time and
     * the "path" of each hit.
     *
     * @param indexDir directory containing the index
     * @param q        query string
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(File indexDir, String q) throws Exception{
        Directory fsDir = FSDirectory.getDirectory(indexDir, false);
        IndexSearcher is = new IndexSearcher(fsDir);
        try {
            Query query = QueryParser.parse(q, "content", new MMAnalyzer());
            long start = System.currentTimeMillis();
            Hits hits = is.search(query);
            long end = System.currentTimeMillis();
            System.out.println("Found " + hits.length() + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
            // Documents are read while the searcher is still open.
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println(doc.get("path"));
            }
        } finally {
            // Release the searcher even when parsing or searching throws.
            is.close();
        }
    }
}
执行过这个程序后,lucene 将会打印出一些搜索结果,非常有意思,呵呵,试试吧!
关于 lucene 还有很多知识,比如索引优化,搜索方式,搜索结果排序(评分),搜索结果关键字高亮显示等等,有兴趣的可以看看这本书《lucene in action》,目前这本书的中文翻译也出版了,
可以去看看哦。
相关阅读 更多 +