【转】lucene3.0入门范例 _Web前端

【转】lucene3.0入门实例

转自：http://cumtfirefly.iteye.com/blog/543664

lucene3.0已于2009-11-25发布啦，但网上的入门实例都是针对lucene3.0以前的，相对于以前的版本，貌似改动不小。
本人从头开始学习lucene，现在用的是《lucene in action中文版》，结合lucene3.0文档写了个入门实例，可供像我一样直接从lucene3.0开始学习的新手参考！

入门实例：

1.预处理：先把网上下载的一个《三国演义》电子书“三国演义.txt”（可用其他代替，呵呵）切割成多个小文件。

Java代码

/**
 * Preprocessing step: splits one large text file into several small files so
 * that each piece can later be indexed as a separate document.
 *
 * @author ht
 */
public class FilePreprocess {
    /** A small file is flushed once its buffered text reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三国演义.txt"; // the original large file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large file into small files named {@code output0.txt},
     * {@code output1.txt}, ... Lines are never broken: a small file is written
     * as soon as the accumulated lines reach {@code MAX_SIZE} bytes in the
     * platform-default encoding. Every line is terminated with {@code "\r\n"}.
     * No empty trailing file is produced. I/O errors are reported on the
     * console and are not rethrown.
     *
     * @param file the large file to split
     * @param outputpath path prefix of the small files; it is concatenated
     *     directly with the file name, so it must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        String filename = "output";
        int filePointer = 0;

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                buffer.append(line).append("\r\n");
                // Size is measured in encoded bytes, not characters.
                if (buffer.toString().getBytes().length >= MAX_SIZE) {
                    writeChunk(outputpath + filename + filePointer + ".txt", buffer.toString());
                    filePointer++;
                    buffer.setLength(0);
                }
            }
            // Write the remainder only if there is one, so that an input ending
            // exactly on a chunk boundary does not yield an empty file.
            if (buffer.length() > 0) {
                writeChunk(outputpath + filename + filePointer + ".txt", buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the input file, even when splitting failed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Writes {@code text} to the file at {@code path}, closing the file even on failure. */
    private static void writeChunk(String path, String text) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        try {
            writer.write(text);
        } finally {
            writer.close();
        }
    }
}

/**
 * Preprocessing step: splits one large text file into several small files so
 * that each piece can later be indexed as a separate document.
 *
 * @author ht
 */
public class FilePreprocess {
    /** A small file is flushed once its buffered text reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三国演义.txt"; // the original large file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large file into small files named {@code output0.txt},
     * {@code output1.txt}, ... Lines are never broken: a small file is written
     * as soon as the accumulated lines reach {@code MAX_SIZE} bytes in the
     * platform-default encoding. Every line is terminated with {@code "\r\n"}.
     * No empty trailing file is produced. I/O errors are reported on the
     * console and are not rethrown.
     *
     * @param file the large file to split
     * @param outputpath path prefix of the small files; it is concatenated
     *     directly with the file name, so it must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        String filename = "output";
        int filePointer = 0;

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                buffer.append(line).append("\r\n");
                // Size is measured in encoded bytes, not characters.
                if (buffer.toString().getBytes().length >= MAX_SIZE) {
                    writeChunk(outputpath + filename + filePointer + ".txt", buffer.toString());
                    filePointer++;
                    buffer.setLength(0);
                }
            }
            // Write the remainder only if there is one, so that an input ending
            // exactly on a chunk boundary does not yield an empty file.
            if (buffer.length() > 0) {
                writeChunk(outputpath + filename + filePointer + ".txt", buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the input file, even when splitting failed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Writes {@code text} to the file at {@code path}, closing the file even on failure. */
    private static void writeChunk(String path, String text) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        try {
            writer.write(text);
        } finally {
            writer.close();
        }
    }
}

2.用lucene3.0生成索引类:用lucene3.0对生成的多个小文件进行索引，中文分词用的是lucene3.0自带的StandardAnalyzer.

Java代码

/** ?
?*?@author?ht ?
?*?索引生成 ?
?* ?
?*/??
public?class?Indexer?{ ??
???private?static?String?INDEX_DIR?=?"D:\\test\\index";//索引存放目录 ??
???private?static?String?DATA_DIR?=?"D:\\test\\small\\";//小文件存放的目录 ??
???? ??
??public?static?void?main(String[]?args)?throws?Exception?{ ??
? ??
????long?start?=?new?Date().getTime(); ??
????int?numIndexed?=?index(new?File(INDEX_DIR),?new?File(DATA_DIR));//调用index方法 ??
????long?end?=?new?Date().getTime(); ??
????System.out.println("Indexing?"?+?numIndexed?+?"?files?took?"?+?(end?-?start)?+?"?milliseconds"); ??
??} ??
??
??/**索引dataDir下的.txt文件，并储存在indexDir下，返回索引的文件数量 ?
?*?@param?indexDir ?
?*?@param?dataDir ?
?*?@return?int? ?
?*?@throws?IOException ?
?*/??
public?static?int?index(File?indexDir,?File?dataDir)?throws?IOException?{ ??
??
????if?(!dataDir.exists()?||?!dataDir.isDirectory())?{ ??
??????throw?new?IOException(dataDir?+?"?does?not?exist?or?is?not?a?directory"); ??
????} ??
??
????IndexWriter?writer?=?new?IndexWriter(FSDirectory.open(indexDir),?new?StandardAnalyzer(Version.LUCENE_CURRENT),?true,? ??
IndexWriter.MaxFieldLength.LIMITED);//有变化的地方 ??
???? ??
????indexDirectory(writer,?dataDir); ??
????int?numIndexed?=?writer.numDocs(); ??
????writer.optimize(); ??
????writer.close(); ??
????return?numIndexed; ??
??} ??
??
??/**循环遍历目录下的所有.txt文件并进行索引 ?
?*?@param?writer ?
?*?@param?dir ?
?*?@throws?IOException ?
?*/??
private?static?void?indexDirectory(IndexWriter?writer,?File?dir) ??
????throws?IOException?{ ??
??
????File[]?files?=?dir.listFiles(); ??
??
????for?(int?i?=?0;?i?<?files.length;?i++)?{ ??
??????File?f?=?files[i]; ??
??????if?(f.isDirectory())?{ ??
????????indexDirectory(writer,?f);??//?recurse ??
??????}?else?if?(f.getName().endsWith(".txt"))?{ ??
????????indexFile(writer,?f); ??
??????} ??
????} ??
??} ??
??
??/**对单个txt文件进行索引 ?
?*?@param?writer ?
?*?@param?f ?
?*?@throws?IOException ?
?*/??
private?static?void?indexFile(IndexWriter?writer,?File?f) ??
????throws?IOException?{ ??
???? ??
????if?(f.isHidden()?||?!f.exists()?||?!f.canRead())?{ ??
??????return; ??
????} ??
??
????System.out.println("Indexing?"?+?f.getCanonicalPath()); ??
????Document?doc?=?new?Document(); ??
????doc.add(new?Field("contents",new?FileReader(f)));//有变化的地方 ??
????doc.add(new?Field("filename",f.getCanonicalPath(),Field.Store.YES,?Field.Index.ANALYZED));//有变化的地方 ??
? ??
????writer.addDocument(doc); ??
??} ??
}??

/**
 * Builds a Lucene 3.0 index over the small text files produced by the
 * preprocessing step.
 *
 * @author ht
 */
public class Indexer {
    private static final String INDEX_DIR = "D:\\test\\index"; // directory where the index is stored
    private static final String DATA_DIR = "D:\\test\\small\\"; // directory holding the small files

    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR)); // run the indexing
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Indexes the .txt files under {@code dataDir} (recursively) and stores
     * the index in {@code indexDir}.
     *
     * @param indexDir directory that receives the index
     * @param dataDir directory containing the files to index
     * @return the number of documents in the index after indexing
     * @throws IOException if {@code dataDir} is not an existing directory or
     *     indexing fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }

        // Changed in Lucene 3.0: the directory is opened via FSDirectory.open and the
        // analyzer takes a Version. The boolean argument is the "create" flag.
        IndexWriter writer = new IndexWriter(
                FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT),
                true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        } finally {
            // Close even when indexing fails so the writer does not stay open.
            writer.close();
        }
    }

    /**
     * Walks {@code dir} recursively and indexes every file whose name ends
     * with ".txt".
     *
     * @param writer the open index writer
     * @param dir directory to scan
     * @throws IOException if the directory cannot be listed or a file cannot
     *     be indexed
     */
    private static void indexDirectory(IndexWriter writer, File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null on an I/O error or when dir is not a directory.
            throw new IOException("Cannot list files in " + dir);
        }

        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds a single text file to the index: field "contents" is fed from the
     * file's text and field "filename" stores the canonical path. Hidden,
     * missing, or unreadable files are skipped silently.
     *
     * @param writer the open index writer
     * @param f the file to index
     * @throws IOException if the file cannot be read or added
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }

        System.out.println("Indexing " + f.getCanonicalPath());
        FileReader contents = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("contents", contents)); // changed in Lucene 3.0
            doc.add(new Field("filename", f.getCanonicalPath(),
                    Field.Store.YES, Field.Index.ANALYZED)); // changed in Lucene 3.0
            writer.addDocument(doc);
        } finally {
            // The reader is consumed while addDocument runs, so it can be closed
            // afterwards; closing here also covers the failure path.
            contents.close();
        }
    }
}

3.查询类：查询“玄德”！

Java代码

/** ?
?*?@author?ht ?
?*?查询 ?
?* ?
?*/??
public?class?Searcher?{ ??
???private?static?String?INDEX_DIR?=?"D:\\test\\index\\";//索引所在的路径 ??
???private?static?String?KEYWORD?=?"玄德";//关键词 ??
???private?static?int?TOP_NUM?=?100;//显示前100条结果 ??
???? ??
??public?static?void?main(String[]?args)?throws?Exception?{ ??
????File?indexDir?=?new?File(INDEX_DIR); ??
????if?(!indexDir.exists()?||?!indexDir.isDirectory())?{ ??
??????throw?new?Exception(indexDir?+ ??
????????"?does?not?exist?or?is?not?a?directory."); ??
????} ??
????search(indexDir,?KEYWORD);//调用search方法进行查询 ??
??} ??
/**查询 ?
?*?@param?indexDir ?
?*?@param?q ?
?*?@throws?Exception ?
?*/??
??public?static?void?search(File?indexDir,?String?q)?throws?Exception?{ ??
????IndexSearcher?is?=?new??IndexSearcher(FSDirectory.open(indexDir),true);//read-only ??
????String?field?=?"contents"; ??
???? ??
????QueryParser?parser?=?new?QueryParser(Version.LUCENE_CURRENT,?field,?new?StandardAnalyzer(Version.LUCENE_CURRENT));//有变化的地方 ??
????Query?query?=?parser.parse(q); ??
??
????TopScoreDocCollector?collector?=?TopScoreDocCollector.create(TOP_NUM?,?false);//有变化的地方 ??
???? ??
????long?start?=?new?Date().getTime();//?start?time ??
???? ??
????is.search(query,?collector); ??
????ScoreDoc[]?hits?=?collector.topDocs().scoreDocs; ??
??
????System.out.println(hits.length); ??
????for?(int?i?=?0;?i?<?hits.length;?i++)?{ ??
????????Document?doc?=?is.doc(hits[i].doc);//new?method?is.doc() ??
????????System.out.println(doc.getField("filename")+"???"+hits[i].toString()+"??"); ??
????} ??
????long?end?=?new?Date().getTime();//end?time ??
??
????System.out.println("Found?"?+?collector.getTotalHits()?+ ??
??????????????"?document(s)?(in?"?+?(end?-?start)?+ ??
??????????????"?milliseconds)?that?matched?query?'"?+ ??
????????????????q?+?"':"); ??
??} ??
}??

/**
 * Runs a keyword query against the Lucene 3.0 index built by the indexing
 * step and prints the matching files.
 *
 * @author ht
 */
public class Searcher {
    private static final String INDEX_DIR = "D:\\test\\index\\"; // directory containing the index
    private static final String KEYWORD = "玄德"; // keyword to search for
    private static final int TOP_NUM = 100; // show at most the top 100 results

    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD); // run the query
    }

    /**
     * Searches the "contents" field of the index for {@code q}. Prints the
     * number of returned hits, then one line per hit (stored file path and
     * hit details), then a summary with the total hit count and elapsed time.
     *
     * @param indexDir directory containing the index
     * @param q query string, parsed with the Lucene query parser
     * @throws Exception if the index cannot be opened or the query cannot be
     *     parsed or executed
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            String field = "contents";

            // Changed in Lucene 3.0: parser and analyzer both take a Version.
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                    new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(q);

            // Changed in Lucene 3.0: results are gathered through a collector.
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            long start = System.currentTimeMillis(); // start time

            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println(hits.length);
            for (ScoreDoc hit : hits) {
                Document doc = is.doc(hit.doc); // new method is.doc()
                // doc.get returns the stored path itself; getField(...) would print
                // the Field object's debug representation instead.
                System.out.println(doc.get("filename") + "   " + hit.toString() + "  ");
            }
            long end = System.currentTimeMillis(); // end time

            System.out.println("Found " + collector.getTotalHits() +
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");
        } finally {
            // Close the searcher on every path, including failures.
            is.close();
        }
    }
}

4.结果就不贴啦，反正能运行就是啦

lucene3.0入门实例源码.rar (3 KB)
下载次数: 146

1 楼 libingyang 2010-05-11

很好很强大