当前位置: 代码迷 >> Web前端 >> 【转】lucene3.0入门范例
  详细解决方案

【转】lucene3.0入门范例

热度:487   发布时间:2012-11-23 00:03:29.0
【转】lucene3.0入门实例
转自:http://cumtfirefly.iteye.com/blog/543664
lucene3.0已于2009-11-25发布啦,但网上的入门实例都是针对lucene3.0以前的,相对于以前的版本,貌似改动不小。
本人从头开始学习lucene,现在用的是《lucene in action中文版》,结合lucene3.0文档写了个入门实例,可供像我一样直接从lucene3.0开始学习的新手参考!


入门实例:

1.预处理:先把网上下载的一个《三国演义》电子书“三国演义.txt”(可用其他代替,呵呵)切割成多个小文件。

Java代码 复制代码
  1. /** ?
  2. ?*?@author?ht ?
  3. ?*?预处理 ?
  4. ?* ?
  5. ?*/??
  6. public?class?FilePreprocess?{ ??
  7. ???public?static?void?main(String[]?arg){ ??
  8. ????String?outputpath?=?"D:\\test\\small\\";//小文件存放路径 ??
  9. ????String?filename?=?"D:\\test\\三国演义.txt";//原文件存放路径 ??
  10. ????if(!new?File(outputpath).exists()){ ??
  11. ????????new?File(outputpath).mkdirs(); ??
  12. ????} ??
  13. ????splitToSmallFiles(new?File(filename),?outputpath); ??
  14. ???} ??
  15. /**大文件切割为小的 ?
  16. ?*?@param?file ?
  17. ?*?@param?outputpath ?
  18. ?*/??
  19. ???public?static?void?splitToSmallFiles(File?file?,String?outputpath){ ??
  20. ????????int?filePointer?=?0; ??
  21. ????int?MAX_SIZE?=?10240; ??
  22. ????String?filename?=?"output"; ??
  23. ??
  24. ????BufferedWriter?writer?=?null; ??
  25. ????try?{ ??
  26. ????????BufferedReader?reader?=?new?BufferedReader(new?FileReader(file)); ??
  27. ????????StringBuffer?buffer?=?new?StringBuffer(); ??
  28. ????????String?line?=?reader.readLine(); ??
  29. ????????while(line?!=?null){ ??
  30. ????????????buffer.append(line).append("\r\n"); ??
  31. ????????????if(buffer.toString().getBytes().length>=MAX_SIZE){ ??
  32. ????????????????writer?=?new?BufferedWriter(new??FileWriter(outputpath+filename+filePointer+".txt")); ??
  33. ????????????????writer.write(buffer.toString()); ??
  34. ????????????????writer.close(); ??
  35. ????????????????filePointer++; ??
  36. ????????????????buffer=new?StringBuffer(); ??
  37. ????????????} ??
  38. ????????????line?=?reader.readLine();??????????????? ??
  39. ????????} ??
  40. ????????writer?=?new?BufferedWriter(new?FileWriter(outputpath+filename+filePointer+".txt")); ??
  41. ????????writer.write(buffer.toString()); ??
  42. ????????writer.close(); ??
  43. ????????System.out.println("The?file?hava?splited?to?small?files?!"); ??
  44. ????}?catch?(FileNotFoundException?e)?{ ??
  45. ????????System.out.println("file?not?found?!"); ??
  46. ????e.printStackTrace(); ??
  47. ????}?catch?(IOException?e)?{ ??
  48. ????????e.printStackTrace(); ??
  49. ????}??????? ??
  50. }??
/**
 * Preprocessing step: cuts one large text file into several smaller files so
 * that each of them can later be indexed as a separate document.
 *
 * @author ht
 */
public class FilePreprocess {

    /** A chunk is flushed to disk once its encoded size reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    /** Base name of the generated files: output0.txt, output1.txt, ... */
    private static final String OUTPUT_NAME = "output";

    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三国演义.txt"; // the large source file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large text file into small files.
     *
     * <p>The input is read line by line; every line is re-terminated with
     * {@code "\r\n"}. As soon as the accumulated text reaches {@code MAX_SIZE}
     * bytes (measured in the platform default charset) it is written to
     * {@code outputpath + "output" + n + ".txt"} and a new chunk is started.
     * Whatever is left after the last line is always written to one final file,
     * even when that remainder is empty.
     *
     * <p>I/O failures are not propagated: they are reported on the console and
     * the method returns.
     *
     * @param file       the large file to split
     * @param outputpath prefix for the generated files; it is concatenated with
     *                   the file name as-is, so it must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            int filePointer = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(line).append("\r\n");
                String chunk = buffer.toString();
                if (chunk.getBytes().length >= MAX_SIZE) {
                    writeChunk(outputpath + OUTPUT_NAME + filePointer + ".txt", chunk);
                    filePointer++;
                    buffer = new StringBuilder();
                }
            }
            // Remainder after the last full chunk (possibly empty).
            writeChunk(outputpath + OUTPUT_NAME + filePointer + ".txt", buffer.toString());
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The reader used to be leaked; release it on every path.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Writes {@code content} to the file at {@code path}, closing the writer even if the write fails. */
    private static void writeChunk(String path, String content) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }
}


2.用lucene3.0生成索引类:用lucene3.0对生成的多个小文件进行索引,中文分词用的是lucene3.0自带的StandardAnalyzer.

Java代码 复制代码
  1. /** ?
  2. ?*?@author?ht ?
  3. ?*?索引生成 ?
  4. ?* ?
  5. ?*/??
  6. public?class?Indexer?{ ??
  7. ???private?static?String?INDEX_DIR?=?"D:\\test\\index";//索引存放目录 ??
  8. ???private?static?String?DATA_DIR?=?"D:\\test\\small\\";//小文件存放的目录 ??
  9. ???? ??
  10. ??public?static?void?main(String[]?args)?throws?Exception?{ ??
  11. ? ??
  12. ????long?start?=?new?Date().getTime(); ??
  13. ????int?numIndexed?=?index(new?File(INDEX_DIR),?new?File(DATA_DIR));//调用index方法 ??
  14. ????long?end?=?new?Date().getTime(); ??
  15. ????System.out.println("Indexing?"?+?numIndexed?+?"?files?took?"?+?(end?-?start)?+?"?milliseconds"); ??
  16. ??} ??
  17. ??
  18. ??/**索引dataDir下的.txt文件,并储存在indexDir下,返回索引的文件数量 ?
  19. ?*?@param?indexDir ?
  20. ?*?@param?dataDir ?
  21. ?*?@return?int? ?
  22. ?*?@throws?IOException ?
  23. ?*/??
  24. public?static?int?index(File?indexDir,?File?dataDir)?throws?IOException?{ ??
  25. ??
  26. ????if?(!dataDir.exists()?||?!dataDir.isDirectory())?{ ??
  27. ??????throw?new?IOException(dataDir?+?"?does?not?exist?or?is?not?a?directory"); ??
  28. ????} ??
  29. ??
  30. ????IndexWriter?writer?=?new?IndexWriter(FSDirectory.open(indexDir),?new?StandardAnalyzer(Version.LUCENE_CURRENT),?true,? ??
  31. IndexWriter.MaxFieldLength.LIMITED);//有变化的地方 ??
  32. ???? ??
  33. ????indexDirectory(writer,?dataDir); ??
  34. ????int?numIndexed?=?writer.numDocs(); ??
  35. ????writer.optimize(); ??
  36. ????writer.close(); ??
  37. ????return?numIndexed; ??
  38. ??} ??
  39. ??
  40. ??/**循环遍历目录下的所有.txt文件并进行索引 ?
  41. ?*?@param?writer ?
  42. ?*?@param?dir ?
  43. ?*?@throws?IOException ?
  44. ?*/??
  45. private?static?void?indexDirectory(IndexWriter?writer,?File?dir) ??
  46. ????throws?IOException?{ ??
  47. ??
  48. ????File[]?files?=?dir.listFiles(); ??
  49. ??
  50. ????for?(int?i?=?0;?i?<?files.length;?i++)?{ ??
  51. ??????File?f?=?files[i]; ??
  52. ??????if?(f.isDirectory())?{ ??
  53. ????????indexDirectory(writer,?f);??//?recurse ??
  54. ??????}?else?if?(f.getName().endsWith(".txt"))?{ ??
  55. ????????indexFile(writer,?f); ??
  56. ??????} ??
  57. ????} ??
  58. ??} ??
  59. ??
  60. ??/**对单个txt文件进行索引 ?
  61. ?*?@param?writer ?
  62. ?*?@param?f ?
  63. ?*?@throws?IOException ?
  64. ?*/??
  65. private?static?void?indexFile(IndexWriter?writer,?File?f) ??
  66. ????throws?IOException?{ ??
  67. ???? ??
  68. ????if?(f.isHidden()?||?!f.exists()?||?!f.canRead())?{ ??
  69. ??????return; ??
  70. ????} ??
  71. ??
  72. ????System.out.println("Indexing?"?+?f.getCanonicalPath()); ??
  73. ????Document?doc?=?new?Document(); ??
  74. ????doc.add(new?Field("contents",new?FileReader(f)));//有变化的地方 ??
  75. ????doc.add(new?Field("filename",f.getCanonicalPath(),Field.Store.YES,?Field.Index.ANALYZED));//有变化的地方 ??
  76. ? ??
  77. ????writer.addDocument(doc); ??
  78. ??} ??
  79. }??
/**
 * Index generation: builds a Lucene index over the {@code .txt} files found
 * under a data directory.
 *
 * @author ht
 */
public class Indexer {
    private static final String INDEX_DIR = "D:\\test\\index"; // directory where the index is stored
    private static final String DATA_DIR = "D:\\test\\small\\"; // directory holding the small files

    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Indexes the {@code .txt} files under {@code dataDir} (recursively) and
     * stores the index in {@code indexDir}.
     *
     * @param indexDir directory that receives the index
     * @param dataDir  directory containing the files to index
     * @return the number of documents reported by the writer after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a
     *                     directory, or if indexing fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }

        // Lucene 3.0 style constructor: (directory, analyzer, create, maxFieldLength).
        // NOTE(review): create == true presumably replaces any existing index in
        // indexDir - confirm against the IndexWriter API documentation.
        IndexWriter writer = new IndexWriter(
                FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT),
                true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        } finally {
            // Always close the writer, also when indexing fails part-way.
            writer.close();
        }
    }

    /**
     * Walks {@code dir} recursively and indexes every file whose name ends
     * with {@code .txt}.
     *
     * @param writer the index writer that receives the documents
     * @param dir    the directory to traverse
     * @throws IOException if indexing a file fails
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be listed;
            // skip it instead of failing with a NullPointerException.
            System.err.println("Skipping unreadable directory " + dir);
            return;
        }

        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Indexes a single text file as one document with two fields:
     * {@code contents} (fed from a reader over the file) and {@code filename}
     * (the canonical path, stored and analyzed). Hidden, missing or
     * unreadable files are skipped silently.
     *
     * @param writer the index writer that receives the document
     * @param f      the file to index
     * @throws IOException if reading the file or adding the document fails
     */
    private static void indexFile(IndexWriter writer, File f)
            throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }

        System.out.println("Indexing " + f.getCanonicalPath());
        FileReader contents = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("contents", contents));
            doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // The reader is only needed until addDocument() has returned;
            // close it explicitly so no file handle is leaked on failure.
            contents.close();
        }
    }
}


3.查询类:查询“玄德”!

Java代码 复制代码
  1. /** ?
  2. ?*?@author?ht ?
  3. ?*?查询 ?
  4. ?* ?
  5. ?*/??
  6. public?class?Searcher?{ ??
  7. ???private?static?String?INDEX_DIR?=?"D:\\test\\index\\";//索引所在的路径 ??
  8. ???private?static?String?KEYWORD?=?"玄德";//关键词 ??
  9. ???private?static?int?TOP_NUM?=?100;//显示前100条结果 ??
  10. ???? ??
  11. ??public?static?void?main(String[]?args)?throws?Exception?{ ??
  12. ????File?indexDir?=?new?File(INDEX_DIR); ??
  13. ????if?(!indexDir.exists()?||?!indexDir.isDirectory())?{ ??
  14. ??????throw?new?Exception(indexDir?+ ??
  15. ????????"?does?not?exist?or?is?not?a?directory."); ??
  16. ????} ??
  17. ????search(indexDir,?KEYWORD);//调用search方法进行查询 ??
  18. ??} ??
  19. /**查询 ?
  20. ?*?@param?indexDir ?
  21. ?*?@param?q ?
  22. ?*?@throws?Exception ?
  23. ?*/??
  24. ??public?static?void?search(File?indexDir,?String?q)?throws?Exception?{ ??
  25. ????IndexSearcher?is?=?new??IndexSearcher(FSDirectory.open(indexDir),true);//read-only ??
  26. ????String?field?=?"contents"; ??
  27. ???? ??
  28. ????QueryParser?parser?=?new?QueryParser(Version.LUCENE_CURRENT,?field,?new?StandardAnalyzer(Version.LUCENE_CURRENT));//有变化的地方 ??
  29. ????Query?query?=?parser.parse(q); ??
  30. ??
  31. ????TopScoreDocCollector?collector?=?TopScoreDocCollector.create(TOP_NUM?,?false);//有变化的地方 ??
  32. ???? ??
  33. ????long?start?=?new?Date().getTime();//?start?time ??
  34. ???? ??
  35. ????is.search(query,?collector); ??
  36. ????ScoreDoc[]?hits?=?collector.topDocs().scoreDocs; ??
  37. ??
  38. ????System.out.println(hits.length); ??
  39. ????for?(int?i?=?0;?i?<?hits.length;?i++)?{ ??
  40. ????????Document?doc?=?is.doc(hits[i].doc);//new?method?is.doc() ??
  41. ????????System.out.println(doc.getField("filename")+"???"+hits[i].toString()+"??"); ??
  42. ????} ??
  43. ????long?end?=?new?Date().getTime();//end?time ??
  44. ??
  45. ????System.out.println("Found?"?+?collector.getTotalHits()?+ ??
  46. ??????????????"?document(s)?(in?"?+?(end?-?start)?+ ??
  47. ??????????????"?milliseconds)?that?matched?query?'"?+ ??
  48. ????????????????q?+?"':"); ??
  49. ??} ??
  50. }??
/**
 * Query example: searches the index for a fixed keyword and prints the hits.
 *
 * @author ht
 */
public class Searcher {
    private static final String INDEX_DIR = "D:\\test\\index\\"; // location of the index
    private static final String KEYWORD = "玄德"; // keyword to search for
    private static final int TOP_NUM = 100; // show at most the first 100 results

    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD);
    }

    /**
     * Parses {@code q} against the {@code contents} field, runs the query
     * and prints the number of hits, one line per hit (the {@code filename}
     * field plus the hit itself) and a summary with the total hit count and the
     * elapsed time.
     *
     * @param indexDir directory containing the index
     * @param q        the query string
     * @throws Exception if the index cannot be opened or read, or the query
     *                   cannot be parsed
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            String field = "contents";

            QueryParser parser = new QueryParser(
                    Version.LUCENE_CURRENT, field, new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(q);

            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            long start = System.currentTimeMillis();

            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println(hits.length);
            for (ScoreDoc hit : hits) {
                Document doc = is.doc(hit.doc); // load the stored document for this hit
                System.out.println(doc.getField("filename") + "   " + hit.toString() + "  ");
            }
            // The measured time covers the search and the printing of the hits.
            long end = System.currentTimeMillis();

            System.out.println("Found " + collector.getTotalHits() +
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");
        } finally {
            // The searcher used to be leaked; release it on every path.
            is.close();
        }
    }
}


4.结果就不贴啦,反正能运行就是啦
  • lucene3.0入门实例源码.rar (3 KB)
  • 下载次数: 146
1 楼 libingyang 2010-05-11  
很好很强大