转自:http://cumtfirefly.iteye.com/blog/543664
lucene3.0已于2009-11-25发布啦,但网上的入门实例都是针对lucene3.0以前的,相对于以前的版本,貌似改动不小。
本人从头开始学习lucene,现在用的是《lucene in action中文版》,结合lucene3.0文档写了个入门实例,可供像我一样直接从lucene3.0开始学习的新手参考!
入门实例:
1.预处理:先把网上下载的一个《三国演义》电子书“三国演义.txt”(可用其他代替,呵呵)切割成多个小文件。
本人从头开始学习lucene,现在用的是《lucene in action中文版》,结合lucene3.0文档写了个入门实例,可供像我一样直接从lucene3.0开始学习的新手参考!
入门实例:
1.预处理:先把网上下载的一个《三国演义》电子书“三国演义.txt”(可用其他代替,呵呵)切割成多个小文件。
/**
 * Preprocessing step: cuts one large text file into several smaller files
 * so that each small file can later be indexed as its own document.
 *
 * @author ht
 */
public class FilePreprocess {

    /**
     * Flush threshold in bytes. A chunk is written as soon as the buffered
     * text reaches this size, so a chunk may exceed it by up to one line.
     */
    private static final int MAX_SIZE = 10240;

    /** Base name of every generated file: output0.txt, output1.txt, ... */
    private static final String OUTPUT_PREFIX = "output";

    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三国演义.txt"; // location of the original large file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large text file into numbered small files.
     *
     * <p>Lines are accumulated (each terminated with "\r\n") until the buffered
     * text reaches {@link #MAX_SIZE} bytes, measured in the platform default
     * charset, which is also the charset used for reading and writing. The
     * chunk is then written to {@code outputpath + "output" + n + ".txt"}.
     * Any remaining text is written as a final chunk; no empty trailing file
     * is created when the input ends exactly on a chunk boundary.
     *
     * <p>I/O errors are reported on the console and are not propagated.
     *
     * @param file       the large input file
     * @param outputpath output directory path; it is concatenated directly with
     *                   the file name, so it must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            int bufferedBytes = 0;
            int filePointer = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String withEol = line + "\r\n";
                buffer.append(withEol);
                // Track the size incrementally instead of re-encoding the
                // whole buffer after every line.
                bufferedBytes += withEol.getBytes().length;
                if (bufferedBytes >= MAX_SIZE) {
                    writeChunk(outputpath, filePointer, buffer.toString());
                    filePointer++;
                    buffer.setLength(0);
                    bufferedBytes = 0;
                }
            }
            // Write whatever is left over, but never an empty file.
            if (buffer.length() > 0) {
                writeChunk(outputpath, filePointer, buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the input file, even when splitting failed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Writes one chunk to its numbered output file and always closes the writer. */
    private static void writeChunk(String outputpath, int index, String content)
            throws IOException {
        BufferedWriter writer = new BufferedWriter(
                new FileWriter(outputpath + OUTPUT_PREFIX + index + ".txt"));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }
}
2.用lucene3.0生成索引类:用lucene3.0对生成的多个小文件进行索引,中文分词用的是lucene3.0自带的StandardAnalyzer.
- /** ?
- ?*?@author?ht ?
- ?*?索引生成 ?
- ?* ?
- ?*/??
- public?class?Indexer?{ ??
- ???private?static?String?INDEX_DIR?=?"D:\\test\\index";//索引存放目录 ??
- ???private?static?String?DATA_DIR?=?"D:\\test\\small\\";//小文件存放的目录 ??
- ???? ??
- ??public?static?void?main(String[]?args)?throws?Exception?{ ??
- ? ??
- ????long?start?=?new?Date().getTime(); ??
- ????int?numIndexed?=?index(new?File(INDEX_DIR),?new?File(DATA_DIR));//调用index方法 ??
- ????long?end?=?new?Date().getTime(); ??
- ????System.out.println("Indexing?"?+?numIndexed?+?"?files?took?"?+?(end?-?start)?+?"?milliseconds"); ??
- ??} ??
- ??
- ??/**索引dataDir下的.txt文件,并储存在indexDir下,返回索引的文件数量 ?
- ?*?@param?indexDir ?
- ?*?@param?dataDir ?
- ?*?@return?int? ?
- ?*?@throws?IOException ?
- ?*/??
- public?static?int?index(File?indexDir,?File?dataDir)?throws?IOException?{ ??
- ??
- ????if?(!dataDir.exists()?||?!dataDir.isDirectory())?{ ??
- ??????throw?new?IOException(dataDir?+?"?does?not?exist?or?is?not?a?directory"); ??
- ????} ??
- ??
- ????IndexWriter?writer?=?new?IndexWriter(FSDirectory.open(indexDir),?new?StandardAnalyzer(Version.LUCENE_CURRENT),?true,? ??
- IndexWriter.MaxFieldLength.LIMITED);//有变化的地方 ??
- ???? ??
- ????indexDirectory(writer,?dataDir); ??
- ????int?numIndexed?=?writer.numDocs(); ??
- ????writer.optimize(); ??
- ????writer.close(); ??
- ????return?numIndexed; ??
- ??} ??
- ??
- ??/**循环遍历目录下的所有.txt文件并进行索引 ?
- ?*?@param?writer ?
- ?*?@param?dir ?
- ?*?@throws?IOException ?
- ?*/??
- private?static?void?indexDirectory(IndexWriter?writer,?File?dir) ??
- ????throws?IOException?{ ??
- ??
- ????File[]?files?=?dir.listFiles(); ??
- ??
- ????for?(int?i?=?0;?i?<?files.length;?i++)?{ ??
- ??????File?f?=?files[i]; ??
- ??????if?(f.isDirectory())?{ ??
- ????????indexDirectory(writer,?f);??//?recurse ??
- ??????}?else?if?(f.getName().endsWith(".txt"))?{ ??
- ????????indexFile(writer,?f); ??
- ??????} ??
- ????} ??
- ??} ??
- ??
- ??/**对单个txt文件进行索引 ?
- ?*?@param?writer ?
- ?*?@param?f ?
- ?*?@throws?IOException ?
- ?*/??
- private?static?void?indexFile(IndexWriter?writer,?File?f) ??
- ????throws?IOException?{ ??
- ???? ??
- ????if?(f.isHidden()?||?!f.exists()?||?!f.canRead())?{ ??
- ??????return; ??
- ????} ??
- ??
- ????System.out.println("Indexing?"?+?f.getCanonicalPath()); ??
- ????Document?doc?=?new?Document(); ??
- ????doc.add(new?Field("contents",new?FileReader(f)));//有变化的地方 ??
- ????doc.add(new?Field("filename",f.getCanonicalPath(),Field.Store.YES,?Field.Index.ANALYZED));//有变化的地方 ??
- ? ??
- ????writer.addDocument(doc); ??
- ??} ??
- }??
/** * @author ht * 索引生成 * */ public class Indexer { private static String INDEX_DIR = "D:\\test\\index";//索引存放目录 private static String DATA_DIR = "D:\\test\\small\\";//小文件存放的目录 public static void main(String[] args) throws Exception { long start = new Date().getTime(); int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));//调用index方法 long end = new Date().getTime(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } /**索引dataDir下的.txt文件,并储存在indexDir下,返回索引的文件数量 * @param indexDir * @param dataDir * @return int * @throws IOException */ public static int index(File indexDir, File dataDir) throws IOException { if (!dataDir.exists() || !dataDir.isDirectory()) { throw new IOException(dataDir + " does not exist or is not a directory"); } IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);//有变化的地方 indexDirectory(writer, dataDir); int numIndexed = writer.numDocs(); writer.optimize(); writer.close(); return numIndexed; } /**循环遍历目录下的所有.txt文件并进行索引 * @param writer * @param dir * @throws IOException */ private static void indexDirectory(IndexWriter writer, File dir) throws IOException { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { indexDirectory(writer, f); // recurse } else if (f.getName().endsWith(".txt")) { indexFile(writer, f); } } } /**对单个txt文件进行索引 * @param writer * @param f * @throws IOException */ private static void indexFile(IndexWriter writer, File f) throws IOException { if (f.isHidden() || !f.exists() || !f.canRead()) { return; } System.out.println("Indexing " + f.getCanonicalPath()); Document doc = new Document(); doc.add(new Field("contents",new FileReader(f)));//有变化的地方 doc.add(new Field("filename",f.getCanonicalPath(),Field.Store.YES, Field.Index.ANALYZED));//有变化的地方 writer.addDocument(doc); } }
3.查询类:查询“玄德”!
- /** ?
- ?*?@author?ht ?
- ?*?查询 ?
- ?* ?
- ?*/??
- public?class?Searcher?{ ??
- ???private?static?String?INDEX_DIR?=?"D:\\test\\index\\";//索引所在的路径 ??
- ???private?static?String?KEYWORD?=?"玄德";//关键词 ??
- ???private?static?int?TOP_NUM?=?100;//显示前100条结果 ??
- ???? ??
- ??public?static?void?main(String[]?args)?throws?Exception?{ ??
- ????File?indexDir?=?new?File(INDEX_DIR); ??
- ????if?(!indexDir.exists()?||?!indexDir.isDirectory())?{ ??
- ??????throw?new?Exception(indexDir?+ ??
- ????????"?does?not?exist?or?is?not?a?directory."); ??
- ????} ??
- ????search(indexDir,?KEYWORD);//调用search方法进行查询 ??
- ??} ??
- /**查询 ?
- ?*?@param?indexDir ?
- ?*?@param?q ?
- ?*?@throws?Exception ?
- ?*/??
- ??public?static?void?search(File?indexDir,?String?q)?throws?Exception?{ ??
- ????IndexSearcher?is?=?new??IndexSearcher(FSDirectory.open(indexDir),true);//read-only ??
- ????String?field?=?"contents"; ??
- ???? ??
- ????QueryParser?parser?=?new?QueryParser(Version.LUCENE_CURRENT,?field,?new?StandardAnalyzer(Version.LUCENE_CURRENT));//有变化的地方 ??
- ????Query?query?=?parser.parse(q); ??
- ??
- ????TopScoreDocCollector?collector?=?TopScoreDocCollector.create(TOP_NUM?,?false);//有变化的地方 ??
- ???? ??
- ????long?start?=?new?Date().getTime();//?start?time ??
- ???? ??
- ????is.search(query,?collector); ??
- ????ScoreDoc[]?hits?=?collector.topDocs().scoreDocs; ??
- ??
- ????System.out.println(hits.length); ??
- ????for?(int?i?=?0;?i?<?hits.length;?i++)?{ ??
- ????????Document?doc?=?is.doc(hits[i].doc);//new?method?is.doc() ??
- ????????System.out.println(doc.getField("filename")+"???"+hits[i].toString()+"??"); ??
- ????} ??
- ????long?end?=?new?Date().getTime();//end?time ??
- ??
- ????System.out.println("Found?"?+?collector.getTotalHits()?+ ??
- ??????????????"?document(s)?(in?"?+?(end?-?start)?+ ??
- ??????????????"?milliseconds)?that?matched?query?'"?+ ??
- ????????????????q?+?"':"); ??
- ??} ??
- }??
/** * @author ht * 查询 * */ public class Searcher { private static String INDEX_DIR = "D:\\test\\index\\";//索引所在的路径 private static String KEYWORD = "玄德";//关键词 private static int TOP_NUM = 100;//显示前100条结果 public static void main(String[] args) throws Exception { File indexDir = new File(INDEX_DIR); if (!indexDir.exists() || !indexDir.isDirectory()) { throw new Exception(indexDir + " does not exist or is not a directory."); } search(indexDir, KEYWORD);//调用search方法进行查询 } /**查询 * @param indexDir * @param q * @throws Exception */ public static void search(File indexDir, String q) throws Exception { IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir),true);//read-only String field = "contents"; QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, new StandardAnalyzer(Version.LUCENE_CURRENT));//有变化的地方 Query query = parser.parse(q); TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM , false);//有变化的地方 long start = new Date().getTime();// start time is.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println(hits.length); for (int i = 0; i < hits.length; i++) { Document doc = is.doc(hits[i].doc);//new method is.doc() System.out.println(doc.getField("filename")+" "+hits[i].toString()+" "); } long end = new Date().getTime();//end time System.out.println("Found " + collector.getTotalHits() + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':"); } }
4.结果就不贴啦,反正能运行就是啦
1 楼
libingyang
2010-05-11
很好很强大