Lucene全文检索引擎
Posted Jansens
tags:
篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了 Lucene 全文检索引擎相关的知识，希望对你有一定的参考价值。
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Coordinates of this demo project. -->
  <groupId>demo.lucene</groupId>
  <artifactId>Lucene01</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <build/>

  <dependencies>
    <!-- Lucene core library -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>5.3.1</version>
    </dependency>

    <!-- Lucene query parser -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>5.3.1</version>
    </dependency>

    <!-- Lucene common analyzers -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>5.3.1</version>
    </dependency>
  </dependencies>
</project>
import java.io.File; import java.io.FileReader; import java.nio.file.Paths; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; /** * 建立索引的类 * @author Ni Shengwu * */ public class Indexer { private IndexWriter writer; //写索引实例 //构造方法,实例化IndexWriter public Indexer(String indexDir) throws Exception { Directory dir = FSDirectory.open(Paths.get(indexDir)); Analyzer analyzer = new StandardAnalyzer(); //标准分词器,会自动去掉空格啊,is a the等单词 IndexWriterConfig config = new IndexWriterConfig(analyzer); //将标准分词器配到写索引的配置中 writer = new IndexWriter(dir, config); //实例化写索引对象 } //关闭写索引 public void close() throws Exception { writer.close(); } //索引指定目录下的所有文件 public int indexAll(String dataDir) throws Exception { File[] files = new File(dataDir).listFiles(); //获取该路径下的所有文件 for(File file : files) { indexFile(file); //调用下面的indexFile方法,对每个文件进行索引 } return writer.numDocs(); //返回索引的文件数 } //索引指定的文件 private void indexFile(File file) throws Exception { System.out.println("索引文件的路径:" + file.getCanonicalPath()); Document doc = getDocument(file); //获取该文件的document writer.addDocument(doc); //调用下面的getDocument方法,将doc添加到索引中 } //获取文档,文档里再设置每个字段,就类似于数据库中的一行记录 private Document getDocument(File file) throws Exception{ Document doc = new Document(); //添加字段 doc.add(new TextField("contents", new FileReader(file))); //添加内容 doc.add(new TextField("fileName", file.getName(), Field.Store.YES)); //添加文件名,并把这个字段存到索引文件里 doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES)); //添加文件路径 return doc; } public static void main(String[] args) { String indexDir = "D:\\lucene"; //将索引保存到的路径 String dataDir = "D:\\lucene\\data"; //需要索引的文件数据存放的目录 Indexer indexer = 
null; int indexedNum = 0; long startTime = System.currentTimeMillis(); //记录索引开始时间 try { indexer = new Indexer(indexDir); indexedNum = indexer.indexAll(dataDir); } catch (Exception e) { e.printStackTrace(); } finally { try { indexer.close(); } catch (Exception e) { e.printStackTrace(); } } long endTime = System.currentTimeMillis(); //记录索引结束时间 System.out.println("索引耗时" + (endTime-startTime) + "毫秒"); System.out.println("共索引了" + indexedNum + "个文件"); } }
import java.nio.file.Paths; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class Searcher { public static void search(String indexDir, String q) throws Exception { Directory dir = FSDirectory.open(Paths.get(indexDir)); //获取要查询的路径,也就是索引所在的位置 IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); //标准分词器,会自动去掉空格啊,is a the等单词 QueryParser parser = new QueryParser("contents", analyzer); //查询解析器 Query query = parser.parse(q); //通过解析要查询的String,获取查询对象 long startTime = System.currentTimeMillis(); //记录索引开始时间 TopDocs docs = searcher.search(query, 10);//开始查询,查询前10条数据,将记录保存在docs中 long endTime = System.currentTimeMillis(); //记录索引结束时间 System.out.println("匹配" + q + "共耗时" + (endTime-startTime) + "毫秒"); System.out.println("查询到" + docs.totalHits + "条记录"); for(ScoreDoc scoreDoc : docs.scoreDocs) { //取出每条查询结果 Document doc = searcher.doc(scoreDoc.doc); //scoreDoc.doc相当于docID,根据这个docID来获取文档 System.out.println(doc.get("fullPath")); //fullPath是刚刚建立索引的时候我们定义的一个字段 } reader.close(); } public static void main(String[] args) { String indexDir = "D:\\lucene"; String q = "generate-maven-artifacts"; //查询这个字符串 try { search(indexDir, q); } catch (Exception e) { e.printStackTrace(); } } }
pom.xml
以上是关于 Lucene 全文检索引擎的主要内容，如果未能解决你的问题，请参考以下文章：