Lucene全文检索
Posted guanghe
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Lucene全文检索相关的知识,希望对你有一定的参考价值。
POM.xml
1 <!-- Lucene full-text search: core, query parser, common analyzers and highlighter -->
2 <dependency>
3 <groupId>org.apache.lucene</groupId>
4 <artifactId>lucene-core</artifactId>
5 <version>${lucene.version}</version>
6 </dependency>
7 <dependency>
8 <groupId>org.apache.lucene</groupId>
9 <artifactId>lucene-queryparser</artifactId>
10 <version>${lucene.version}</version>
11 </dependency>
12 <dependency>
13 <groupId>org.apache.lucene</groupId>
14 <artifactId>lucene-analyzers-common</artifactId>
15 <version>${lucene.version}</version>
16 </dependency>
17 <dependency>
18 <groupId>org.apache.lucene</groupId>
19 <artifactId>lucene-highlighter</artifactId>
20 <version>${lucene.version}</version>
21 </dependency>
22 <!-- Chinese word-segmentation analyzer (provides SmartChineseAnalyzer) -->
23 <dependency>
24 <groupId>org.apache.lucene</groupId>
25 <artifactId>lucene-analyzers-smartcn</artifactId>
26 <version>${lucene.version}</version>
27 </dependency>
LuceneUtil.java
1 package io.guangsoft.erp.util;
2
3 import org.apache.lucene.analysis.Analyzer;
4 import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
5 import org.apache.lucene.document.Document;
6 import org.apache.lucene.index.*;
7 import org.apache.lucene.search.IndexSearcher;
8 import org.apache.lucene.search.Query;
9 import org.apache.lucene.search.ScoreDoc;
10 import org.apache.lucene.search.TopDocs;
11 import org.apache.lucene.search.highlight.*;
12 import org.apache.lucene.store.Directory;
13 import org.apache.lucene.store.FSDirectory;
14
15 import java.nio.file.Paths;
16 import java.util.List;
17
18 public class LuceneUtil {
19 //索引目录位置
20 private static final String INDEX_DIR = "/index";
21 //索引文件存放目录对象
22 private static Directory directory;
23 //分词器对象
24 private static Analyzer analyzer;
25 //索引写对象,线程安全
26 private static IndexWriter indexWriter;
27 //索引读对象,线程安全
28 private static IndexReader indexReader;
29 //索引搜索对象,线程安全
30 private static IndexSearcher indexSearcher;
31
32 static {
33 try {
34 directory = FSDirectory.open(Paths.get(INDEX_DIR));
35 //系统关闭前关闭索引库的流
36 Runtime.getRuntime().addShutdownHook(new Thread() {
37 @Override
38 public void run() {
39 try {
40 if(indexWriter != null) {
41 indexWriter.close();
42 }
43 if(indexReader != null) {
44 indexReader.close();
45 }
46 if(directory != null) {
47 directory.close();
48 }
49 } catch (Exception e) {
50 e.printStackTrace();
51 }
52 }
53 });
54 } catch (Exception e) {
55 e.printStackTrace();
56 }
57 }
58
59 //获取分词器
60 public static Analyzer getAnalyzer() {
61 if(analyzer == null) {
62 analyzer = new SmartChineseAnalyzer();
63 }
64 return analyzer;
65 }
66
67 //获取索引Writer
68 public static IndexWriter getIndexWriter() {
69 if(indexWriter == null || !indexWriter.isOpen()) {
70 try {
71 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(getAnalyzer());
72 indexWriter = new IndexWriter(directory, indexWriterConfig);
73 } catch (Exception e) {
74 e.printStackTrace();
75 }
76 }
77 return indexWriter;
78 }
79
80 //获取索引Reader
81 public static IndexReader getIndexReader() {
82 try {
83 if(indexReader == null) {
84 indexReader = DirectoryReader.open(directory);
85 } else {
86 //对比索引库是否更新,更新则使用更新后的Reader
87 IndexReader newIndexReader = DirectoryReader.openIfChanged((DirectoryReader) indexReader);
88 if(newIndexReader != null) {
89 indexReader.close();
90 indexReader = newIndexReader;
91 }
92 }
93 } catch (Exception e) {
94 e.printStackTrace();
95 }
96 return indexReader;
97 }
98
99 //获取索引Searcher
100 public static IndexSearcher getIndexSearcher() {
101 indexSearcher = new IndexSearcher(getIndexReader());
102 return indexSearcher;
103 }
104
105 //打印索引文档(表)
106 public static void printDocument(Document document) {
107 System.out.println(document);
108 List<IndexableField> fieldList = document.getFields();
109 for(IndexableField field : fieldList) {
110 System.out.println(field.name() + " : " + field.stringValue());
111 }
112 }
113
114 //打印命中文档
115 public static void printScoreDoc(ScoreDoc scoreDoc) {
116 int docId = scoreDoc.doc;
117 System.out.println("文档编号:" + docId);
118 System.out.println("文档得分:" + scoreDoc.score);
119 try {
120 Document document = indexSearcher.doc(docId);
121 printDocument(document);
122 } catch (Exception e) {
123 e.printStackTrace();
124 }
125 }
126
127 //打印带得分的命中文档
128 public static void printTopDocs(TopDocs topDocs) {
129 int totalHits = topDocs.totalHits;
130 System.out.println("命中文档总条数:" + totalHits);
131 System.out.println("命中文档最大分数:" + topDocs.getMaxScore());
132 ScoreDoc[] scoreDocs = topDocs.scoreDocs;
133 for(ScoreDoc scoreDoc : scoreDocs) {
134 printScoreDoc(scoreDoc);
135 }
136 }
137
138 //高亮打印命中文档
139 public static void printTopDocsHighlight(TopDocs topDocs, Query query) {
140 // 格式化器:参数1:前置标签,参数2:后置标签
141 Formatter formatter = new SimplehtmlFormatter("<em>", "</em>");
142 //打分对象,参数:query里面的条件,条件里面有搜索关键词
143 Scorer scorer = new QueryScorer(query);
144 //高亮工具:参数1.需要高亮什么颜色, 参数2.将哪些关键词进行高亮
145 Highlighter hightlighter = new Highlighter(formatter, scorer);
146 try {
147 for(ScoreDoc scoreDoc : topDocs.scoreDocs) {
148 Document document = getIndexSearcher().doc(scoreDoc.doc);
149 List<IndexableField> fieldList = document.getFields();
150 for(IndexableField field : fieldList) {
151 String highlightValue = hightlighter.getBestFragment(getAnalyzer(), field.name(), field.stringValue());
152 if(highlightValue == null) {
153 highlightValue = field.stringValue();
154 }
155 System.out.println(field.name() + " : " + highlightValue);
156 }
157 }
158 } catch (Exception e) {
159 e.printStackTrace();
160 }
161 }
162
163 }
LuceneDAO.java
1 package io.guangsoft.erp.dao;
2
3 import org.apache.lucene.search.TopDocs;
4
5 import java.util.Map;
6
7 public interface LuceneDAO {
8
9 public void insertDoc(Map<String, String> docMap) throws Exception;
10
11 public void deleteDoc(String id) throws Exception;
12
13 public void updateDoc(Map<String, String> docMap) throws Exception;
14
15 public void insertOrUpdateDoc(Map<String, String> docMap) throws Exception;
16
17 //严格匹配整个字段,可传多个字段
18 public TopDocs searchDocsByTerm(Map<String, String> termMap) throws Exception;
19
20 //匹配分词后的字段,可传多个字段
21 public TopDocs searchDocsByParser(Map<String, String> parserMap) throws Exception;
22
23 }
LuceneDAOImpl.java
1 package io.guangsoft.erp.dao.impl;
2
3 import io.guangsoft.erp.dao.LuceneDAO;
4 import io.guangsoft.erp.util.LuceneUtil;
5 import org.apache.lucene.document.Document;
6 import org.apache.lucene.document.Field;
7 import org.apache.lucene.document.FieldType;
8 import org.apache.lucene.index.IndexOptions;
9 import org.apache.lucene.index.IndexWriter;
10 import org.apache.lucene.index.Term;
11 import org.apache.lucene.queryparser.classic.QueryParser;
12 import org.apache.lucene.queryparser.classic.QueryParserBase;
13 import org.apache.lucene.search.*;
14
15 import java.util.Map;
16
17 public class LuceneDAOImpl implements LuceneDAO {
18
19 @Override
20 public void insertDoc(Map<String, String> docMap) throws Exception {
21 FieldType fieldType = new FieldType();
22 //是否存储记录
23 fieldType.setStored(true);
24 //文档型索引,只索引文档,不支持打分和位置检索
25 fieldType.setIndexOptions(IndexOptions.DOCS);
26 //是否要忽略field的加权基准值,如果为true可以节省内存消耗
27 //但在打分质量方面会有更高的消耗,也不能使用index-time进行加权操作。
28 fieldType.setOmitNorms(true);
29 //是否使用分析器将域值分解成独立的语汇单元流,是否分词
30 fieldType.setTokenized(true);
31 //lucene索引库的一条记录
32 Document document = new Document();
33 for(Map.Entry<String, String> entry : docMap.entrySet()) {
34 Field field = new Field(entry.getKey(), entry.getValue(), fieldType);
35 document.add(field);
36 }
37 //保存到索引库
38 IndexWriter indexWriter = LuceneUtil.getIndexWriter();
39 indexWriter.addDocument(document);
40 indexWriter.close();
41 }
42
43 @Override
44 public void deleteDoc(String id) throws Exception {
45 IndexWriter indexWriter = LuceneUtil.getIndexWriter();
46 Term term = new Term("id", id);
47 indexWriter.deleteDocuments(term);
48 indexWriter.forceMergeDeletes();
49 indexWriter.commit();
50 indexWriter.close();
51 }
52
53 @Override
54 public void updateDoc(Map<String, String> docMap) throws Exception {
55 FieldType fieldType = new FieldType();
56 fieldType.setStored(true);
57 fieldType.setIndexOptions(IndexOptions.DOCS);
58 fieldType.setOmitNorms(true);
59 fieldType.setTokenized(true);
60 Document document = new Document();
61 for(Map.Entry<String, String> entry : docMap.entrySet()) {
62 Field field = new Field(entry.getKey(), entry.getValue(), fieldType);
63 document.add(field);
64 }
65 Term term = new Term("id", docMap.get("id"));
66 IndexWriter indexWriter = LuceneUtil.getIndexWriter();
67 indexWriter.updateDocument(term, document);
68 indexWriter.close();
69 }
70
71 @Override
72 public void insertOrUpdateDoc(Map<String, String> docMap) throws Exception {
73 Term term = new Term("id", docMap.get("id"));
74 TermQuery termQuery = new TermQuery(term);
75 TopDocs topDocs = LuceneUtil.getIndexSearcher().search(termQuery, 1);
76 if(topDocs.totalHits == 0) {
77 insertDoc(docMap);
78 } else {
79 updateDoc(docMap);
80 }
81 }
82
83 @Override
84 public TopDocs searchDocsByTerm(Map<String, String> termMap) throws Exception {
85 BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
86 for(Map.Entry<String, String> termEntry : termMap.entrySet()) {
87 Term term = new Term(termEntry.getKey(), termEntry.getValue());
88 TermQuery termQuery = new TermQuery(term);
89 booleanQueryBuilder.add(termQuery, BooleanClause.Occur.MUST);
90 }
91 BooleanQuery booleanQuery = booleanQueryBuilder.build();
92 //是否开启特定字段排序
93 boolean orderFlag = false;
94 TopDocs topDocs = null;
95 if(orderFlag) {
96 Sort sort = new Sort(new SortField[]{new SortField("createTime", SortField.Type.LONG, true)});
97 topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999, sort);
98 } else {
99 topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999);
100 }
101 return topDocs;
102 }
103
104 @Override
105 public TopDocs searchDocsByParser(Map<String, String> parserMap) throws Exception {
106 BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
107 for(Map.Entry<String, String> parserEntry : parserMap.entrySet()) {
108 QueryParser queryParser = new QueryParser(parserEntry.getKey(), LuceneUtil.getAnalyzer());
109 queryParser.setDefaultOperator(QueryParserBase.AND_OPERATOR);
110 Query query = queryParser.parse(parserEntry.getValue());
111 booleanQueryBuilder.add(query, BooleanClause.Occur.MUST);
112 }
113 BooleanQuery booleanQuery = booleanQueryBuilder.build();
114 //是否开启特定字段排序
115 boolean orderFlag = false;
116 TopDocs topDocs = null;
117 if(orderFlag) {
118 Sort sort = new Sort(new SortField[]{new SortField("createTime", SortField.Type.LONG, true)});
119 topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999, sort);
120 } else {
121 topDocs = LuceneUtil.getIndexSearcher().search(booleanQuery, 99999999);
122 }
123 return topDocs;
124 }
125
126 }
LuceneTest.java
1 package io.guangsoft.erp;
2
3 import com.alibaba.fastjson.JSONArray;
4 import com.alibaba.fastjson.JSONObject;
5 import io.guangsoft.erp.dao.LuceneDAO;
6 import io.guangsoft.erp.dao.impl.LuceneDAOImpl;
7 import io.guangsoft.erp.util.LuceneUtil;
8 import org.apache.lucene.index.Term;
9 import org.apache.lucene.search.TermQuery;
10 import org.apache.lucene.search.TopDocs;
11 import org.junit.Test;
12
13 import java.util.HashMap;
14 import java.util.Map;
15 import java.util.stream.Collectors;
16
17 public class LuceneTest {
18
19 LuceneDAO luceneDAO = new LuceneDAOImpl();
20
21 @Test
22 public void testInsertDoc() throws Exception {
23 JSONArray jsonArray = JSONArray.parseArray(
24 "[{id:‘1‘,name:‘李白‘,desc:‘朝辞白帝彩云间‘}, " +
25 "{id:‘2‘,name:‘杜甫‘,desc:‘润物细无声‘}, " +
26 "{id:‘3‘,name:‘苏轼‘,desc:‘大江东去浪淘尽‘}]");
27 for(int i = 0; i < jsonArray.size(); i++) {
28 JSONObject jsonObject = jsonArray.getJSONObject(i);
29 Map<String, String> docMap = jsonObject.entrySet().stream().collect(Collectors.toMap(
30 Map.Entry :: getKey, entry -> entry.getValue().toString()
31 ));
32 luceneDAO.insertDoc(docMap);
33 }
34 }
35
36 @Test
37 public void testSearchDocsByTerm() throws Exception {
38 Map<String, String> docMap = new HashMap<String, String>();
39 docMap.put("name", "李白");
40 TopDocs topDocs = luceneDAO.searchDocsByTerm(docMap);
41 LuceneUtil.printTopDocs(topDocs);
42 }
43
44 @Test
45 public void testSearchDocsByParser() throws Exception {
46 Map<String, String> docMap = new HashMap<String, String>();
47 docMap.put("name", "李白");
48 TopDocs topDocs = luceneDAO.searchDocsByParser(docMap);
49 LuceneUtil.printTopDocsHighlight(topDocs, new TermQuery(new Term("name", "李白")));
50 }
51
52 @Test
53 public void testUpdateDoc() throws Exception {
54 Map<String, String> docMap = new HashMap<String, String>();
55 docMap.put("name", "李白");
56 LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
57 docMap.put("id", "1");
58 docMap.put("desc", "人生得意须尽欢");
59 luceneDAO.updateDoc(docMap);
60 docMap.remove("id");
61 docMap.remove("desc");
62 LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
63 }
64
65 @Test
66 public void testDeleteDoc() throws Exception{
67 Map<String, String> docMap = new HashMap<String, String>();
68 docMap.put("id", "1");
69 LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
70 luceneDAO.deleteDoc("1");
71 LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
72 }
73 }
以上是关于Lucene全文检索的主要内容,如果未能解决你的问题,请参考以下文章: