Lucene全文检索

Posted guanghe

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Lucene全文检索相关的知识,希望对你有一定的参考价值。

pom.xml

 1 <!-- Lucene full-text search -->
 2         <dependency>
 3             <groupId>org.apache.lucene</groupId>
 4             <artifactId>lucene-core</artifactId>
 5             <version>${lucene.version}</version>
 6         </dependency>
 7         <dependency>
 8             <groupId>org.apache.lucene</groupId>
 9             <artifactId>lucene-queryparser</artifactId>
10             <version>${lucene.version}</version>
11         </dependency>
12         <dependency>
13             <groupId>org.apache.lucene</groupId>
14             <artifactId>lucene-analyzers-common</artifactId>
15             <version>${lucene.version}</version>
16         </dependency>
17         <dependency>
18             <groupId>org.apache.lucene</groupId>
19             <artifactId>lucene-highlighter</artifactId>
20             <version>${lucene.version}</version>
21         </dependency>
22         <!-- Chinese word-segmentation analyzer -->
23         <dependency>
24             <groupId>org.apache.lucene</groupId>
25             <artifactId>lucene-analyzers-smartcn</artifactId>
26             <version>${lucene.version}</version>
27         </dependency>

LuceneUtil.java

  1 package io.guangsoft.erp.util;
  2 
  3 import org.apache.lucene.analysis.Analyzer;
  4 import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
  5 import org.apache.lucene.document.Document;
  6 import org.apache.lucene.index.*;
  7 import org.apache.lucene.search.IndexSearcher;
  8 import org.apache.lucene.search.Query;
  9 import org.apache.lucene.search.ScoreDoc;
 10 import org.apache.lucene.search.TopDocs;
 11 import org.apache.lucene.search.highlight.*;
 12 import org.apache.lucene.store.Directory;
 13 import org.apache.lucene.store.FSDirectory;
 14 
 15 import java.nio.file.Paths;
 16 import java.util.List;
 17 
 18 /**
 19  * Static holder for the Lucene objects shared by the application: the index
 20  * directory, the analyzer and the lazily created writer, reader and searcher.
 21  *
 22  * <p>The lazy getters are not synchronized.
 23  */
 24 public class LuceneUtil {
 25     // Location of the index directory.
 26     private static final String INDEX_DIR = "/index";
 27     // Directory object for the index files; opened in the static initializer.
 28     private static Directory directory;
 29     // Analyzer shared by indexing and query parsing; created on first use.
 30     private static Analyzer analyzer;
 31     // Shared index writer; reopened by getIndexWriter() when missing or closed.
 32     private static IndexWriter indexWriter;
 33     // Shared index reader; refreshed by getIndexReader() when the index changed.
 34     private static IndexReader indexReader;
 35     // Searcher most recently returned by getIndexSearcher().
 36     private static IndexSearcher indexSearcher;
 37 
 38     static {
 39         try {
 40             directory = FSDirectory.open(Paths.get(INDEX_DIR));
 41             // Close the index resources before the JVM shuts down.
 42             Runtime.getRuntime().addShutdownHook(new Thread() {
 43                 @Override
 44                 public void run() {
 45                     // Each resource is closed independently so that one
 46                     // failure cannot leave the others open.
 47                     closeQuietly(indexWriter);
 48                     closeQuietly(indexReader);
 49                     closeQuietly(directory);
 50                 }
 51             });
 52         } catch (Exception e) {
 53             e.printStackTrace();
 54         }
 55     }
 56 
 57     // Closes the resource if it is non-null; a failure is printed, not rethrown.
 58     private static void closeQuietly(AutoCloseable resource) {
 59         if (resource == null) {
 60             return;
 61         }
 62         try {
 63             resource.close();
 64         } catch (Exception e) {
 65             e.printStackTrace();
 66         }
 67     }
 68 
 69     /**
 70      * Returns the shared analyzer, creating a {@link SmartChineseAnalyzer} on
 71      * first use.
 72      */
 73     public static Analyzer getAnalyzer() {
 74         if (analyzer == null) {
 75             analyzer = new SmartChineseAnalyzer();
 76         }
 77         return analyzer;
 78     }
 79 
 80     /**
 81      * Returns the shared writer, opening a new one when none exists yet or the
 82      * previous one has been closed.
 83      *
 84      * @throws IllegalStateException if the writer cannot be opened
 85      */
 86     public static IndexWriter getIndexWriter() {
 87         if (indexWriter == null || !indexWriter.isOpen()) {
 88             try {
 89                 IndexWriterConfig config = new IndexWriterConfig(getAnalyzer());
 90                 indexWriter = new IndexWriter(directory, config);
 91             } catch (Exception e) {
 92                 // Fail with the real cause instead of handing the caller a
 93                 // null or already closed writer.
 94                 throw new IllegalStateException("Cannot open IndexWriter on " + INDEX_DIR, e);
 95             }
 96         }
 97         return indexWriter;
 98     }
 99 
100     /**
101      * Returns the shared reader, opening it on first use and replacing it with
102      * a fresh one when the index has changed since it was opened.
103      *
104      * @throws IllegalStateException if no reader could be opened at all
105      */
106     public static IndexReader getIndexReader() {
107         try {
108             if (indexReader == null) {
109                 indexReader = DirectoryReader.open(directory);
110             } else {
111                 // openIfChanged returns null when the current reader is still up to date.
112                 DirectoryReader refreshed = DirectoryReader.openIfChanged((DirectoryReader) indexReader);
113                 if (refreshed != null) {
114                     // Publish the new reader first so a failing close()
115                     // cannot leave the stale reader in place.
116                     IndexReader stale = indexReader;
117                     indexReader = refreshed;
118                     stale.close();
119                 }
120             }
121         } catch (Exception e) {
122             if (indexReader == null) {
123                 throw new IllegalStateException("Cannot open IndexReader on " + INDEX_DIR, e);
124             }
125             // A reader is still available; report the problem and keep using it.
126             e.printStackTrace();
127         }
128         return indexReader;
129     }
130 
131     /**
132      * Returns a searcher over the current reader. The previous searcher is
133      * reused as long as the reader has not been replaced.
134      */
135     public static IndexSearcher getIndexSearcher() {
136         IndexReader reader = getIndexReader();
137         if (indexSearcher == null || indexSearcher.getIndexReader() != reader) {
138             indexSearcher = new IndexSearcher(reader);
139         }
140         return indexSearcher;
141     }
142 
143     /**
144      * Prints the document followed by one "name : value" line per field.
145      */
146     public static void printDocument(Document document) {
147         System.out.println(document);
148         for (IndexableField field : document.getFields()) {
149             System.out.println(field.name() + " : " + field.stringValue());
150         }
151     }
152 
153     /**
154      * Prints the id and score of a hit, then the document it refers to.
155      */
156     public static void printScoreDoc(ScoreDoc scoreDoc) {
157         int docId = scoreDoc.doc;
158         System.out.println("文档编号:" + docId);
159         System.out.println("文档得分:" + scoreDoc.score);
160         try {
161             // Use the searcher handed out last; create one only when none
162             // exists yet instead of dereferencing a null field.
163             IndexSearcher searcher = indexSearcher != null ? indexSearcher : getIndexSearcher();
164             printDocument(searcher.doc(docId));
165         } catch (Exception e) {
166             e.printStackTrace();
167         }
168     }
169 
170     /**
171      * Prints the total hit count, the maximum score and every hit.
172      */
173     public static void printTopDocs(TopDocs topDocs) {
174         int totalHits = topDocs.totalHits;
175         System.out.println("命中文档总条数:" + totalHits);
176         System.out.println("命中文档最大分数:" + topDocs.getMaxScore());
177         for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
178             printScoreDoc(scoreDoc);
179         }
180     }
181 
182     /**
183      * Prints every field of every hit document, wrapping the terms matched by
184      * {@code query} in {@code <em>} tags.
185      */
186     public static void printTopDocsHighlight(TopDocs topDocs, Query query) {
187         // Formatter arguments: the tag placed before and after each match.
188         Formatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
189         // The scorer takes the terms to highlight from the query.
190         Scorer scorer = new QueryScorer(query);
191         Highlighter highlighter = new Highlighter(formatter, scorer);
192         try {
193             // Resolve the searcher and analyzer once rather than per hit or field.
194             IndexSearcher searcher = getIndexSearcher();
195             Analyzer fieldAnalyzer = getAnalyzer();
196             for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
197                 Document document = searcher.doc(scoreDoc.doc);
198                 for (IndexableField field : document.getFields()) {
199                     String text = field.stringValue();
200                     // Fields without a string value cannot be analyzed; print them as-is.
201                     String highlightValue = text == null
202                             ? null
203                             : highlighter.getBestFragment(fieldAnalyzer, field.name(), text);
204                     if (highlightValue == null) {
205                         highlightValue = text;
206                     }
207                     System.out.println(field.name() + " : " + highlightValue);
208                 }
209             }
210         } catch (Exception e) {
211             e.printStackTrace();
212         }
213     }
214 
215 }

LuceneDAO.java

 1 package io.guangsoft.erp.dao;
 2 
 3 import org.apache.lucene.search.TopDocs;
 4 
 5 import java.util.Map;
 6 
 7 /**
 8  * Data-access operations on the Lucene index. Documents are passed as maps
 9  * from field name to field value.
10  */
11 public interface LuceneDAO {
12 
13     /** Adds the given fields to the index as a document. */
14     void insertDoc(Map<String, String> docMap) throws Exception;
15 
16     /** Deletes the document identified by {@code id}. */
17     void deleteDoc(String id) throws Exception;
18 
19     /** Updates the document described by the given fields. */
20     void updateDoc(Map<String, String> docMap) throws Exception;
21 
22     /** Inserts the document, or updates it when it is already indexed. */
23     void insertOrUpdateDoc(Map<String, String> docMap) throws Exception;
24 
25     /** Strictly matches the whole field; several fields may be passed. */
26     TopDocs searchDocsByTerm(Map<String, String> termMap) throws Exception;
27 
28     /** Matches the tokenized field; several fields may be passed. */
29     TopDocs searchDocsByParser(Map<String, String> parserMap) throws Exception;
30 
31 }

LuceneDAOImpl.java

  1 package io.guangsoft.erp.dao.impl;
  2 
  3 import io.guangsoft.erp.dao.LuceneDAO;
  4 import io.guangsoft.erp.util.LuceneUtil;
  5 import org.apache.lucene.document.Document;
  6 import org.apache.lucene.document.Field;
  7 import org.apache.lucene.document.FieldType;
  8 import org.apache.lucene.index.IndexOptions;
  9 import org.apache.lucene.index.IndexWriter;
 10 import org.apache.lucene.index.Term;
 11 import org.apache.lucene.queryparser.classic.QueryParser;
 12 import org.apache.lucene.queryparser.classic.QueryParserBase;
 13 import org.apache.lucene.search.*;
 14 
 15 import java.util.Map;
 16 
 17 /**
 18  * {@link LuceneDAO} implementation that works on the writer and searcher
 19  * provided by {@link LuceneUtil}.
 20  */
 21 public class LuceneDAOImpl implements LuceneDAO {
 22 
 23     // Field that identifies a document in update and delete operations.
 24     private static final String ID_FIELD = "id";
 25     // Maximum number of hits requested from a search.
 26     private static final int MAX_HITS = 99999999;
 27     // Whether hits are ordered by the "createTime" field, descending.
 28     // NOTE(review): sorting presumably requires "createTime" to be indexed as
 29     // doc values, which buildDocument() does not do - confirm before enabling.
 30     private static final boolean SORT_BY_CREATE_TIME = false;
 31 
 32     /**
 33      * Adds a new document built from {@code docMap} and commits it.
 34      *
 35      * @param docMap field name to field value
 36      * @throws Exception if indexing or the commit fails
 37      */
 38     @Override
 39     public void insertDoc(Map<String, String> docMap) throws Exception {
 40         IndexWriter indexWriter = LuceneUtil.getIndexWriter();
 41         indexWriter.addDocument(buildDocument(docMap));
 42         // Commit rather than close: the change is made durable while the
 43         // shared writer stays open for other callers.
 44         indexWriter.commit();
 45     }
 46 
 47     /**
 48      * Deletes every document whose id field equals {@code id}.
 49      *
 50      * @param id value of the id field
 51      * @throws IllegalArgumentException if {@code id} is null
 52      * @throws Exception if the delete or the commit fails
 53      */
 54     @Override
 55     public void deleteDoc(String id) throws Exception {
 56         Term term = idTerm(id);
 57         IndexWriter indexWriter = LuceneUtil.getIndexWriter();
 58         indexWriter.deleteDocuments(term);
 59         // forceMergeDeletes() is not needed for correctness: once committed,
 60         // deleted documents are no longer visible to newly opened readers.
 61         indexWriter.commit();
 62     }
 63 
 64     /**
 65      * Replaces the document whose id field equals the "id" entry of
 66      * {@code docMap} with a document built from {@code docMap}. When no such
 67      * document exists, the new one is added.
 68      *
 69      * @param docMap field name to field value; must contain "id"
 70      * @throws IllegalArgumentException if {@code docMap} has no "id" value
 71      * @throws Exception if the update or the commit fails
 72      */
 73     @Override
 74     public void updateDoc(Map<String, String> docMap) throws Exception {
 75         Term term = idTerm(docMap.get(ID_FIELD));
 76         Document document = buildDocument(docMap);
 77         IndexWriter indexWriter = LuceneUtil.getIndexWriter();
 78         indexWriter.updateDocument(term, document);
 79         indexWriter.commit();
 80     }
 81 
 82     /**
 83      * Inserts the document, or replaces the one that has the same id.
 84      *
 85      * @param docMap field name to field value; must contain "id"
 86      * @throws Exception if the update or the commit fails
 87      */
 88     @Override
 89     public void insertOrUpdateDoc(Map<String, String> docMap) throws Exception {
 90         // updateDocument() deletes by term and then adds, so it covers both
 91         // cases; no search is required, which also works on an empty index.
 92         updateDoc(docMap);
 93     }
 94 
 95     /**
 96      * Searches with one required {@link TermQuery} per map entry. The values
 97      * are used as terms exactly as given; no analyzer is applied to them.
 98      *
 99      * @param termMap field name to term text
100      * @return the matching documents
101      * @throws Exception if the search fails
102      */
103     @Override
104     public TopDocs searchDocsByTerm(Map<String, String> termMap) throws Exception {
105         BooleanQuery.Builder builder = new BooleanQuery.Builder();
106         for (Map.Entry<String, String> entry : termMap.entrySet()) {
107             Term term = new Term(entry.getKey(), entry.getValue());
108             builder.add(new TermQuery(term), BooleanClause.Occur.MUST);
109         }
110         return search(builder.build());
111     }
112 
113     /**
114      * Searches with one required parsed query per map entry. Each value is
115      * parsed by a {@link QueryParser} that uses the shared analyzer and
116      * requires all of its terms (AND).
117      *
118      * @param parserMap field name to query text
119      * @return the matching documents
120      * @throws Exception if a value cannot be parsed or the search fails
121      */
122     @Override
123     public TopDocs searchDocsByParser(Map<String, String> parserMap) throws Exception {
124         BooleanQuery.Builder builder = new BooleanQuery.Builder();
125         for (Map.Entry<String, String> entry : parserMap.entrySet()) {
126             QueryParser queryParser = new QueryParser(entry.getKey(), LuceneUtil.getAnalyzer());
127             queryParser.setDefaultOperator(QueryParserBase.AND_OPERATOR);
128             builder.add(queryParser.parse(entry.getValue()), BooleanClause.Occur.MUST);
129         }
130         return search(builder.build());
131     }
132 
133     // Runs the query, optionally sorted by "createTime" (see SORT_BY_CREATE_TIME).
134     private static TopDocs search(Query query) throws Exception {
135         IndexSearcher searcher = LuceneUtil.getIndexSearcher();
136         if (SORT_BY_CREATE_TIME) {
137             Sort sort = new Sort(new SortField("createTime", SortField.Type.LONG, true));
138             return searcher.search(query, MAX_HITS, sort);
139         }
140         return searcher.search(query, MAX_HITS);
141     }
142 
143     // Builds the term that addresses a document by its id.
144     private static Term idTerm(String id) {
145         if (id == null) {
146             throw new IllegalArgumentException("document id must not be null");
147         }
148         return new Term(ID_FIELD, id);
149     }
150 
151     // Creates one stored field per map entry.
152     private static Document buildDocument(Map<String, String> docMap) {
153         FieldType textType = newFieldType(true);
154         // The id is indexed as a single untokenized term so that the exact-match
155         // Term lookups in updateDoc() and deleteDoc() find it whatever the
156         // analyzer would do to the value.
157         FieldType idType = newFieldType(false);
158         Document document = new Document();
159         for (Map.Entry<String, String> entry : docMap.entrySet()) {
160             FieldType type = ID_FIELD.equals(entry.getKey()) ? idType : textType;
161             document.add(new Field(entry.getKey(), entry.getValue(), type));
162         }
163         return document;
164     }
165 
166     // Field settings shared by all fields; only tokenization differs.
167     private static FieldType newFieldType(boolean tokenized) {
168         FieldType fieldType = new FieldType();
169         // Keep the original value so it can be returned with search hits.
170         fieldType.setStored(true);
171         // Index document ids only: no term frequencies and no positions.
172         fieldType.setIndexOptions(IndexOptions.DOCS);
173         // Norms are omitted, so no index-time boost or length normalization.
174         fieldType.setOmitNorms(true);
175         // Whether the analyzer splits the value into separate tokens.
176         fieldType.setTokenized(tokenized);
177         return fieldType;
178     }
179 
180 }

LuceneTest.java

 1 package io.guangsoft.erp;
 2 
 3 import com.alibaba.fastjson.JSONArray;
 4 import com.alibaba.fastjson.JSONObject;
 5 import io.guangsoft.erp.dao.LuceneDAO;
 6 import io.guangsoft.erp.dao.impl.LuceneDAOImpl;
 7 import io.guangsoft.erp.util.LuceneUtil;
 8 import org.apache.lucene.index.Term;
 9 import org.apache.lucene.search.TermQuery;
10 import org.apache.lucene.search.TopDocs;
11 import org.junit.Test;
12 
13 import java.util.HashMap;
14 import java.util.Map;
15 import java.util.stream.Collectors;
16 
17 /**
18  * Manual smoke tests for {@link LuceneDAO}. Results are printed rather than
19  * asserted, and the tests work on the index managed by {@link LuceneUtil}.
20  */
21 public class LuceneTest {
22 
23     LuceneDAO luceneDAO = new LuceneDAOImpl();
24 
25     // Indexes three sample documents parsed from a JSON array.
26     @Test
27     public void testInsertDoc() throws Exception {
28         // String values are delimited by plain single quotes ('), which the
29         // JSON parser accepts; typographic quotes are not string delimiters.
30         JSONArray jsonArray = JSONArray.parseArray(
31                 "[{id:'1',name:'李白',desc:'朝辞白帝彩云间'}, " +
32                         "{id:'2',name:'杜甫',desc:'润物细无声'}, " +
33                         "{id:'3',name:'苏轼',desc:'大江东去浪淘尽'}]");
34         for (int i = 0; i < jsonArray.size(); i++) {
35             JSONObject jsonObject = jsonArray.getJSONObject(i);
36             Map<String, String> docMap = jsonObject.entrySet().stream().collect(Collectors.toMap(
37                     Map.Entry::getKey, entry -> entry.getValue().toString()
38             ));
39             luceneDAO.insertDoc(docMap);
40         }
41     }
42 
43     // Term search on the "name" field; prints the hits.
44     @Test
45     public void testSearchDocsByTerm() throws Exception {
46         Map<String, String> docMap = new HashMap<>();
47         docMap.put("name", "李白");
48         TopDocs topDocs = luceneDAO.searchDocsByTerm(docMap);
49         LuceneUtil.printTopDocs(topDocs);
50     }
51 
52     // Parsed-query search on the "name" field; prints the hits with highlighting.
53     @Test
54     public void testSearchDocsByParser() throws Exception {
55         Map<String, String> docMap = new HashMap<>();
56         docMap.put("name", "李白");
57         TopDocs topDocs = luceneDAO.searchDocsByParser(docMap);
58         LuceneUtil.printTopDocsHighlight(topDocs, new TermQuery(new Term("name", "李白")));
59     }
60 
61     // Prints the hits for a name, updates document "1", then prints them again.
62     @Test
63     public void testUpdateDoc() throws Exception {
64         Map<String, String> docMap = new HashMap<>();
65         docMap.put("name", "李白");
66         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
67         docMap.put("id", "1");
68         docMap.put("desc", "人生得意须尽欢");
69         luceneDAO.updateDoc(docMap);
70         // Drop the extra keys so the second search uses the name only.
71         docMap.remove("id");
72         docMap.remove("desc");
73         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
74     }
75 
76     // Prints the hits for id "1" before and after deleting that document.
77     @Test
78     public void testDeleteDoc() throws Exception {
79         Map<String, String> docMap = new HashMap<>();
80         docMap.put("id", "1");
81         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
82         luceneDAO.deleteDoc("1");
83         LuceneUtil.printTopDocs(luceneDAO.searchDocsByTerm(docMap));
84     }
85 }

 

以上是关于Lucene全文检索的主要内容,如果未能解决你的问题,请参考以下文章:

Lucene的全文检索学习

Lucene学习总结

Lucene就是这么简单

Lucene就是这么简单

Lucene原理与代码分析解读笔记

[Lucene]-Lucene基本概述以及简单实例