Lucene的QueryParser
Posted 闫广庆
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Lucene的QueryParser相关的知识,希望对你有一定的参考价值。
本文将详细讲解应该怎样合理地使用 QueryParser。
官网把它介绍为“查询解析器和解析框架”,那么 QueryParser 是如何实现的呢?
接下来我就用一段代码来讲解 QueryParser 究竟是如何实现的。
package HelloLucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.queries.regex.RegexQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
public final class SelectLucene {
private final static String dir = "./lucene";
/**
* 词条搜索(单个关键字查找)
* 主要对象是TermQuery,调用方式如下:
* Term term=new Term(字段名, 搜索关键字);
* Query query=new TermQuery(term);
* Hits hits=searcher.search(query);
*
* @param file 字段名
* @param keyWord 搜索关键字
* @return 文档集合
* @throws Exception 抛出异常
*/
public List<Document> termQuery(String file, String keyWord) throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
Term term = new Term( file, keyWord );
Query query = new TermQuery( term );
TopDocs topDocs = searcher.search( query, 1000 );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
LinkedHashMap<Float, Document> documentLinkedHashMap = new LinkedHashMap<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
float score = scDoc.score; //相似度
documentLinkedHashMap.put( score, document );
}
List<Document> documents = new ArrayList<>( documentLinkedHashMap.values() );
reader.close();
return documents;
}
/**
* 组合搜索(允许多个关键字组合搜索)
*
* @param combinatorialSearches 多条件数据返回
* @param backCount 返回条数
* @throws Exception 抛出异常
*/
public List<Document> booleanQuery(List<CombinatorialSearch> combinatorialSearches, int backCount) throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
List<BooleanClause> booleanClauses = new ArrayList<>();
combinatorialSearches.forEach( combinatorialSearch -> {
Query query1 = new TermQuery( new Term( combinatorialSearch.getFileName(), combinatorialSearch.getContext() ) );
BooleanClause bc1 = new BooleanClause( query1, combinatorialSearch.getStrategy() );
booleanClauses.add( bc1 );
} );
BooleanQuery.Builder builder = new BooleanQuery.Builder();
booleanClauses.forEach( builder::add );
BooleanQuery boolQuery = builder.build();
TopDocs topDocs = searcher.search( boolQuery, backCount );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
LinkedHashMap<Float, Document> documentLinkedHashMap = new LinkedHashMap<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
float score = scDoc.score; //相似度
documentLinkedHashMap.put( score, document );
}
List<Document> documents = new ArrayList<>( documentLinkedHashMap.values() );
reader.close();
return documents;
}
/**
* 范围搜索(允许搜索指定范围内的关键字结果)
* 主要对象是TermRangeQuery,调用方式如下:
* TermRangeQuery rangequery=new TermRangeQuery(字段名, 起始值, 终止值, 起始值是否包含边界, 终止值是否包含边界);
* Hits hits=searcher.search(rangequery);
* 此方法中的参数是Boolean类型的,表示是否包含边界 。
* true 包含边界
* false不包含边界
*
* @param rangeSearch 范围查询
* @throws Exception 抛出异常
*/
public List<Document> rangeQuery(RangeSearch rangeSearch) throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
TermRangeQuery timeQuery = new TermRangeQuery( rangeSearch.getSearchName(), new BytesRef( rangeSearch.getLimitLow() ), new BytesRef( rangeSearch.getLimitHigh() ), rangeSearch.getLowerBoundBoundary(), rangeSearch.getUpperBoundBoundary() );
TopDocs topDocs = searcher.search( timeQuery, 1000 );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
/**
* 前缀搜索(搜索起始位置符合要求的结果)
* 主要对象是PrefixQuery,调用方式如下:
* Term term=new Term(字段名, 搜索关键字);
* PrefixQuery prefixquery=new PrefixQuery(term);
* Hits hits=searcher.search(prefixquery);
*
* @param fileName 搜索库的名字
* @param text 需要搜索前缀的数据
* @throws Exception 文件查询时异常处理
*/
public List<Document> prefixQuery(String fileName, String text) throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
Term term = new Term( fileName, text );
PrefixQuery prefixquery = new PrefixQuery( term );
TopDocs topDocs = searcher.search( prefixquery, 1000 );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
/**
* 短语搜索(根据零碎的短语组合成新的词组进行搜索)
* 其中setSlop的参数是设置两个关键字之间允许间隔的最大值。
*
* @param RecoverCount 返回条数
* @throws Exception 查询时抛出异常
*/
public List<Document> phraseQuery(List<PhraseSearch> phraseSearches, int RecoverCount) throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
PhraseQuery.Builder builder = new PhraseQuery.Builder();
phraseSearches.forEach( phraseSearch -> builder.add( new Term( phraseSearch.getFileName(), phraseSearch.getContext() ), phraseSearch.getInterval() ) );
PhraseQuery pq = builder.build();
TopDocs topDocs = searcher.search( pq, RecoverCount );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
/**
* 多短语搜索(先指定一个前缀关键字,然后其他的关键字加在此关键字之后,组成词语进行搜索)
* 主要对象是MultiPhraseQuery,调用方式如下:
* Term term=new Term(字段名,前置关键字);
* Term term1=new Term(字段名,搜索关键字);
* Term term2=new Term(字段名,搜索关键字);
* MultiPhraseQuery multiPhraseQuery=new MultiPhraseQuery();
* multiPhraseQuery.add(term);
* multiPhraseQuery.add(new Term[]{term1, term2});
* Hits hits=searcher.search(multiPhraseQuery);
*
* @throws Exception
*/
public List<Document> multiPhraseQuery() throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
//查询“计张”、“计钦”组合的关键词,先指定一个前缀关键字,然后其他的关键字加在此关键字之后,组成词语进行搜索
Term term = new Term( "name", "计" ); //前置关键字
Term term1 = new Term( "name", "张" ); //搜索关键字
Term term2 = new Term( "name", "钦" ); //搜索关键字
MultiPhraseQuery multiPhraseQuery = new MultiPhraseQuery();
multiPhraseQuery.add( term );
multiPhraseQuery.add( new Term[]{term1, term2} );
TopDocs topDocs = searcher.search( multiPhraseQuery, 1000 );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
/**
* 模糊搜索(顾名思义)
* 主要对象是FuzzyQuery,调用方式如下:
* Term term=new Term(字段名, 搜索关键字);
* FuzzyQuery fuzzyquery=new FuzzyQuery(term,参数);
* Hits hits=searcher.search(fuzzyquery);
* 此中的参数是表示模糊度,是小于1的浮点小数,比如0.5f
*
* @throws Exception
*/
public List<Document> fuzzyQuery(FuzzySearch fuzzySearch, int returnCount) throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
Term term = new Term( fuzzySearch.getFileName(), fuzzySearch.getContext() );
FuzzyQuery fuzzyquery = new FuzzyQuery( term, fuzzySearch.getFuzziness() );
TopDocs topDocs = searcher.search( fuzzyquery, returnCount );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
/**
* 通配符搜索(顾名思义)
* 主要对象是:WildcardQuery,调用方式如下:
* Term term=new Term(字段名,搜索关键字+通配符);
* WildcardQuery wildcardquery=new WildcardQuery(term);
* Hits hits=searcher.search(wildcardquery);
* 其中的通配符分两种,即*和?
* * 表示任意多的自负
* ?表示任意一个字符
*
* @throws Exception
*/
public List<Document> wildcardQuery() throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
Term term = new Term( "name", "三?" );
WildcardQuery wildcardQuery = new WildcardQuery( term );
TopDocs topDocs = searcher.search( wildcardQuery, 1000 );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
/**
* 正则表达式搜索(顾名思义,这个类引入lucene-queries-3.5.0.jar包)
* 主要对象是:RegexQuery,调用方式如下
* String regex = ".*";
* Term term = new Term (search_field_name, regex);
* RegexQuery query = new RegexQuery (term);
* TopDocs hits = searcher.search (query, 100);
*
* @throws Exception
*/
public List<Document> regexQuery() throws Exception {
Directory directory = FSDirectory.open( Paths.get( dir ) );
IndexReader reader = DirectoryReader.open( directory );
IndexSearcher searcher = new IndexSearcher( reader );
String regex = "林*";
Term term = new Term( "name", regex );
RegexQuery query = new RegexQuery( term );
TopDocs topDocs = searcher.search( query, 1000 );
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
List<Document> documents = new ArrayList<>();
for (ScoreDoc scDoc : scoreDocs) {
Document document = searcher.doc( scDoc.doc );
documents.add( document );
}
reader.close();
return documents;
}
}
以上是关于Lucene的QueryParser的主要内容,如果未能解决你的问题,请参考以下文章
Lucene系列:(10)多条件搜索 QueryParser
Lucene 高阶查询的六脉神剑 —— QueryParser
Lucene.Net(3.0.3 或 4.8.0)QueryParser 可以搜索数字吗?