lucene自定义同义词实现
Posted 王南辉
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了lucene自定义同义词实现相关的知识,希望对你有一定的参考价值。
Lucene 同义词搜索的原理,其实是利用 PositionIncrementAttribute 和 CharTermAttribute 记录的词元信息来实现的:把同义词作为位置增量为 0 的词元,放在与原词相同的位置上。当前使用的 Lucene 版本为 4.8.0。1.首先要实现同义词过滤器(TokenFilter):
package lucene_index;
import java.io.IOException;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that injects synonyms at the same position as the token they belong to.
 *
 * <p>For every token produced by the wrapped stream, the term text is looked up in the
 * synonym map. When an entry exists, the token is emitted unchanged first, and each
 * array element of the entry is then emitted as an extra token with a position
 * increment of 0.
 *
 * <p>Like other token streams, an instance keeps per-stream state and must be
 * {@link #reset() reset} before each new stream is consumed.
 */
public class MySameFiter extends TokenFilter {
    /** Synonyms still waiting to be emitted for the most recently matched token. */
    Stack<String> stack = null;
    /** Term text of the current token. */
    private final CharTermAttribute cta;
    /** Position increment of the current token. */
    private final PositionIncrementAttribute position;
    /** Attribute snapshot of the token whose synonyms are currently being emitted. */
    private AttributeSource.State current;
    /** Synonym table: term text -> terms to inject at the same position. May be null. */
    private final Map<String, String[]> map;

    /**
     * @param input the token stream to decorate
     * @param map   synonym table keyed by term text; {@code null} is treated as "no synonyms"
     */
    protected MySameFiter(TokenStream input, Map<String, String[]> map) {
        super(input);
        stack = new Stack<>();
        // A TokenFilter shares its attribute source with its input, so registering the
        // attributes on the filter itself yields the same instances the input writes to.
        cta = addAttribute(CharTermAttribute.class);
        position = addAttribute(PositionIncrementAttribute.class);
        this.map = map;
    }

    /**
     * Emits the next pending synonym if there is one, otherwise advances the wrapped stream.
     *
     * @return {@code false} once the wrapped stream is exhausted and no synonym is pending
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!stack.isEmpty()) {
            String synonym = stack.pop();
            // Start from the snapshot of the original token so every other attribute
            // is carried over, then overwrite only the term text.
            restoreState(current);
            cta.setEmpty();
            cta.append(synonym);
            // Increment 0 places the synonym at the same position as the original token.
            position.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        // Remember the original token's attributes only when synonyms were queued for it.
        if (getSameWrds(cta.toString())) {
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and discards state left over from a previous stream, so a
     * reused filter cannot emit stale synonyms at the start of the next stream.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        stack.clear();
        current = null;
    }

    /**
     * Queues the synonyms registered for {@code words}.
     *
     * <p>Entries are pushed in array order, so they are later popped (emitted) in reverse
     * array order.
     *
     * @param words term text to look up
     * @return {@code true} if the map has an entry for {@code words}
     */
    private boolean getSameWrds(String words) {
        if (map == null) {
            return false;
        }
        String[] synonyms = map.get(words);
        if (synonyms == null) {
            return false;
        }
        for (String synonym : synonyms) {
            stack.push(synonym);
        }
        return true;
    }
}
2.自定义分词器(StopWrodsAnalyse),把上面的同义词过滤器接在 IK 分词器之后;
其后的 MainTest 是第 3 步测试所用的代码:
package lucene_index;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Analyzer that tokenizes with {@link IKTokenizer} and then passes the tokens through
 * {@link MySameFiter} using the synonym map supplied at construction time.
 *
 * <p>No stop-word filtering is applied by this analyzer.
 */
public class StopWrodsAnalyse extends Analyzer {
    /** Synonym table handed to every {@link MySameFiter} this analyzer creates. */
    private final Map<String, String[]> map;

    /**
     * @param map synonym table keyed by term text; stored by reference, not copied
     */
    public StopWrodsAnalyse(Map<String, String[]> map) {
        this.map = map;
    }

    /**
     * Builds the analysis chain: IK tokenizer followed by the synonym filter.
     *
     * @param words  field name (not used to vary the chain)
     * @param reader source of the text to analyze
     * @return components whose tokenizer is the IK tokenizer and whose final stream is the synonym filter
     */
    @Override
    protected TokenStreamComponents createComponents(String words, Reader reader) {
        // NOTE(review): the second argument is presumably IK's "useSmart" flag, so false
        // would select fine-grained segmentation; confirm against the IK version in use.
        Tokenizer source = new IKTokenizer(reader, false);
        TokenStream stream = new MySameFiter(source, map);
        return new TokenStreamComponents(source, stream);
    }
}
package lucene_index;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import javax.print.Doc;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Manual end-to-end check of the synonym analyzer: loads a synonym table from a CSV file,
 * indexes one document per table key with {@link StopWrodsAnalyse}, then runs a term query
 * against the resulting index and prints the hits.
 */
public class MainTest {
    /**
     * Builds the index and runs the sample search.
     *
     * @param args unused
     * @throws IOException    if the CSV file or the index cannot be read or written
     * @throws ParseException declared for callers; not thrown by the current code
     */
    public static void main(String[] args) throws IOException, ParseException {
        Map<String, String[]> map = loadSynonyms(
                new File("E://searchwork_custom//data_index//ConfigFile//ExpansionWord.csv"), "gbk");
        Analyzer analyzer = new StopWrodsAnalyse(map);

        // One Directory instance serves both writing and reading, and is closed at the end.
        try (Directory directory = FSDirectory.open(new File("E:\\luceneindex"))) {
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
            try (IndexWriter writer = new IndexWriter(directory, config)) {
                Collection<Document> coll = new ArrayList<Document>();
                for (Map.Entry<String, String[]> entry : map.entrySet()) {
                    Document docss = new Document();
                    // The field is analyzed, so the synonym filter runs on each key at index time.
                    docss.add(new Field("name", entry.getKey(), Store.YES, Index.ANALYZED));
                    coll.add(docss);
                }
                writer.addDocuments(coll);
                writer.commit();
            }

            // Open the reader only after the writer is closed so it sees the committed documents.
            try (IndexReader reader = IndexReader.open(directory)) {
                search(new IndexSearcher(reader));
            }
        }
    }

    /**
     * Runs a fixed term query on the {@code name} field and prints the query followed by
     * the stored {@code name} of each of the top 10 hits.
     *
     * @param searcher searcher over the index to query
     * @throws IOException if the index cannot be read
     */
    public static void search(IndexSearcher searcher) throws IOException {
        Query q = new TermQuery(new Term("name", "中国建设银行"));
        System.out.println(q);
        TopDocs topDocs = searcher.search(q, 10);
        for (ScoreDoc hit : topDocs.scoreDocs) {
            Document d = searcher.doc(hit.doc);
            System.out.println(d.get("name"));
        }
    }

    /**
     * Reads the synonym table: one comma-separated group per line, keyed by its first element.
     *
     * <p>The stored array is the whole group, including the key itself as element 0. When
     * several lines share the same first element, only the first line is kept. Lines that
     * yield no non-empty first element (blank lines, lines made only of commas) are skipped.
     *
     * @param file     CSV file to read
     * @param encoding character encoding of the file
     * @return synonym groups keyed by their first element
     * @throws IOException if the file cannot be opened or read
     */
    private static Map<String, String[]> loadSynonyms(File file, String encoding) throws IOException {
        Map<String, String[]> map = new HashMap<String, String[]>();
        LineIterator it = FileUtils.lineIterator(file, encoding);
        try {
            while (it.hasNext()) {
                String word = it.nextLine();
                // Every "-," sequence is dropped before splitting.
                // NOTE(review): presumably "-" marks an empty column in the CSV; confirm against the data.
                String[] wordArr = word.replace("-,", "").trim().split("\\,");
                // split() drops trailing empty strings, so a comma-only line yields a zero-length array.
                if (wordArr.length == 0 || wordArr[0].isEmpty()) {
                    continue;
                }
                if (!map.containsKey(wordArr[0])) {
                    map.put(wordArr[0], wordArr);
                }
            }
        } finally {
            // Release the underlying file handle even if reading fails part-way.
            it.close();
        }
        return map;
    }
}
3.测试
当同义词表中"建行""建设银行""中国建设银行"互为同义词时,无论搜索"建行"、"建设银行"还是"中国建设银行",都可以命中对应的文档。
以上是关于lucene自定义同义词实现的主要内容,如果未能解决你的问题,请参考以下文章
在 Elasticsearch 中加载自定义同义词文件时出错