使用Lucene.Net实现全文检索

Posted dotNET跨平台

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用Lucene.Net实现全文检索相关的知识,希望对你有一定的参考价值。

Lucene.Net是Lucene的.NET移植版本,是一个开源的全文检索引擎开发包。它并不是一个完整的全文检索引擎,而是一套全文检索引擎的架构,提供了完整的查询引擎和索引引擎。

开发人员可以基于Lucene.net实现全文检索的功能。

Lucene.Net是Apache软件基金会赞助的开源项目,基于Apache License协议发布。

Lucene.Net并不是一个爬虫式的搜索引擎,也不会自动地索引内容。我们需要先把待索引文档中的文本抽取出来,然后再将其添加到Lucene.Net索引中。标准的步骤是:先初始化一个Analyzer,打开一个IndexWriter,然后将文档一个接一个地添加进去。完成这些步骤之后,索引可以在关闭之前得到优化,同时所做的改动也会生效。这个过程可能比开发者习惯的方式更偏手工一些,但它在数据的索引上给予了你更多的灵活性,而且效率也很高。

获取索引目录

    /// <summary>
        /// 获取索引目录
        /// </summary>
        /// <param name="index">索引类型</param>
        /// <returns>索引目录</returns>
        private LcStore.Directory GetLuceneDirectory(IndexType index)
        {
            var indexPath = string.Empty;
            try
            {
                var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");

                var indexName = Enum.EnumHelper.GetEnumDescription(index);

                indexPath = Path.Combine(dirPath, indexName);

                return LcStore.FSDirectory.Open(indexPath);
            }
            catch (Exception ex)
            {
                NLogger.Write($"获取索引目录失败" + Environment.NewLine +
                              $"路径:{indexPath}" + Environment.NewLine +
                              $"异常信息:{ex}",
                             "Lucene", "x", "x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("获取索引目录异常,详情请查看相关日志");
            }
        }

        #endregion 获取目录

盘古分词

   /// <summary>
        /// 盘古分词
        /// </summary>
        /// <param name="keyword">语句</param>
        /// <returns>词组集合</returns>
        public string[] GetSplitKeywords(string keyword)
        {
            try
            {
                string ret = null;
                var reader = new StringReader(keyword);
                var ts = PanguAnalyzer.TokenStream(keyword, reader);
                var hasNext = ts.IncrementToken();
                Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
                while (hasNext)
                {
                    ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                    ret += ita.Term + "|";
                    hasNext = ts.IncrementToken();
                }
                ts.CloneAttributes();
                reader.Close();
                PanguAnalyzer.Close();

                if (string.IsNullOrWhiteSpace(ret)) return null;

                ret = ret.Substring(0, ret.Length - 1);
                return ret.Split('|');
            }
            catch (Exception ex)
            {
                NLogger.Write("分词异常" + Environment.NewLine +
                              $"关键词:{keyword}" + Environment.NewLine +
                              $"异常信息:{ex}",
                             "Lucene", "x", "x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("分词出现异常,详情请查看相关日志");
            }
        }

        #endregion 分词

创建索引或追加索引

     /// <summary>
        /// 创建索引或追加索引
        /// </summary>
        /// <param name="dataList">数据集合</param>
        /// <param name="index">索引类型</param>
        public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
        {
            if (dataList == null || dataList.Count == 0)
                return;

            IndexWriter writer;
            var directory = GetLuceneDirectory(index);
            try
            {
                //false表示追加(true表示删除之前的重新写入)
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            }
            catch
            {
                //false表示追加(true表示删除之前的重新写入)
                writer = new IndexWriter(directory, PanguAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            }
            writer.MergeFactor = 1000;
            //writer.SetMaxBufferedDocs(1000);
            foreach (var doc in dataList)
            {
                writer.AddDocument(doc);
            }
            writer.Optimize();

            writer.Dispose();
            directory.Dispose();
        }

完整代码

 /// <summary>
    /// Lucene搜索引擎帮助类
    /// </summary>
    public class LuceneHelper
    {
        /// <summary>
        /// 私有构造函数
        /// </summary>
        private LuceneHelper()
        {
        }

        #region 属性

        private static LuceneHelper _instance;

        /// <summary>
        /// 单一实例
        /// </summary>
        public static LuceneHelper Instance => _instance ?? (_instance = new LuceneHelper());

        private Analyzer _analyzer;

        /// <summary>
        /// 分析器
        /// </summary>
        private Analyzer PanguAnalyzer => _analyzer ?? (_analyzer = new PanGuAnalyzer());

        #endregion 属性

        #region 获取目录

        /// <summary>
        /// 获取索引目录
        /// </summary>
        /// <param name="index">索引类型</param>
        /// <returns>索引目录</returns>
        private LcStore.Directory GetLuceneDirectory(IndexType index)
        {
            var indexPath = string.Empty;
            try
            {
                var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");

                var indexName = Enum.EnumHelper.GetEnumDescription(index);

                indexPath = Path.Combine(dirPath, indexName);

                return LcStore.FSDirectory.Open(indexPath);
            }
            catch (Exception ex)
            {
                NLogger.Write($"获取索引目录失败" + Environment.NewLine +
                              $"路径:{indexPath}" + Environment.NewLine +
                              $"异常信息:{ex}",
                             "Lucene", "x", "x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("获取索引目录异常,详情请查看相关日志");
            }
        }

        #endregion 获取目录

        #region 分词

        /// <summary>
        /// 盘古分词
        /// </summary>
        /// <param name="keyword">语句</param>
        /// <returns>词组集合</returns>
        public string[] GetSplitKeywords(string keyword)
        {
            try
            {
                string ret = null;
                var reader = new StringReader(keyword);
                var ts = PanguAnalyzer.TokenStream(keyword, reader);
                var hasNext = ts.IncrementToken();
                Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
                while (hasNext)
                {
                    ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                    ret += ita.Term + "|";
                    hasNext = ts.IncrementToken();
                }
                ts.CloneAttributes();
                reader.Close();
                PanguAnalyzer.Close();

                if (string.IsNullOrWhiteSpace(ret)) return null;

                ret = ret.Substring(0, ret.Length - 1);
                return ret.Split('|');
            }
            catch (Exception ex)
            {
                NLogger.Write("分词异常" + Environment.NewLine +
                              $"关键词:{keyword}" + Environment.NewLine +
                              $"异常信息:{ex}",
                             "Lucene", "x", "x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("分词出现异常,详情请查看相关日志");
            }
        }

        #endregion 分词

        #region 索引增删改查

        /// <summary>
        /// 创建索引或追加索引
        /// </summary>
        /// <param name="dataList">数据集合</param>
        /// <param name="index">索引类型</param>
        public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
        {
            if (dataList == null || dataList.Count == 0)
                return;

            IndexWriter writer;
            var directory = GetLuceneDirectory(index);
            try
            {
                //false表示追加(true表示删除之前的重新写入)
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            }
            catch
            {
                //false表示追加(true表示删除之前的重新写入)
                writer = new IndexWriter(directory, PanguAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            }
            writer.MergeFactor = 1000;
            //writer.SetMaxBufferedDocs(1000);
            foreach (var doc in dataList)
            {
                writer.AddDocument(doc);
            }
            writer.Optimize();

            writer.Dispose();
            directory.Dispose();
        }

        /// <summary>
        /// 删除索引
        /// </summary>
        /// <param name="field">字段名</param>
        /// <param name="value">字段值</param>
        /// <param name="index">索引类型</param>
        public void DeleteIndexes(string field, string value, IndexType index)
        {
            IndexWriter writer = null;
            var directory = GetLuceneDirectory(index);
            try
            {
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
                var term = new Term(field, value);
                writer.DeleteDocuments(term);
                //var isSuccess = writer.HasDeletions();
                writer.Optimize();
            }
            catch (Exception ex)
            {
                NLogger.Write("删除索引异常" + Environment.NewLine +
                              $"异常信息:{ex}", "Lucene", "x", "x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("删除索引异常,详情请查看相关日志");
            }
            finally
            {
                writer?.Dispose();
                directory?.Dispose();
            }
        }

        /// <summary>
        /// 更新索引;这里实际上是先删除原有索引,在创建新索引。
        /// 所以在更新索引时,一定要确保传入的Document的所有字段都有值
        /// 否则将会被置为空
        /// </summary>
        /// <param name="field">字段名</param>
        /// <param name="value">字段值</param>
        /// <param name="doc">文档</param>
        /// <param name="index">索引类型</param>
        public void UpdateIndexes(string field, string value, Document doc, IndexType index)
        {
            IndexWriter writer = null;
            var directory = GetLuceneDirectory(index);
            try
            {
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
                var term = new Term(field, value);
                writer.UpdateDocument(term, doc);
            }
            catch (Exception ex)
            {
                NLogger.Write("更新索引异常" + Environment.NewLine +
                              $"异常信息:{ex}", "Lucene", "x", "x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("更新索引异常,详情请查看相关日志");
            }
            finally
            {
                writer?.Dispose();
                directory?.Dispose();
            }
        }

        #endregion 索引增删改查

        #region 查询

        /// <summary>
        /// 查询
        /// </summary>
        /// <typeparam name="T">实体类型</typeparam>
        /// <param name="fields">条件字段</param>
        /// <param name="keywords">关键词组</param>
        /// <param name="index">索引类型</param>
        /// <param name="sort">排序,可为空</param>
        /// <param name="count">读取数量</param>
        /// <returns>结果集</returns>
        public List<T> Search<T>
            (
            string[] fields,
            string[] keywords,
            IndexType index,
            Sort sort,
            int count
            ) where T : new()
        {
            if (fields == null || fields.Length == 0)
                return null;
            if (keywords == null || keywords.Length == 0)
                return null;

            //索引目录
            var directory = GetLuceneDirectory(index);

            //查询条件
            var boolQuery = GetQuery(fields, keywords);

            //索引查询器
            var searcher = new IndexSearcher(directory, true);

            TopDocs docs;
            if (sort != null)
                docs = searcher.Search(boolQuery, null, count, sort);
            else
                docs = searcher.Search(boolQuery, count);
            if (docs == null || docs.TotalHits == 0)
                return null;

            //文档集合
            var docList = docs.ScoreDocs.Select(sd => searcher.Doc(sd.Doc)).ToList();

            //反射赋值
            var list = ConvertDocToObj<T>(docList);

            searcher.Dispose();
            directory.Dispose();

            return list;
        }

        /// <summary>
        /// 查询分页数据(指定排序方式)
        /// </summary>
        /// <typeparam name="T">实体类型</typeparam>
        /// <param name="fields">条件字段</param>
        /// <param name="keywords">关键词组</param>
        /// <param name="index">索引类型</param>
        /// <param name="sort">排序,必填</param>
        /// <param name="pageNumber">页码</param>
        /// <param name="pageSize">页数</param>
        /// <returns>结果集</returns>
        public PagedResult<List<T>> SearchByPaged<T>
            (
            string[] fields,
            string[] keywords,
            IndexType index,
            Sort sort,
            int pageNumber = 1,
            int pageSize = 20
            ) where T : new()
        {
            if (fields == null || fields.Length == 0)
                return null;
            if (keywords == null || keywords.Length == 0)
                return null;

            //索引目录
            var directory = GetLuceneDirectory(index);

            //查询条件
            var boolQuery = GetQuery(fields, keywords);

            var collector = TopFieldCollector
                .Create(sort, pageNumber * pageSize, false, false, false, false);

            var searcher = new IndexSearcher(directory, true);

            searcher.Search(boolQuery, collector);

            if (collector == null || collector.TotalHits == 0)
                return null;

            //分页
            var start = (pageNumber - 1) * pageSize;
            var limit = pageSize;
            var hits = collector.TopDocs(start, limit).ScoreDocs;
            var totalCount = collector.TotalHits;

            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();

            //反射赋值
            var list = ConvertDocToObj<T>(docList);

            searcher.Dispose();
            directory.Dispose();

            return new PagedResult<List<T>>
            {
                Total = totalCount,
                Result = list
            };
        }

        /// <summary>
        /// 查询分页数据(默认排序方式)
        /// </summary>
        /// <typeparam name="T">实体类型</typeparam>
        /// <param name="fields">条件字段</param>
        /// <param name="keywords">关键词组</param>
        /// <param name="index">索引类型</param>
        /// <param name="pageNumber">页码</param>
        /// <param name="pageSize">页数</param>
        /// <returns>结果集</returns>
        public PagedResult<List<T>> SearchByPaged<T>
            (
            string[] fields,
            string[] keywords,
            IndexType index,
            int pageNumber = 1,
            int pageSize = 20
            ) where T : new()
        {
            if (fields == null || fields.Length == 0)
                return null;
            if (keywords == null || keywords.Length == 0)
                return null;

            //索引目录
            var directory = GetLuceneDirectory(index);

            //查询条件
            var boolQuery = GetQuery(fields, keywords);

            var collector = TopScoreDocCollector.Create(pageNumber * pageSize, false);
            var searcher = new IndexSearcher(directory, true);

            searcher.Search(boolQuery, collector);

            if (collector == null || collector.TotalHits == 0)
                return null;

            //分页
            var start = (pageNumber - 1) * pageSize;
            var limit = pageSize;
            var hits = collector.TopDocs(start, limit).ScoreDocs;
            var totalCount = collector.TotalHits;

            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();

            //反射赋值
            var list = ConvertDocToObj<T>(docList);

            searcher.Dispose();
            directory.Dispose();

            return new PagedResult<List<T>>
            {
                Total = totalCount,
                Result = list
            };
        }

        /// <summary>
        /// 查询分页数据(默认排序方式)
        /// </summary>
        /// <param name="fields">条件字段</param>
        /// <param name="keywords">关键词组</param>
        /// <param name="index">索引类型</param>
        /// <returns>结果集</returns>
        public int GetTotla(string[] fields, string[] keywords, IndexType index)
        {
            if (fields == null || fields.Length == 0)
                return 0;
            if (keywords == null || keywords.Length == 0)
                return 0;

            //索引目录
            var directory = GetLuceneDirectory(index);

            //查询条件
            var boolQuery = GetQuery(fields, keywords);

            var collector = TopScoreDocCollector.Create(20, false);
            var searcher = new IndexSearcher(directory, true);

            searcher.Search(boolQuery, collector);

            if (collector == null || collector.TotalHits == 0)
                return 0;

            searcher.Dispose();
            directory.Dispose();

            return collector.TotalHits;
        }

        /// <summary>
        /// 文档转换为对象
        /// </summary>
        /// <typeparam name="T">实体类型</typeparam>
        /// <param name="docList">文档集合</param>
        /// <returns>对象集合</returns>
        private List<T> ConvertDocToObj<T>(List<Document> docList) where T : new()
        {
            var type = typeof(T);
            var propertyList = type.GetProperties(BindingFlags.Public | BindingFlags.Instance);

            var list = new List<T>();
            var firstDoc = docList.First();
            var fieldNames = firstDoc.GetFields().Select(x => x.Name).ToList();

            foreach (var doc in docList)
            {
                var tObj = new T();
                foreach (var pInfo in propertyList)
                {
                    var name = pInfo.Name;
                    if (fieldNames.Any(x => x.ToLower() == name.ToLower()))
                    {
                        SetValue<T>(pInfo, tObj, doc, name);
                    }
                }

                list.Add(tObj);
            }
            return list;
        }

        /// <summary>
        /// 获取查询条件
        /// </summary>
        /// <param name="fields">条件字段</param>
        /// <param name="keywords">关键词组</param>
        /// <returns></returns>
        private BooleanQuery GetQuery(string[] fields, string[] keywords)
        {
            var boolQuery = new BooleanQuery();
            foreach (var field in fields)
            {
                foreach (var keyword in keywords)
                {
                    var t = new TermQuery(new Term(field, keyword));
                    boolQuery.Add(t, Occur.SHOULD);
                }
            }
            return boolQuery;
        }

        #endregion 查询

        private void SetValue<T>(PropertyInfo pInfo, T tObj, Document doc, string name)
        {
            var pType = pInfo.PropertyType.Name;
            switch (pType)
            {
                case "String":
                    pInfo.SetValue(tObj, doc.Get(name), null);
                    break;

                case "Int32":
                    pInfo.SetValue(tObj, GetInt(doc.Get(name)), null);
                    break;

                case "Boolean":
                    pInfo.SetValue(tObj, GetBool(doc.Get(name)), null);
                    break;

                case "DateTime":
                    pInfo.SetValue(tObj, GetDate(doc.Get(name)), null);
                    break;

                case "Double":
                    pInfo.SetValue(tObj, GetDouble(doc.Get(name)), null);
                    break;

                case "Single":
                    pInfo.SetValue(tObj, GetFloat(doc.Get(name)), null);
                    break;

                case "Decimal":
                    pInfo.SetValue(tObj, GetDecimal(doc.Get(name)), null);
                    break;
            }
        }

        private int GetInt(string value)
        {
            var result = 0;
            int.TryParse(value, out result);
            return result;
        }

        private DateTime GetDate(string value)
        {
            DateTime result;
            DateTime.TryParse(value, out result);
            return result;
        }

        private bool GetBool(string value)
        {
            bool result;
            bool.TryParse(value, out result);
            return result;
        }

        private double GetDouble(string value)
        {
            double result;
            double.TryParse(value, out result);
            return result;
        }

        private float GetFloat(string value)
        {
            float result;
            float.TryParse(value, out result);
            return result;
        }

        private decimal GetDecimal(string value)
        {
            decimal result;
            decimal.TryParse(value, out result);
            return result;
        }
    }

以上是关于使用Lucene.Net实现全文检索的主要内容,如果未能解决你的问题,请参考以下文章:

Code Index: 基于Lucene.Net的代码检索工具

使用Lucene.Net做一个简单的搜索引擎-全文索引

转Lucene.NET详细使用与优化详解

Lucene的索引系统和搜索过程分析

Lucene.Net-全文检索

(VIP-朝夕教育)2021-06-06 .NET高级班 39-搜索引擎Lucene的使用