C#,Lucene,PanGu全文检索,减少数据压力并实现口语化非标准查询

2021-05-30 更新:整个流程如下
1、读取数据库数据
2、分词(我在这里用的是盘古分词)
3、将分词后的索引单独存储在硬盘上
4、查询时基于 Lucene 在这些单独存储的索引文件中查找,而不是到数据库中查找

实例源码下载地址: https://pan.baidu.com/s/1cl88QsYp_zhY-yclORZHlg 提取码: s39x
相关的视频教程 https://www.bilibili.com/video/BV1K44y1k7Vq?share_source=copy_web
//引用的第三方库有以下5个
//PanGu.Lucene.Analyzer
//PanGu
//PanGu.HighLight
//Lucene.Net
//ICSharpCode.SharpZipLib
盘古分词类
public class LuceneAnalyze
{
    #region AnalyzerKey
    /// <summary>
    /// Tokenizes a search keyword with the PanGu analyzer by parsing it as a query on the
    /// "title" field and reading the term texts back out of the parsed query.
    /// </summary>
    /// <param name="keyword">Raw keyword to tokenize.</param>
    /// <returns>
    /// The term texts found in the parsed query. An empty array when
    /// <paramref name="keyword"/> is null or blank. When the parsed query is neither a term,
    /// phrase nor boolean query, the original keyword is returned as the only element.
    /// </returns>
    public string[] AnalyzerKey(string keyword)
    {
        // Nothing to tokenize; return before the keyword reaches the escaper and the parser.
        if (string.IsNullOrWhiteSpace(keyword))
        {
            return new string[0];
        }

        Analyzer analyzer = new PanGuAnalyzer();
        QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", analyzer);
        Query query = parser.Parse(this.CleanKeyword(keyword));

        List<string> analyzerWords = new List<string>();
        if (TryCollectTerms(query, analyzerWords))
        {
            return analyzerWords.ToArray();
        }

        BooleanQuery booleanQuery = query as BooleanQuery;
        if (booleanQuery != null)
        {
            // Only direct term/phrase clauses contribute words; other clause types are skipped.
            foreach (BooleanClause clause in booleanQuery.GetClauses())
            {
                TryCollectTerms(clause.Query, analyzerWords);
            }
            return analyzerWords.ToArray();
        }

        // Unrecognized query shape: fall back to the untouched keyword.
        return new string[] { keyword };
    }

    /// <summary>
    /// Appends the term text(s) of a term query or phrase query to <paramref name="words"/>.
    /// </summary>
    /// <param name="query">Query to inspect.</param>
    /// <param name="words">List that receives the term texts.</param>
    /// <returns>true when <paramref name="query"/> was a term or phrase query; otherwise false.</returns>
    private static bool TryCollectTerms(Query query, List<string> words)
    {
        TermQuery termQuery = query as TermQuery;
        if (termQuery != null)
        {
            words.Add(termQuery.Term.Text);
            return true;
        }

        PhraseQuery phraseQuery = query as PhraseQuery;
        if (phraseQuery != null)
        {
            foreach (Term term in phraseQuery.GetTerms())
            {
                words.Add(term.Text);
            }
            return true;
        }

        return false;
    }

    /// <summary>
    /// Lower-cases upper-case AND / OR words in the keyword (leading, trailing, embedded, or the
    /// whole keyword) and then escapes the query-syntax special characters.
    /// </summary>
    /// <param name="keyword">Raw keyword.</param>
    /// <returns>
    /// The cleaned and escaped keyword; an empty string when <paramref name="keyword"/> is null or blank.
    /// </returns>
    private string CleanKeyword(string keyword)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            return string.Empty;
        }

        // One rule is applied per pass, so the loop repeats until no rule matches any more
        // (e.g. "AND AND AND foo" needs several passes).
        bool isClean = false;
        while (!isClean)
        {
            keyword = keyword.Trim();
            if (keyword == "AND" || keyword == "OR")
            {
                // The keyword is nothing but an operator word.
                keyword = keyword.ToLowerInvariant();
            }
            else if (keyword.EndsWith(" AND", System.StringComparison.Ordinal))
            {
                keyword = keyword.Substring(0, keyword.Length - 3) + "and";
            }
            else if (keyword.EndsWith(" OR", System.StringComparison.Ordinal))
            {
                keyword = keyword.Substring(0, keyword.Length - 2) + "or";
            }
            else if (keyword.StartsWith("AND ", System.StringComparison.Ordinal))
            {
                keyword = "and" + keyword.Substring(3);
            }
            else if (keyword.StartsWith("OR ", System.StringComparison.Ordinal))
            {
                keyword = "or" + keyword.Substring(2);
            }
            else if (keyword.Contains(" OR "))
            {
                keyword = keyword.Replace(" OR ", " or ");
            }
            else if (keyword.Contains(" AND "))
            {
                keyword = keyword.Replace(" AND ", " and ");
            }
            else
            {
                isClean = true;
            }
        }

        return QueryParser.Escape(keyword);
    }
    #endregion AnalyzerKey
}


Lucene初始化和搜索的方法

/// <summary>
/// Tokenizes the keyword with <c>LuceneAnalyze.AnalyzerKey</c> and builds a query string
/// that searches the "title" field for the resulting words.
/// </summary>
/// <param name="keyword">Raw keyword to search for.</param>
/// <returns>
/// "title:word*" when exactly one word is produced; otherwise the space-separated list of
/// "title:word" entries (an empty string when no word is produced).
/// </returns>
private string AnalyzerKeyword(string keyword)
{
    StringBuilder queryStringBuilder = new StringBuilder();
    LuceneAnalyze analyzer = new LuceneAnalyze();
    string[] words = analyzer.AnalyzerKey(keyword);
    if (words.Length == 1)
    {
        // The words are spliced back into query syntax, so escape them first; otherwise a
        // word containing a special character would change or break the query.
        queryStringBuilder.AppendFormat("{0}:{1}* ", "title", QueryParser.Escape(words[0]));
    }
    else
    {
        foreach (string word in words)
        {
            queryStringBuilder.AppendFormat("{0}:{1} ", "title", QueryParser.Escape(word));
        }
    }
    // Drop the separator appended after the last entry.
    return queryStringBuilder.ToString().TrimEnd();
}
/// <summary>
/// Opens the on-disk Lucene index folder "LuceneData" under the application's base
/// directory, creating the folder when it does not exist yet.
/// </summary>
/// <returns>The opened <see cref="FSDirectory"/>; this method does not close it.</returns>
private FSDirectory CreateFSDirectory()
{
    // The index is stored on disk here; an in-memory directory could be used instead.
    // Path.Combine inserts the platform separator only when the base directory lacks one.
    string dirPath = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "LuceneData");
    // CreateDirectory does nothing when the folder already exists.
    System.IO.Directory.CreateDirectory(dirPath);
    return FSDirectory.Open(dirPath);
}
/// <summary>
/// Builds the index: reads the rows returned by <c>GetList</c> and writes one document per
/// row (fields "id" and "title") into the index folder, recreating the index from scratch.
/// Does nothing when no rows are returned.
/// </summary>
public void InitIndex()
{
    DataTable dt = GetList();
    if (dt == null || dt.Rows.Count < 1)
    {
        // No data was read, so there is nothing to index.
        return;
    }

    // Dispose the directory together with the writer so the index files are released.
    using (FSDirectory directory = CreateFSDirectory())
    using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED))
    {
        foreach (DataRow dr in dt.Rows)
        {
            // Only id and title are indexed here; more fields (creation time, article
            // content, ...) could be added the same way.
            Document doc = new Document();
            // "id" is stored and indexed as a single un-analyzed value.
            doc.Add(new Field("id", dr[0].ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            // "title" is stored and run through the analyzer.
            doc.Add(new Field("title", dr[1].ToString(), Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }
        // Merge the index segments.
        writer.Optimize();
    }
}
/// <summary>
/// Reads the ID and Title columns of every row of the Article table.
/// </summary>
/// <returns>The first table of the query result, or null when the result holds no table.</returns>
private DataTable GetList()
{
    // Everything is loaded in one go because the table is small; a large table could be
    // processed in batches instead.
    DataSet result = SQLiteHelper.ExecuteQuery("select ID,Title  from Article");
    bool hasTable = result != null && result.Tables.Count > 0;
    return hasTable ? result.Tables[0] : null;
}
/// <summary>
/// Multi-word search: tokenizes the keyword, searches the "title" field and prints the id
/// and title of at most 1000 hits, followed by the total hit count.
/// </summary>
/// <param name="keyword">Raw keyword to search for.</param>
public void SearchData(string keyword)
{
    // At most this many hits are printed, so no more than that are collected.
    // TotalHits still reports every match. Paging could be added here instead.
    const int maxResults = 1000;

    // Dispose the searcher and the directory so the index files are released after the search.
    using (FSDirectory dir = CreateFSDirectory())
    using (IndexSearcher searcher = new IndexSearcher(dir))
    {
        QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer());
        Query query = parser.Parse(AnalyzerKeyword(keyword));
        TopDocs docs = searcher.Search(query, null, maxResults);
        foreach (ScoreDoc sd in docs.ScoreDocs)
        {
            // Only id and title are printed here; further stored fields could be added.
            Document doc = searcher.Doc(sd.Doc);
            Console.WriteLine("***************************************");
            Console.WriteLine(string.Format("id={0}", doc.Get("id")));
            Console.WriteLine(string.Format("title={0}", doc.Get("title")));
        }
        Console.WriteLine($"一共命中{docs.TotalHits}");
    }
}
/// <summary>
/// Single-term search: looks up the exact term "图书馆" in the "title" field and prints the
/// id, title, time, price and content fields of up to 10000 hits, followed by the total hit count.
/// </summary>
public void SingelSearchData()
{
    // Dispose the searcher and the directory so the index files are released after the search.
    using (FSDirectory dir = CreateFSDirectory())
    using (IndexSearcher searcher = new IndexSearcher(dir))
    {
        // Matches documents whose title contains this exact term.
        TermQuery query = new TermQuery(new Term("title", "图书馆"));
        TopDocs docs = searcher.Search(query, null, 10000);
        foreach (ScoreDoc sd in docs.ScoreDocs)
        {
            Document doc = searcher.Doc(sd.Doc);
            Console.WriteLine("***************************************");
            Console.WriteLine(string.Format("id={0}", doc.Get("id")));
            Console.WriteLine(string.Format("title={0}", doc.Get("title")));
            Console.WriteLine(string.Format("time={0}", doc.Get("time")));
            Console.WriteLine(string.Format("price={0}", doc.Get("price")));
            Console.WriteLine(string.Format("content={0}", doc.Get("content")));
        }
        Console.WriteLine("1一共命中了{0}个", docs.TotalHits);
    }
}
/// <summary>
/// Sorted multi-word search: parses the keyword as a query on the "title" field, keeps only
/// documents whose "time" lies in a fixed range, sorts the hits and prints the id, title,
/// time and price of at most 1000 of them, followed by the total hit count.
/// </summary>
/// <param name="keyword">Query text, handed to the query parser as-is.</param>
public void OrderSearchData(string keyword)
{
    // At most this many hits are printed, so no more than that are collected.
    // TotalHits still reports every match. Paging could be added here instead.
    const int maxResults = 1000;

    // Dispose the searcher and the directory so the index files are released after the search.
    using (FSDirectory dir = CreateFSDirectory())
    using (IndexSearcher searcher = new IndexSearcher(dir))
    {
        QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer());
        Query query = parser.Parse(keyword);

        // Keep only documents with 20180000 <= time <= 20181822 (both bounds inclusive).
        // NOTE(review): if "time" is a yyyyMMdd number, the upper bound is not a valid date - confirm.
        NumericRangeFilter<int> timeFilter = NumericRangeFilter.NewIntRange("time", 20180000, 20181822, true, true);

        // The third SortField argument is "reverse": false sorts ascending, true descending.
        SortField sortPrice = new SortField("price", SortField.DOUBLE, false);
        SortField sortTime = new SortField("time", SortField.INT, true);
        // Sort by time first; price is the secondary sort field.
        Sort sort = new Sort(sortTime, sortPrice);

        TopDocs docs = searcher.Search(query, timeFilter, maxResults, sort);
        foreach (ScoreDoc sd in docs.ScoreDocs)
        {
            Document doc = searcher.Doc(sd.Doc);
            Console.WriteLine("***************************************");
            Console.WriteLine(string.Format("id={0}", doc.Get("id")));
            Console.WriteLine(string.Format("title={0}", doc.Get("title")));
            Console.WriteLine(string.Format("time={0}", doc.Get("time")));
            Console.WriteLine(string.Format("price={0}", doc.Get("price")));
        }
        Console.WriteLine("3一共命中了{0}个", docs.TotalHits);
    }
}