当前位置: 首页 > 工具软件 > Ja.Net > 使用案例 >

Lucene.Net C#分词操作帮助类

贾兴学
2023-12-01
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace ReptileJob.Functions
{
    /// <summary>
    /// 分词操作类
    /// </summary>
    public class FenCi
    {
        /// <summary>
        /// Directory on disk where the Lucene index files are stored.
        /// </summary>
        protected string IndexDic => "D://Fenci/SuoYin/IndexDic";

        /// <summary>
        /// Adds one document to the Lucene index stored under <see cref="IndexDic"/>.
        /// The index directory is created when missing, a leftover write lock is cleared,
        /// and a brand-new index is started when no "segments.gen" file exists yet.
        /// </summary>
        /// <param name="dataId">Record identifier, stored un-analyzed in "DataId". Passed to the field as-is (no null default is applied).</param>
        /// <param name="dataTitle">Title text, analyzed into "Title"; null is treated as "".</param>
        /// <param name="dataContent">Body text, analyzed into "DataContent"; null is treated as "".</param>
        /// <param name="code">Business code such as "SRB" (the "business" value passed in by the caller); null is treated as "".</param>
        /// <param name="tableName">Source table name, stored in "TableName"; null is treated as "".</param>
        /// <param name="link">Link stored in "Link"; null is treated as "".</param>
        /// <param name="source">Source label stored in "Source"; null is treated as "".</param>
        /// <param name="catalog">Catalog value stored in "Catalog" (Search sorts and filters on it); null is treated as "".</param>
        /// <param name="type">Type value: "1" = group, "2" = business system; null is treated as "".</param>
        /// <param name="companyNo">Company number, e.g. "001002"; null is treated as "".</param>
        /// <param name="picUrl">Picture URL stored in "PicUrl"; null is treated as "".</param>
        /// <param name="nopop">Value stored in "NoPop"; null is treated as "".</param>
        public void CreateIndex(string dataId, string dataTitle, string dataContent,string code,string tableName,string link,string source,string catalog, string type,string companyNo,string picUrl=null,string nopop=null)
        {
            // Every optional value falls back to the empty string on its OWN variable,
            // so no null ever reaches the Field constructor for these fields.
            dataTitle = dataTitle ?? "";
            dataContent = dataContent ?? "";
            code = code ?? "";
            tableName = tableName ?? "";
            link = link ?? "";
            source = source ?? "";
            catalog = catalog ?? "";
            type = type ?? "";
            companyNo = companyNo ?? "";
            picUrl = picUrl ?? "";
            nopop = nopop ?? "";

            // Make sure the index directory exists before touching it.
            if (!System.IO.Directory.Exists(IndexDic))
            {
                System.IO.Directory.CreateDirectory(IndexDic);
            }

            // Lucene.Net locks the index while writing and unlocks it on Close. If a previous
            // run left the lock behind (e.g. the process died mid-write), clear it first.
            // The lock must be released on the real index directory, i.e. the IndexDic path.
            // NOTE(review): this also removes a lock held by a live writer in another process — confirm single-writer usage.
            if (IndexWriter.IsLocked(IndexDic))
            {
                Lucene.Net.Store.Directory lockedDirectory = FSDirectory.GetDirectory(IndexDic);
                IndexWriter.Unlock(lockedDirectory);
            }

            // Start a new index only when no index has been written to the directory yet.
            bool createNewIndex = !File.Exists(IndexDic + "/segments.gen");

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), createNewIndex, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                Document doc = new Document();
                doc.Add(new Field("DataId", dataId, Field.Store.YES, Field.Index.NOT_ANALYZED));
                // Only the title and the content are tokenized; everything else is stored verbatim.
                doc.Add(new Field("Title", dataTitle, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("DataContent", dataContent, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("AddTime", DateTime.Now.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Code", code, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("TableName", tableName, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Link", link, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Source", source, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Catalog", catalog, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("CompanyNo", companyNo, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("PicUrl", picUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("NoPop", nopop, Field.Store.YES, Field.Index.NOT_ANALYZED));

                writer.AddDocument(doc);
                writer.Optimize();
                writer.Commit();
            }
            finally
            {
                // Closing the writer releases the index write lock. Exceptions propagate
                // unchanged (no catch/rethrow), so the original stack trace is preserved.
                writer.Close();
            }
        }

        /// <summary>
        /// Runs a paged keyword search over the "DataContent" field of the index under
        /// <see cref="IndexDic"/>, sorted by the "Catalog" field, and returns the requested
        /// page with the keywords highlighted in title and content.
        /// </summary>
        /// <param name="keyword">Space-separated search terms; every non-empty term is added as a MUST clause.</param>
        /// <param name="pageIndex">1-based page number.</param>
        /// <param name="pageSize">Number of results per page.</param>
        /// <param name="totalCount">Receives the total number of hits reported for the query; 0 when no search was run.</param>
        /// <param name="catalog">Optional comma-separated catalog values to keep. Of the known values "1".."4", those not listed are excluded.</param>
        /// <returns>The results of the requested page; an empty list when there is no index, no keyword, or no hit in that page.</returns>
        public List<SearchResult> Search(string keyword,int pageIndex,int pageSize, out int totalCount, string catalog = null)
        {
            totalCount = 0;
            List<SearchResult> searchResults = new List<SearchResult>();

            // Make sure the index directory exists.
            if (!System.IO.Directory.Exists(IndexDic))
            {
                System.IO.Directory.CreateDirectory(IndexDic);
            }

            // Nothing to search when no index has been written yet or no keyword was given.
            if (!File.Exists(IndexDic + "/segments.gen") || string.IsNullOrEmpty(keyword))
            {
                return searchResults;
            }

            // Build the whole query before opening the searcher, so a parse failure cannot leak it.
            // NOTE(review): Parse receives the raw term; query-syntax characters in user input may make it throw — confirm desired handling.
            BooleanQuery boolQuery = new BooleanQuery();
            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "DataContent", new PanGuAnalyzer());
            foreach (string keywordItem in keyword.Split(' '))
            {
                if (string.IsNullOrEmpty(keywordItem))
                {
                    continue;
                }

                // MUST: every keyword has to match (SHOULD would accept any single one).
                boolQuery.Add(parser.Parse(keywordItem), BooleanClause.Occur.MUST);
            }

            if (!string.IsNullOrEmpty(catalog))
            {
                // The filter is expressed negatively: exclude each known catalog the caller did not ask for.
                string[] wantedCatalogs = catalog.Split(',');
                foreach (string knownCatalog in new[] { "1", "2", "3", "4" })
                {
                    if (!wantedCatalogs.Contains(knownCatalog))
                    {
                        boolQuery.Add(new TermQuery(new Term("Catalog", knownCatalog)), BooleanClause.Occur.MUST_NOT);
                    }
                }
            }

            IndexSearcher searcher = new IndexSearcher(IndexDic, true);
            try
            {
                Sort sort = new Sort(new SortField("Catalog", SortField.STRING_VAL, false));

                // Ask for every hit up to the end of the requested page.
                TopDocs docs = searcher.Search(boolQuery, null, pageIndex * pageSize, sort);
                totalCount = docs.totalHits;

                // Visit only the hits inside the requested page instead of scanning all hits.
                int firstHit = Math.Max(0, (pageIndex - 1) * pageSize);
                int endHit = Math.Min(pageIndex * pageSize, docs.scoreDocs.Length);
                for (int i = firstHit; i < endHit; i++)
                {
                    Document doc = searcher.Doc(docs.scoreDocs[i].doc);
                    searchResults.Add(ToSearchResult(doc, keyword));
                }

                return searchResults;
            }
            finally
            {
                // Release the index reader held by the searcher.
                searcher.Close();
            }
        }

        /// <summary>
        /// Copies the stored fields of an index document into a <see cref="SearchResult"/>
        /// and wraps keyword occurrences in its title and content with highlight markup.
        /// </summary>
        /// <param name="doc">Document loaded from the index.</param>
        /// <param name="keyword">The raw, space-separated search keywords used for highlighting.</param>
        /// <returns>The populated search result.</returns>
        private SearchResult ToSearchResult(Document doc, string keyword)
        {
            const string highlightOpen = "<font style=\"color: red; font - family:\'Cambria\';\"><b>";
            const string highlightClose = "</b></font>";

            SearchResult searchResult = new SearchResult();
            searchResult.Id = doc.Get("DataId");
            searchResult.Code = doc.Get("Code");
            searchResult.TableName = doc.Get("TableName");
            searchResult.Link = doc.Get("Link");
            searchResult.Source = doc.Get("Source");
            searchResult.CompanyNo = doc.Get("CompanyNo");
            searchResult.PicUrl = doc.Get("PicUrl");
            searchResult.NoPop = doc.Get("NoPop");

            // Typed fields are only parsed when a value was stored; otherwise the property keeps its default.
            string addTime = doc.Get("AddTime");
            if (!string.IsNullOrEmpty(addTime))
            {
                searchResult.AddTime = DateTime.Parse(addTime);
            }

            string type = doc.Get("Type");
            if (!string.IsNullOrEmpty(type))
            {
                searchResult.Type = int.Parse(type);
            }

            string catalog = doc.Get("Catalog");
            if (!string.IsNullOrEmpty(catalog))
            {
                searchResult.Catalog = int.Parse(catalog);
            }

            // Highlighting
            searchResult.Content = SimpleHighLighter(doc.Get("DataContent"), keyword, highlightOpen, highlightClose);
            searchResult.Title = SimpleHighLighter(doc.Get("Title"), keyword, highlightOpen, highlightClose);
            return searchResult;
        }

        /// <summary>
        /// Deletes every indexed document whose "DataId" equals <paramref name="id"/> and whose
        /// "Code" equals <paramref name="business"/>. Per the original note, this is used to
        /// de-duplicate data submitted through the interface by second-level companies.
        /// Does nothing when no index has been written yet.
        /// </summary>
        /// <param name="id">Value matched against the "DataId" field.</param>
        /// <param name="business">Value matched against the "Code" field.</param>
        public void Delete2Company(string id,string business) 
        {
            // Nothing to delete when the index does not exist yet.
            if (!File.Exists(IndexDic + "/segments.gen"))
            {
                return;
            }

            // Both terms must match for a document to be deleted.
            BooleanQuery boolQuery = new BooleanQuery();
            boolQuery.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
            boolQuery.Add(new TermQuery(new Term("Code", business)), BooleanClause.Occur.MUST);

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                writer.DeleteDocuments(boolQuery);
            }
            finally
            {
                // Always close the writer so the index lock is released; exceptions
                // propagate unchanged, keeping their original stack trace.
                writer.Close();
            }
        }

        /// <summary>
        /// De-duplicates business data: deletes every indexed document whose "DataId" equals
        /// <paramref name="id"/> and whose "TableName" equals <paramref name="tableName"/>.
        /// Does nothing when no index has been written yet.
        /// </summary>
        /// <param name="id">Value matched against the "DataId" field.</param>
        /// <param name="tableName">Value matched against the "TableName" field.</param>
        public void Delete(string id, string tableName)
        {
            // Nothing to delete when the index does not exist yet.
            if (!File.Exists(IndexDic + "/segments.gen"))
            {
                return;
            }

            // Both terms must match for a document to be deleted.
            BooleanQuery boolQuery = new BooleanQuery();
            boolQuery.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
            boolQuery.Add(new TermQuery(new Term("TableName", tableName)), BooleanClause.Occur.MUST);

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                writer.DeleteDocuments(boolQuery);
            }
            finally
            {
                // Always close the writer so the index lock is released; exceptions
                // propagate unchanged, keeping their original stack trace.
                writer.Close();
            }
        }

        /// <summary>
        /// De-duplication by link: deletes every indexed document whose "DataId" equals
        /// <paramref name="id"/> and whose "Link" equals <paramref name="link"/>.
        /// Does nothing when no index has been written yet.
        /// </summary>
        /// <param name="id">Value matched against the "DataId" field.</param>
        /// <param name="link">Value matched against the "Link" field.</param>
        /// <param name="flag">
        /// Not read by the method. It only distinguishes this overload from
        /// Delete(string, string); callers must pass it explicitly to select this overload.
        /// </param>
        public void Delete(string id, string link,bool flag=false)
        {
            // Nothing to delete when the index does not exist yet.
            if (!File.Exists(IndexDic + "/segments.gen"))
            {
                return;
            }

            // Both terms must match for a document to be deleted.
            BooleanQuery boolQuery = new BooleanQuery();
            boolQuery.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
            boolQuery.Add(new TermQuery(new Term("Link", link)), BooleanClause.Occur.MUST);

            IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
            try
            {
                writer.DeleteDocuments(boolQuery);
            }
            finally
            {
                // Always close the writer so the index lock is released; exceptions
                // propagate unchanged, keeping their original stack trace.
                writer.Close();
            }
        }


        /// <summary>
        /// Wraps every occurrence of each keyword in <paramref name="p_Body"/> with the given
        /// opening and closing markup. Matching is case-sensitive and done in a single pass,
        /// so markup inserted for one keyword is never re-matched by another keyword.
        /// </summary>
        /// <param name="p_Body">Text to highlight. Returned unchanged when null or empty.</param>
        /// <param name="p_KeyWords">Space-separated keywords. When null or empty, the body is returned unchanged.</param>
        /// <param name="p_Before">Markup inserted before each match.</param>
        /// <param name="p_After">Markup inserted after each match.</param>
        /// <param name="p_MaxLength">Currently not used by the method; kept so existing callers keep compiling.</param>
        /// <returns>The body with all keyword occurrences wrapped in the markup.</returns>
        public string SimpleHighLighter(string p_Body, string p_KeyWords, string p_Before,
            string p_After, int p_MaxLength=0)
        {
            if (string.IsNullOrEmpty(p_Body) || string.IsNullOrEmpty(p_KeyWords))
            {
                return p_Body;
            }

            // Keywords are matched literally (escaped). Longer keywords come first so that,
            // in the alternation, a long keyword wins over a shorter one that is its prefix.
            string[] escapedKeywords = p_KeyWords.Trim().Split(' ')
                .Where(word => !string.IsNullOrEmpty(word))
                .Distinct()
                .OrderByDescending(word => word.Length)
                .Select(word => System.Text.RegularExpressions.Regex.Escape(word))
                .ToArray();

            if (escapedKeywords.Length == 0)
            {
                return p_Body;
            }

            string pattern = string.Join("|", escapedKeywords);

            // A match evaluator (rather than a replacement string) keeps "$" in the markup
            // or in the matched text from being interpreted as a substitution.
            return System.Text.RegularExpressions.Regex.Replace(
                p_Body,
                pattern,
                match => p_Before + match.Value + p_After);
        }


        /// <summary>
        /// Splits the given text into words using the PanGu analyzer.
        /// </summary>
        /// <param name="str">Text to segment; null or empty yields an empty list.</param>
        /// <returns>The term text of every token produced by the analyzer, in stream order.</returns>
        public List<string> GetPanGuWord(string str)
        {
            List<string> words = new List<string>();
            if (string.IsNullOrEmpty(str))
            {
                return words;
            }

            Analyzer analyzer = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));
            try
            {
                Lucene.Net.Analysis.Token token;
                // Next() returns null once the stream is exhausted.
                while ((token = tokenStream.Next()) != null)
                {
                    words.Add(token.TermText());
                }
            }
            finally
            {
                // Release the token stream once all tokens are consumed.
                tokenStream.Close();
            }

            return words;
        }
    }

    /// <summary>
    /// One hit returned by a search; each property mirrors a stored field of the index document.
    /// </summary>
    public class SearchResult
    {
        /// <summary>Value of the "NoPop" index field.</summary>
        public string NoPop { get; set; }

        /// <summary>Value of the "PicUrl" index field.</summary>
        public string PicUrl { get; set; }

        /// <summary>Value of the "DataId" index field.</summary>
        public string Id { get; set; }

        /// <summary>Value of the "Title" index field; search results carry highlight markup here.</summary>
        public string Title { get; set; }

        /// <summary>Value of the "DataContent" index field; search results carry highlight markup here.</summary>
        public string Content { get; set; }

        /// <summary>Parsed value of the "AddTime" index field.</summary>
        public DateTime AddTime { get; set; }

        /// <summary>Value of the "Code" index field.</summary>
        public string Code { get; set; }

        /// <summary>Value of the "TableName" index field.</summary>
        public string TableName { get; set; }

        /// <summary>Value of the "Link" index field.</summary>
        public string Link { get; set; }

        /// <summary>Value of the "Source" index field.</summary>
        public string Source { get; set; }

        /// <summary>Parsed value of the "Type" index field.</summary>
        public int Type { get; set; }

        /// <summary>Parsed value of the "Catalog" index field.</summary>
        public int Catalog { get; set; }

        /// <summary>Value of the "CompanyNo" index field.</summary>
        public string CompanyNo { get; set; }
    }

 

}

 类似资料: