using Lucene.Net.Analysis;
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace ReptileJob.Functions
{
/// <summary>
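/// Lucene.Net full-text index helper built on the PanGu word-segmentation analyzer: adds, searches and deletes indexed documents.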
/// </summary>
public class FenCi
{
/// <summary>
/// Directory on disk in which the Lucene index files are stored.
/// </summary>
/// <value>Always the fixed path <c>D://Fenci/SuoYin/IndexDic</c>.</value>
protected string IndexDic => "D://Fenci/SuoYin/IndexDic";
/// <summary>
/// Adds one document to the Lucene index stored under <see cref="IndexDic"/>,
/// creating the directory and the index itself on first use.
/// </summary>
/// <remarks>
/// Every <c>null</c> string argument except <paramref name="dataId"/> is stored as an
/// empty string. A stale write lock on the index directory is removed before writing.
/// The index is optimized and committed after the document has been added.
/// </remarks>
/// <param name="dataId">Identifier of the source record; stored in the "DataId" field. Must not be null.</param>
/// <param name="dataTitle">Title text; stored and analyzed in the "Title" field.</param>
/// <param name="dataContent">Body text; stored and analyzed in the "DataContent" field.</param>
/// <param name="code">Business code (e.g. SRB) — in fact the "business" value passed in by the caller; stored in "Code".</param>
/// <param name="tableName">Name of the source table; stored in "TableName".</param>
/// <param name="link">Link to the source record; stored in "Link".</param>
/// <param name="source">Source label; stored in "Source".</param>
/// <param name="catalog">Catalog value; stored in "Catalog".</param>
/// <param name="type">1 = group, 2 = business system; stored in "Type".</param>
/// <param name="companyNo">Company number, e.g. 001002; stored in "CompanyNo".</param>
/// <param name="picUrl">Optional picture URL; stored in "PicUrl".</param>
/// <param name="nopop">Optional value stored in "NoPop".</param>
/// <exception cref="ArgumentNullException"><paramref name="dataId"/> is null.</exception>
public void CreateIndex(string dataId, string dataTitle, string dataContent,string code,string tableName,string link,string source,string catalog, string type,string companyNo,string picUrl=null,string nopop=null)
{
    // A document without an id could never be targeted by the delete methods,
    // so reject it before touching the index.
    if (dataId == null)
    {
        throw new ArgumentNullException(nameof(dataId));
    }

    // Field values must not be null (the Lucene Field constructor rejects them),
    // so each missing value is normalized to an empty string.
    dataTitle = dataTitle ?? "";
    dataContent = dataContent ?? "";
    code = code ?? "";
    tableName = tableName ?? "";
    link = link ?? "";
    source = source ?? "";
    catalog = catalog ?? "";
    type = type ?? "";
    companyNo = companyNo ?? "";
    picUrl = picUrl ?? "";
    nopop = nopop ?? "";

    string indexPath = IndexDic;

    // Make sure the index directory exists.
    if (!System.IO.Directory.Exists(indexPath))
    {
        System.IO.Directory.CreateDirectory(indexPath);
    }

    // If the index directory is still locked (e.g. the program exited abnormally while
    // indexing), unlock it first. Lucene.Net locks the index automatically before
    // writing and unlocks it when the writer is closed.
    if (IndexWriter.IsLocked(indexPath))
    {
        Lucene.Net.Store.Directory lockedDirectory = FSDirectory.GetDirectory(indexPath);
        IndexWriter.Unlock(lockedDirectory);
    }

    // Create a brand-new index only when no segments file exists yet; otherwise append.
    bool createNewIndex = !File.Exists(indexPath + "/segments.gen");

    IndexWriter writer = new IndexWriter(indexPath, new PanGuAnalyzer(), createNewIndex, IndexWriter.MaxFieldLength.LIMITED);
    try
    {
        Document doc = new Document();
        doc.Add(new Field("DataId", dataId, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Only title and content are tokenized for full-text search;
        // everything else is stored verbatim for exact matching.
        doc.Add(new Field("Title", dataTitle, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("DataContent", dataContent, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("AddTime", DateTime.Now.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Code", code, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("TableName", tableName, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Link", link, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Source", source, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Catalog", catalog, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("CompanyNo", companyNo, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("PicUrl", picUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("NoPop", nopop, Field.Store.YES, Field.Index.NOT_ANALYZED));

        writer.AddDocument(doc);
        writer.Optimize();
        writer.Commit();
    }
    finally
    {
        // Closing the writer releases the index write lock, also when indexing failed.
        // Exceptions propagate unchanged, with their original stack trace.
        writer.Close();
    }
}
/// <summary>
/// Runs a paged keyword search against the "DataContent" field of the index
/// stored under <see cref="IndexDic"/>.
/// </summary>
/// <remarks>
/// Hits are sorted by the "Catalog" field (string order). Keyword occurrences in the
/// returned title and content are wrapped in highlight markup.
/// NOTE(review): keywords are handed to the Lucene query parser unescaped, so
/// query-syntax characters in the input are interpreted by the parser and may fail
/// to parse — confirm this is intended.
/// </remarks>
/// <param name="keyword">Space-separated search terms; every term has to match. Null or empty yields an empty result.</param>
/// <param name="pageIndex">Page number, starting at 1.</param>
/// <param name="pageSize">Number of results per page.</param>
/// <param name="totalCount">Receives the total number of hits for the query (not only the returned page); 0 when no search was run.</param>
/// <param name="catalog">
/// Optional comma-separated list of catalog values to keep. Documents whose "Catalog" is one
/// of the known values "1".."4" but is not listed here are excluded. Null or empty disables the filter.
/// </param>
/// <returns>The results of the requested page; an empty list when there is no index, no keyword or no hit on that page.</returns>
public List<SearchResult> Search(string keyword,int pageIndex,int pageSize, out int totalCount, string catalog = null)
{
    totalCount = 0;
    List<SearchResult> searchResults = new List<SearchResult>();

    // Make sure the index directory exists.
    if (!System.IO.Directory.Exists(IndexDic))
    {
        System.IO.Directory.CreateDirectory(IndexDic);
    }

    // Without a segments file nothing has been indexed yet; without a keyword there is nothing to look for.
    if (!File.Exists(IndexDic + "/segments.gen") || string.IsNullOrEmpty(keyword))
    {
        return searchResults;
    }

    BooleanQuery boolQuery = BuildSearchQuery(keyword, catalog);
    Sort sort = new Sort(new SortField("Catalog", SortField.STRING_VAL, false));

    IndexSearcher searcher = new IndexSearcher(IndexDic, true);
    try
    {
        // Ask for everything up to the end of the requested page; earlier pages are skipped below.
        TopDocs docs = searcher.Search(boolQuery, null, pageIndex * pageSize, sort);
        totalCount = docs.totalHits;

        if (docs.totalHits > 0)
        {
            int first = Math.Max(0, (pageIndex - 1) * pageSize);
            int last = Math.Min(pageIndex * pageSize, docs.scoreDocs.Length);
            for (int i = first; i < last; i++)
            {
                Document doc = searcher.Doc(docs.scoreDocs[i].doc);
                searchResults.Add(ToSearchResult(doc, keyword));
            }
        }
    }
    finally
    {
        // Release the index files held by the searcher, also when the search throws.
        searcher.Close();
    }

    return searchResults;
}

/// <summary>
/// Builds the boolean query for <see cref="Search"/>: one mandatory clause per keyword
/// plus one exclusion clause per known catalog value that was not requested.
/// </summary>
private BooleanQuery BuildSearchQuery(string keyword, string catalog)
{
    BooleanQuery boolQuery = new BooleanQuery();

    // One parser/analyzer pair is enough for all keywords.
    QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "DataContent", new PanGuAnalyzer());
    foreach (string keywordItem in keyword.Split(' '))
    {
        if (!string.IsNullOrEmpty(keywordItem))
        {
            // MUST: every keyword has to match (SHOULD would accept a match on any single keyword).
            boolQuery.Add(parser.Parse(keywordItem), BooleanClause.Occur.MUST);
        }
    }

    if (!string.IsNullOrEmpty(catalog))
    {
        string[] requestedCatalogs = catalog.Split(',');
        string[] knownCatalogs = { "1", "2", "3", "4" };
        foreach (string excludedCatalog in knownCatalogs.Where(x => !requestedCatalogs.Contains(x)))
        {
            boolQuery.Add(new TermQuery(new Term("Catalog", excludedCatalog)), BooleanClause.Occur.MUST_NOT);
        }
    }

    return boolQuery;
}

/// <summary>
/// Copies the stored fields of an index document into a <see cref="SearchResult"/>
/// and highlights the keywords in its title and content.
/// </summary>
private SearchResult ToSearchResult(Document doc, string keyword)
{
    const string highlightBefore = "<font style=\"color: red; font - family:\'Cambria\';\"><b>";
    const string highlightAfter = "</b></font>";

    SearchResult searchResult = new SearchResult();
    searchResult.Id = doc.Get("DataId");
    searchResult.Title = doc.Get("Title");
    searchResult.Content = doc.Get("DataContent");
    searchResult.Code = doc.Get("Code");
    searchResult.TableName = doc.Get("TableName");
    searchResult.Link = doc.Get("Link");
    searchResult.Source = doc.Get("Source");
    searchResult.CompanyNo = doc.Get("CompanyNo");
    searchResult.PicUrl = doc.Get("PicUrl");
    searchResult.NoPop = doc.Get("NoPop");

    // Missing or unparsable values leave the property at its default instead of
    // aborting the whole search.
    DateTime addTime;
    if (DateTime.TryParse(doc.Get("AddTime"), out addTime))
    {
        searchResult.AddTime = addTime;
    }
    int type;
    if (int.TryParse(doc.Get("Type"), out type))
    {
        searchResult.Type = type;
    }
    int catalogValue;
    if (int.TryParse(doc.Get("Catalog"), out catalogValue))
    {
        searchResult.Catalog = catalogValue;
    }

    // Highlight only text that is actually present; a document without the field yields null.
    if (searchResult.Content != null)
    {
        searchResult.Content = SimpleHighLighter(searchResult.Content, keyword, highlightBefore, highlightAfter);
    }
    if (searchResult.Title != null)
    {
        searchResult.Title = SimpleHighLighter(searchResult.Title, keyword, highlightBefore, highlightAfter);
    }

    return searchResult;
}
/// <summary>
/// Deletes every indexed document whose "DataId" equals <paramref name="id"/> and whose
/// "Code" equals <paramref name="business"/>. Used to de-duplicate data when
/// second-level companies call the interface.
/// </summary>
/// <remarks>Does nothing when no index exists yet.</remarks>
/// <param name="id">Value matched exactly against the "DataId" field.</param>
/// <param name="business">Value matched exactly against the "Code" field.</param>
public void Delete2Company(string id,string business)
{
    // Nothing to delete when nothing has been indexed yet.
    if (!File.Exists(IndexDic + "/segments.gen"))
    {
        return;
    }

    IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
    try
    {
        // Both terms have to match for a document to be deleted.
        BooleanQuery matchBoth = new BooleanQuery();
        matchBoth.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
        matchBoth.Add(new TermQuery(new Term("Code", business)), BooleanClause.Occur.MUST);
        writer.DeleteDocuments(matchBoth);
    }
    finally
    {
        // No explicit Commit is issued; the delete is finalized when the writer is closed.
        // Exceptions propagate unchanged, with their original stack trace.
        writer.Close();
    }
}
/// <summary>
/// Deletes every indexed document whose "DataId" equals <paramref name="id"/> and whose
/// "TableName" equals <paramref name="tableName"/>. Used to de-duplicate business data.
/// </summary>
/// <remarks>Does nothing when no index exists yet.</remarks>
/// <param name="id">Value matched exactly against the "DataId" field.</param>
/// <param name="tableName">Value matched exactly against the "TableName" field.</param>
public void Delete(string id, string tableName)
{
    // Nothing to delete when nothing has been indexed yet.
    if (!File.Exists(IndexDic + "/segments.gen"))
    {
        return;
    }

    IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
    try
    {
        // Both terms have to match for a document to be deleted.
        BooleanQuery matchBoth = new BooleanQuery();
        matchBoth.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
        matchBoth.Add(new TermQuery(new Term("TableName", tableName)), BooleanClause.Occur.MUST);
        writer.DeleteDocuments(matchBoth);
    }
    finally
    {
        // No explicit Commit is issued; the delete is finalized when the writer is closed.
        // Exceptions propagate unchanged, with their original stack trace.
        writer.Close();
    }
}
/// <summary>
/// Deletes every indexed document whose "DataId" equals <paramref name="id"/> and whose
/// "Link" equals <paramref name="link"/>. Used for de-duplication.
/// </summary>
/// <remarks>Does nothing when no index exists yet.</remarks>
/// <param name="id">Value matched exactly against the "DataId" field.</param>
/// <param name="link">Value matched exactly against the "Link" field.</param>
/// <param name="flag">
/// Not read by the method body. A call with only two string arguments binds to the
/// (id, tableName) overload, so this argument has to be passed explicitly to reach this one.
/// </param>
public void Delete(string id, string link,bool flag=false)
{
    // Nothing to delete when nothing has been indexed yet.
    if (!File.Exists(IndexDic + "/segments.gen"))
    {
        return;
    }

    IndexWriter writer = new IndexWriter(IndexDic, new PanGuAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
    try
    {
        // Both terms have to match for a document to be deleted.
        BooleanQuery matchBoth = new BooleanQuery();
        matchBoth.Add(new TermQuery(new Term("DataId", id)), BooleanClause.Occur.MUST);
        matchBoth.Add(new TermQuery(new Term("Link", link)), BooleanClause.Occur.MUST);
        writer.DeleteDocuments(matchBoth);
    }
    finally
    {
        // No explicit Commit is issued; the delete is finalized when the writer is closed.
        // Exceptions propagate unchanged, with their original stack trace.
        writer.Close();
    }
}
/// <summary>
/// Wraps every occurrence of each keyword in <paramref name="p_Body"/> with
/// <paramref name="p_Before"/> and <paramref name="p_After"/>.
/// </summary>
/// <remarks>
/// Matching is case-sensitive and literal. All keywords are replaced in a single pass,
/// so a keyword can never match inside markup that was inserted for another keyword.
/// </remarks>
/// <param name="p_Body">Text to highlight. Null or empty is returned unchanged.</param>
/// <param name="p_KeyWords">Space-separated keywords. Null, empty or blank leaves the text unchanged.</param>
/// <param name="p_Before">Markup inserted in front of each match.</param>
/// <param name="p_After">Markup inserted behind each match.</param>
/// <param name="p_MaxLength">Currently not used by the implementation.</param>
/// <returns>The text with all keyword occurrences wrapped.</returns>
public string SimpleHighLighter(string p_Body, string p_KeyWords, string p_Before,
    string p_After, int p_MaxLength=0)
{
    if (string.IsNullOrEmpty(p_Body) || string.IsNullOrEmpty(p_KeyWords))
    {
        return p_Body;
    }

    // Longest keywords first, so that "abc" wins over "ab" at the same position.
    // Escaping makes regex metacharacters in a keyword match literally.
    string[] escapedKeywords = p_KeyWords.Trim()
        .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .Distinct()
        .OrderByDescending(word => word.Length)
        .Select(word => System.Text.RegularExpressions.Regex.Escape(word))
        .ToArray();
    if (escapedKeywords.Length == 0)
    {
        return p_Body;
    }

    string pattern = string.Join("|", escapedKeywords);

    // A match evaluator (rather than a replacement string) keeps '$' in the markup literal.
    return System.Text.RegularExpressions.Regex.Replace(
        p_Body,
        pattern,
        match => p_Before + match.Value + p_After);
}
/// <summary>
/// Splits user input into words with the PanGu analyzer.
/// </summary>
/// <param name="str">Text to segment. Null or empty yields an empty list.</param>
/// <returns>The term text of every token produced by the analyzer, in the order the analyzer emits them.</returns>
public List<string> GetPanGuWord(string str)
{
    List<string> words = new List<string>();
    if (string.IsNullOrEmpty(str))
    {
        return words;
    }

    Analyzer analyzer = new PanGuAnalyzer();
    // The field name is irrelevant here, hence the empty string.
    TokenStream tokenStream = analyzer.TokenStream("", new StringReader(str));
    try
    {
        Lucene.Net.Analysis.Token token;
        while ((token = tokenStream.Next()) != null)
        {
            words.Add(token.TermText());
        }
    }
    finally
    {
        // Release the stream (and the reader it wraps) even when tokenizing fails.
        tokenStream.Close();
    }

    return words;
}
}
/// <summary>
/// One search hit: a plain data holder that FenCi.Search fills from the stored
/// fields of an indexed document.
/// </summary>
public class SearchResult
{
/// <summary>Value of the stored "NoPop" field.</summary>
public string NoPop { get; set; }
/// <summary>Value of the stored "PicUrl" field.</summary>
public string PicUrl { get; set; }
/// <summary>Value of the stored "DataId" field.</summary>
public string Id { get; set; }
/// <summary>Value of the stored "Title" field; FenCi.Search wraps keyword occurrences in highlight markup.</summary>
public string Title { get; set; }
/// <summary>Value of the stored "DataContent" field; FenCi.Search wraps keyword occurrences in highlight markup.</summary>
public string Content { get; set; }
/// <summary>Parsed from the stored "AddTime" text (written as the current time when the document was indexed); left at its default when that text is empty.</summary>
public DateTime AddTime { get; set; }
/// <summary>Value of the stored "Code" field.</summary>
public string Code { get; set; }
/// <summary>Value of the stored "TableName" field.</summary>
public string TableName { get; set; }
/// <summary>Value of the stored "Link" field.</summary>
public string Link { get; set; }
/// <summary>Value of the stored "Source" field.</summary>
public string Source { get; set; }
/// <summary>Parsed from the stored "Type" text (documented on FenCi.CreateIndex as 1 = group, 2 = business system); left at 0 when that text is empty.</summary>
public int Type { get; set; }
/// <summary>Parsed from the stored "Catalog" text; left at 0 when that text is empty.</summary>
public int Catalog { get; set; }
/// <summary>Value of the stored "CompanyNo" field.</summary>
public string CompanyNo { get; set; }
}
}