Lucene 5.2.1 + jcseg 1.9.6中文分词索引(Lucene 学习序列2)
jcseg是使用Java开发的一个开源中文分词器,基于流行的mmseg算法实现。它是一款独立的分词组件,并非专门针对Lucene开发,但提供了最新版本Lucene和Solr的分词接口。
Java Code
package com.qiuzhping.lucene;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.lionsoul.jcseg.analyzer.JcsegAnalyzer5X;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
/**
 * Demo: index rows of a {@code student} table with Lucene using the Jcseg
 * Chinese analyzer, then run a keyword search against the {@code name} field.
 *
 * <p>Jcseg is an open-source Chinese word segmenter written in Java that
 * implements the mmseg algorithm and ships analyzer integrations for Lucene,
 * Solr and Elasticsearch. This program was tested with Jcseg 1.9.6 and
 * Lucene 5.2.1. For an introduction to Jcseg see
 * http://www.oschina.net/p/jcseg
 *
 * @author Peter.Qiu
 * @version 2015-07-31
 */
public class LuceneChineseSplit {

    /** Number of rows fetched from the database per query. */
    private static final int PAGE_SIZE = 100000;

    /** Number of pages fetched; at most PAGE_SIZE * PAGE_COUNT rows are indexed. */
    private static final int PAGE_COUNT = 10;

    /** Number of best-scoring hits printed by the search step. */
    private static final int TOP_HITS = 2;

    /**
     * Builds an in-memory index from the database and prints the top hits
     * for a fixed keyword.
     *
     * @param args unused
     * @throws Exception if the database, the index or query parsing fails
     */
    public static void main(String[] args) throws Exception {
        // Both the analyzer and the directory are Closeable; release them on exit.
        try (JcsegAnalyzer5X analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
             Directory directory = new RAMDirectory()) {
            // Optional: tweak the default segmentation settings.
            JcsegTaskConfig config = analyzer.getTaskConfig();
            // Append synonyms to the token stream; requires jcseg.loadsyn=1
            // in jcseg.properties.
            config.setAppendCJKSyn(true);
            // Append pinyin to the token stream; requires jcseg.loadpinyin=1
            // in jcseg.properties.
            config.setAppendCJKPinyin(true);
            // For more options see org.lionsoul.jcseg.core.JcsegTaskConfig.

            long count = buildIndex(directory, analyzer);
            System.out.println("Total record : " + count);

            search(directory, analyzer, "你好");
        }
    }

    /**
     * Reads the {@code student} table page by page and adds one document per row.
     *
     * @param directory index storage to write into
     * @param analyzer  analyzer used to tokenize the {@code name} field
     * @return the number of documents added
     * @throws Exception if obtaining the connection, querying or indexing fails
     *                   (declared broadly because the checked exceptions of
     *                   {@code QueryDataFromDb.getConnection()} are not visible here)
     */
    private static long buildIndex(Directory directory, Analyzer analyzer) throws Exception {
        IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

        long count = 0;
        // try-with-resources closes the statement, connection and writer (in
        // that order) even when a query or addDocument call throws.
        try (IndexWriter writer = new IndexWriter(directory, iwConfig);
             Connection conn = QueryDataFromDb.getConnection();
             Statement st = conn.createStatement()) {
            for (int page = 0; page < PAGE_COUNT; page++) {
                // ORDER BY makes the paging deterministic: without it the
                // database may return rows in a different order per query,
                // so pages could overlap or skip rows.
                String sql = "select * from student order by id limit "
                        + page * PAGE_SIZE + "," + PAGE_SIZE;
                try (ResultSet rows = st.executeQuery(sql)) {
                    while (rows.next()) {
                        writer.addDocument(toDocument(rows));
                        count++;
                    }
                }
            }
            writer.commit();
        }
        return count;
    }

    /**
     * Converts the current row of {@code row} into a Lucene document.
     * {@code id} and {@code math} are indexed verbatim; {@code name} is
     * tokenized by the analyzer. All three values are stored.
     *
     * @param row result set positioned on the row to convert
     * @return the document for that row
     * @throws Exception if a column cannot be read
     */
    private static Document toDocument(ResultSet row) throws Exception {
        Document document = new Document();
        document.add(new StringField("id", row.getString("id"), Field.Store.YES));
        document.add(new TextField("name", row.getString("name"), Field.Store.YES));
        document.add(new StringField("math", row.getString("math"), Field.Store.YES));
        return document;
    }

    /**
     * Parses {@code keyword} against the {@code name} field, prints the
     * parsed query, the hit count, the stored fields of the top hits and the
     * elapsed search time.
     *
     * @param directory index storage to read from
     * @param analyzer  analyzer used to tokenize the keyword
     * @param keyword   text to search for
     * @throws Exception if the index cannot be read or the keyword cannot be parsed
     */
    private static void search(Directory directory, Analyzer analyzer, String keyword)
            throws Exception {
        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);

            // Build the Query with the classic QueryParser; all terms must match.
            QueryParser parser = new QueryParser("name", analyzer);
            parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = parser.parse(keyword);
            System.out.println("Query = " + query);

            // The timer covers only the search and result printing, not parsing.
            long start = System.currentTimeMillis();
            System.out.println("搜索相似度最高的" + TOP_HITS + "条记录");
            TopDocs topDocs = searcher.search(query, TOP_HITS);
            System.out.println("命中:" + topDocs.totalHits);
            for (ScoreDoc hit : topDocs.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                System.out.println("id:" + doc.get("id"));
                System.out.println("name:" + doc.get("name"));
                System.out.println("math:" + doc.get("math"));
            }
            System.out.println("Spend time:" + (System.currentTimeMillis() - start) + " ms");
        }
    }
}
测试结果:
Total record : 1000000
Query = name:你好
搜索相似度最高的2条记录
命中:1000000
id:1
name:你好
math:38
id:2
name:你好
math:21
Spend time:52 ms
代码片段涉及到的jar包:
lucene-analyzers-common-5.2.1.jar
lucene-core-5.2.1.jar
lucene-queryparser-5.2.1.jar
mysql-connector-java-5.1.35.jar
jcseg-analyzer-1.9.6.jar
jcseg-core-1.9.6.jar