最近,工作上用到中文分词ansj,先前我是整合到ES里面,但是觉得这样不利于源码分析,所以我们先把源码部署起来:
在线演示:[url]http://ansj.sdapp.cn/demo/seg.jsp [/url]
官网地址:[url]http://www.ansj.org/ [/url]
github:https://github.com/NLPchina/ansj_seg
通过maven引入源码,这里不再赘述。得到结构图如下:
[img]http://dl2.iteye.com/upload/attachment/0104/2441/9271e7bf-9ecc-3103-a50a-9bf6e0d954bf.png[/img]
我们可以发现library.properties就是用来配置词典的,最开始配置如下:
#redress dic file path
ambiguityLibrary=library/ambiguity.dic
#path of userLibrary this is default library
userLibrary=library/default.dic
#set real name
isRealName=true
添加一个词典文件,得到如下所示:
#redress dic file path
ambiguityLibrary=library/ambiguity.dic
#path of defaultLibrary this is default library
defaultLibrary=library/default.dic
#path of userLibrary this is user library
userLibrary=library/userLibrary.dic
#set real name
isRealName=true
个人偏好,把原有的userLibrary改成defaultLibrary,因为我觉得用户自定义词库,可以暂时定义,加入分词,后期维护可以加入默认词库,这样就有了一个升级过程。
把新加的词库读入内存,只修改如下代码:
/**
 * Loads the user-defined dictionary (and supplementary dictionary) into the in-memory trie.
 * Original version: only reads the single path configured in MyStaticValue.userLibrary.
 */
private static void initUserLibrary() {
// TODO Auto-generated method stub
try {
// FOREST is the shared trie all loaded dictionary words are inserted into.
FOREST = new Forest();
// Load the user-defined dictionary from the configured path.
String userLibrary = MyStaticValue.userLibrary;
loadLibrary(FOREST, userLibrary);
} catch (Exception e) {
// TODO Auto-generated catch block
// NOTE(review): failures are only printed, so a missing dictionary is silently tolerated.
e.printStackTrace();
}
}
为:
/**
 * Loads the user-defined dictionary and the supplementary (default) dictionary.
 * Modified version: first loads the default dictionary, then the user's additions,
 * so later user entries supplement the shipped defaults in the same trie.
 */
private static void initUserLibrary() {
// TODO Auto-generated method stub
try {
// FOREST is the shared trie all loaded dictionary words are inserted into.
FOREST = new Forest();
// Load the default (shipped) dictionary first.
String defaultLibrary = MyStaticValue.defaultLibrary;
loadLibrary(FOREST, defaultLibrary);
// Then load the user's newly added dictionary on top of it.
String userLibrary = MyStaticValue.userLibrary;
loadLibrary(FOREST, userLibrary);
} catch (Exception e) {
// TODO Auto-generated catch block
// NOTE(review): failures are only printed, so a missing dictionary is silently tolerated.
e.printStackTrace();
}
}
这里我没有加上类名,是我希望读者自己能够根据debug找到相应的类,还请谅解。
另外,我再把停用词也指出一下:
[img]http://dl2.iteye.com/upload/attachment/0104/2478/24612555-7fab-344f-8555-605015817f32.png[/img]
通过FilterModifWord类调用。
需要修改一下源码:
package org.ansj.util;
import static org.ansj.util.MyStaticValue.LIBRARYLOG;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.ansj.domain.Nature;
import org.ansj.domain.Term;
import org.ansj.library.UserDefineLibrary;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
/*
* 停用词过滤,修正词性到用户词性.
*/
/*
 * Stop-word filtering, and correction of term natures to user-defined ones.
 */
public class FilterModifWord {

    // Stop entries: plain words, plus nature names stored with a leading TAG prefix.
    private static Set<String> FILTER = new HashSet<String>();

    // Prefix that marks a stop-nature entry inside FILTER (distinguishes it from a word).
    private static final String TAG = "#";

    // True once at least one stop nature has been registered via insertStopNatures().
    private static boolean isTag = false;

    static {
        // Load the stop-word dictionary configured in MyStaticValue at class-load time.
        String filePath = MyStaticValue.stopWordsLibrary;
        initStopWordsDic(filePath);
    }

    /**
     * Initializes the stop-word dictionary from a file, or from every *.dic file
     * in a directory.
     *
     * @param stopWordsPath path of a dictionary file or of a directory holding .dic files;
     *                      blank paths are ignored
     */
    private static void initStopWordsDic(String stopWordsPath) {
        if (!StringUtil.isNotBlank(stopWordsPath)) {
            return;
        }
        File file = new File(stopWordsPath);
        if (!file.canRead() || file.isHidden()) {
            LIBRARYLOG.warning("init stopWordsLibrary warning :" + new File(stopWordsPath).getAbsolutePath() + " because : file not found or failed to read !");
            return;
        }
        if (file.isFile()) {
            loadStopWordsFile(file);
        } else if (file.isDirectory()) {
            File[] files = file.listFiles();
            for (int i = 0; i < files.length; i++) {
                // Only dictionary files are loaded from a directory.
                if (files[i].getName().trim().endsWith(".dic")) {
                    loadStopWordsFile(files[i]);
                }
            }
        } else {
            LIBRARYLOG.warning("init stopWordsLibrary error :" + new File(stopWordsPath).getAbsolutePath() + " because : not find that file !");
        }
    }

    /**
     * Loads one stop-word file: one word per line, UTF-8, blank lines skipped.
     *
     * @param file the dictionary file to read
     */
    private static void loadStopWordsFile(File file) {
        if (!file.canRead()) {
            LIBRARYLOG.warning("file in path " + file.getAbsolutePath() + " can not to read!");
            return;
        }
        String temp = null;
        BufferedReader br = null;
        try {
            br = IOUtil.getReader(new FileInputStream(file), "UTF-8");
            while ((temp = br.readLine()) != null) {
                if (StringUtil.isBlank(temp)) {
                    continue;
                }
                insertStopWord(temp);
            }
            LIBRARYLOG.info("init stopWordsLibrary ok path is : " + file.getAbsolutePath());
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            IOUtil.close(br);
            br = null;
        }
    }

    /**
     * Adds a list of stop words.
     *
     * @param filterWords words to filter out of segmentation results
     */
    public static void insertStopWords(List<String> filterWords) {
        FILTER.addAll(filterWords);
    }

    /**
     * Adds one or more stop words.
     *
     * @param filterWord words to filter out of segmentation results
     */
    public static void insertStopWord(String... filterWord) {
        for (String word : filterWord) {
            FILTER.add(word);
        }
    }

    /**
     * Adds one or more stop natures (parts of speech); terms with these natures
     * are dropped by modifResult.
     *
     * @param filterNatures nature names to filter, stored TAG-prefixed
     */
    public static void insertStopNatures(String... filterNatures) {
        isTag = true;
        for (String natureStr : filterNatures) {
            FILTER.add(TAG + natureStr);
        }
    }

    /*
     * Filters stop words and corrects term natures using the global user-defined library.
     */
    public static List<Term> modifResult(List<Term> all) {
        List<Term> result = new ArrayList<Term>();
        try {
            for (Term term : all) {
                // Drop the term if its surface form is a stop word, or (when stop
                // natures are registered) its nature is a stop nature.
                if (FILTER.size() > 0 && (FILTER.contains(term.getName()) || (isTag && FILTER.contains(TAG + term.natrue().natureStr)))) {
                    continue;
                }
                // If the user library defines this word, override its nature.
                String[] params = UserDefineLibrary.getParams(term.getName());
                if (params != null) {
                    term.setNature(new Nature(params[0]));
                }
                result.add(term);
            }
        } catch (Exception e) {
            // TODO Auto-generated catch block
            System.err.println("FilterStopWord.updateDic can not be null , " + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
        }
        return result;
    }

    /*
     * Filters stop words and corrects term natures using the supplied forests.
     */
    public static List<Term> modifResult(List<Term> all, Forest... forests) {
        List<Term> result = new ArrayList<Term>();
        try {
            for (Term term : all) {
                // BUGFIX(consistency): guard the nature lookup with isTag, matching
                // the single-argument overload above.
                if (FILTER.size() > 0 && (FILTER.contains(term.getName()) || (isTag && FILTER.contains(TAG + term.natrue().natureStr)))) {
                    continue;
                }
                // The last forest that defines this word wins the nature override.
                for (Forest forest : forests) {
                    String[] params = UserDefineLibrary.getParams(forest, term.getName());
                    if (params != null) {
                        term.setNature(new Nature(params[0]));
                    }
                }
                result.add(term);
            }
        } catch (Exception e) {
            // TODO Auto-generated catch block
            System.err.println("FilterStopWord.updateDic can not be null , " + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
        }
        return result;
    }
}
package org.ansj.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Logger;
import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.dic.DicReader;
import org.ansj.domain.AnsjItem;
import org.ansj.library.DATDictionary;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
/**
* 这个类储存一些公用变量.
*
* @author ansj
*
*/
/**
 * Holds shared configuration flags, dictionary paths and dictionary readers.
 *
 * @author ansj
 *
 */
public class MyStaticValue {

    public static final Logger LIBRARYLOG = Logger.getLogger("DICLOG");

    // Whether person-name recognition is enabled
    public static boolean isNameRecognition = true;

    private static final Lock LOCK = new ReentrantLock();

    // Whether number recognition is enabled
    public static boolean isNumRecognition = true;

    // Whether numbers and quantifiers are merged
    public static boolean isQuantifierRecognition = true;

    // Lazily-initialized CRF model. Declared volatile so the double-checked
    // locking in getCRFSplitWord() is safe under the Java memory model.
    private static volatile SplitWord crfSplitWord = null;

    public static boolean isRealName = false;

    /**
     * Dictionary paths; a path may point at a single .dic file or at a
     * directory whose .dic files are scanned.
     */
    public static String defaultLibrary = "library/default.dic";
    public static String ambiguityLibrary = "library/ambiguity.dic";
    public static String userLibrary = "library/userLibrary.dic";
    public static String stopWordsLibrary = "src/main/resources/newWord/newWordFilter.dic";

    /**
     * Whether the user dictionary skips words that are already loaded.
     */
    public static boolean isSkipUserDefine = false;

    static {
        /**
         * Read overrides from library.properties on the classpath, if present;
         * otherwise the defaults above are used.
         */
        try {
            ResourceBundle rb = ResourceBundle.getBundle("library");
            if (rb.containsKey("defaultLibrary"))
                defaultLibrary = rb.getString("defaultLibrary");
            if (rb.containsKey("ambiguityLibrary"))
                ambiguityLibrary = rb.getString("ambiguityLibrary");
            // BUGFIX: containsKey checked the misspelled key "userLiberary"
            // while getString used "userLibrary", so a userLibrary entry in
            // library.properties was never applied.
            if (rb.containsKey("userLibrary"))
                userLibrary = rb.getString("userLibrary");
            if (rb.containsKey("stopWordsLibrary"))
                stopWordsLibrary = rb.getString("stopWordsLibrary");
            if (rb.containsKey("isSkipUserDefine"))
                isSkipUserDefine = Boolean.valueOf(rb.getString("isSkipUserDefine"));
            if (rb.containsKey("isRealName"))
                isRealName = Boolean.valueOf(rb.getString("isRealName"));
        } catch (Exception e) {
            LIBRARYLOG.warning("not find library.properties in classpath use it by default !");
        }
    }

    /**
     * Person-name dictionary.
     *
     * @return reader over person/person.dic
     */
    public static BufferedReader getPersonReader() {
        return DicReader.getReader("person/person.dic");
    }

    /**
     * Organization-name dictionary.
     *
     * @return reader over company/company.data
     */
    public static BufferedReader getCompanReader() {
        return DicReader.getReader("company/company.data");
    }

    /**
     * New-word frequency dictionary.
     *
     * @return reader over newWord/new_word_freq.dic
     */
    public static BufferedReader getNewWordReader() {
        return DicReader.getReader("newWord/new_word_freq.dic");
    }

    /**
     * Core dictionary.
     *
     * @return reader over arrays.dic
     */
    public static BufferedReader getArraysReader() {
        return DicReader.getReader("arrays.dic");
    }

    /**
     * Number dictionary.
     *
     * @return reader over numberLibrary.dic
     */
    public static BufferedReader getNumberReader() {
        return DicReader.getReader("numberLibrary.dic");
    }

    /**
     * English dictionary.
     *
     * @return reader over englishLibrary.dic
     */
    public static BufferedReader getEnglishReader() {
        return DicReader.getReader("englishLibrary.dic");
    }

    /**
     * Nature (part-of-speech) map.
     *
     * @return reader over nature/nature.map
     */
    public static BufferedReader getNatureMapReader() {
        return DicReader.getReader("nature/nature.map");
    }

    /**
     * Nature transition table.
     *
     * @return reader over nature/nature.table
     */
    public static BufferedReader getNatureTableReader() {
        return DicReader.getReader("nature/nature.table");
    }

    /**
     * Word-frequency dictionary for single characters in person names.
     *
     * @return reader over person/name_freq.dic
     */
    public static BufferedReader getPersonFreqReader() {
        return DicReader.getReader("person/name_freq.dic");
    }

    /**
     * Deserializes the person-name nature frequency map.
     *
     * @return the deserialized map, or an empty map if loading fails
     */
    @SuppressWarnings("unchecked")
    public static Map<String, int[][]> getPersonFreqMap() {
        InputStream inputStream = null;
        ObjectInputStream objectInputStream = null;
        Map<String, int[][]> map = new HashMap<String, int[][]>(0);
        try {
            // NOTE(review): native Java deserialization of a bundled resource;
            // safe only as long as the data file ships with the library.
            inputStream = DicReader.getInputStream("person/asian_name_freq.data");
            objectInputStream = new ObjectInputStream(inputStream);
            map = (Map<String, int[][]>) objectInputStream.readObject();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } finally {
            try {
                if (objectInputStream != null)
                    objectInputStream.close();
                if (inputStream != null)
                    inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return map;
    }

    /**
     * Loads the word-to-word bigram table into the DAT dictionary items.
     * Each line of bigramdict.dic is "from@to\tfreq".
     */
    public static void initBigramTables() {
        BufferedReader reader = null;
        try {
            reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8");
            String temp = null;
            String[] strs = null;
            int freq = 0;
            while ((temp = reader.readLine()) != null) {
                if (StringUtil.isBlank(temp)) {
                    continue;
                }
                strs = temp.split("\t");
                freq = Integer.parseInt(strs[1]);
                strs = strs[0].split("@");
                AnsjItem fromItem = DATDictionary.getItem(strs[0]);
                AnsjItem toItem = DATDictionary.getItem(strs[1]);
                // "#" entries stand for sentence boundaries rather than words.
                if (fromItem == AnsjItem.NULL && strs[0].contains("#")) {
                    fromItem = AnsjItem.BEGIN;
                }
                if (toItem == AnsjItem.NULL && strs[1].contains("#")) {
                    toItem = AnsjItem.END;
                }
                // Skip pairs where either side is unknown to the core dictionary.
                if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) {
                    continue;
                }
                if (fromItem.bigramEntryMap == null) {
                    fromItem.bigramEntryMap = new HashMap<Integer, Integer>();
                }
                fromItem.bigramEntryMap.put(toItem.index, freq);
            }
        } catch (NumberFormatException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            IOUtil.close(reader);
        }
    }

    /**
     * Returns the default CRF model, loading it lazily on first use
     * (double-checked locking over the volatile crfSplitWord field).
     *
     * @return the shared SplitWord CRF model, or null if loading failed
     */
    public static SplitWord getCRFSplitWord() {
        if (crfSplitWord != null) {
            return crfSplitWord;
        }
        LOCK.lock();
        // NOTE(review): original code returned inside the lock without unlocking;
        // the second check now lives in the try so the finally always releases.
        try {
            if (crfSplitWord != null) {
                return crfSplitWord;
            }
            long start = System.currentTimeMillis();
            LIBRARYLOG.info("begin init crf model!");
            crfSplitWord = new SplitWord(Model.loadModel(DicReader.getInputStream("crf/crf.model")));
            LIBRARYLOG.info("load crf crf use time:" + (System.currentTimeMillis() - start));
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } finally {
            LOCK.unlock();
        }
        return crfSplitWord;
    }
}
测试用例:
package org.ansj.demo;
import java.util.List;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.util.FilterModifWord;
/**
 * Demo: parse a sentence, then run it through the stop-word filter and
 * print both results for comparison.
 */
public class StopWordDemo {

    public static void main(String[] args) {
        // Uncomment to register an extra stop word before parsing:
        // FilterModifWord.insertStopWord("五一");
        List<Term> parsed = NlpAnalysis.parse("your五一,劳动节快乐");
        System.out.println(parsed);
        List<Term> filtered = FilterModifWord.modifResult(parsed);
        System.out.println(filtered);
    }
}
程序猿行业技术生活交流群:181287753(指尖天下),欢迎大伙加入交流学习。