R 语言中文分词与词典处理笔记(jiebaR 分词包与 cidian 词典转换包)。
library(jiebaR)

# Build a segmenter with default settings, then tokenize a sentence.
cutter <- worker()
segment("This is a good day!", cutter)
## [1] "This" "is" "a" "good" "day"
1. worker()
# Full worker() signature with its defaults.
# Fixes vs. the original note: removed the stray extra ")" after
# `bylines = F` (unbalanced parentheses), and replaced T/F with
# TRUE/FALSE (T and F are reassignable and not safe to rely on).
worker(type = "mix",        # result type of the segmenter ("mix", "mp", "hmm", ...)
       dict = DICTPATH,     # path to the system dictionary
       hmm = HMMPATH,       # path to the HMM model
       user = USERPATH,     # path to the user dictionary
       idf = IDFPATH,       # IDF dictionary, used for keyword extraction
       stop_word = STOPPATH,# stop-word list; empty by default
       qmax = 20,           # max characters for a word in query mode
       topn = 5,            # number of keywords to return
       encoding = "UTF-8",  # encoding of input files
       detect = TRUE,       # auto-detect input file encoding
       symbol = FALSE,      # keep symbols in the output?
       lines = 1e+05,       # max lines read per batch (chunked reading of big files)
       write = TRUE,        # write segmentation results to a file?
       output = NULL,       # output path
       bylines = FALSE)     # output results line by line?
worker() 主要参数说明:
# Supply a dictionary loaded by cidian as the `user` argument of worker().
user <- cidian::load_user_dict(filePath = "词库路径")
2. 更新分词器
# A worker can be reconfigured in place after creation.
cutter <- worker()          # initialize the segmenter
cutter$bylines <- TRUE      # emit results per line (vector input -> list output)
cutter$symbol <- FALSE      # reset: do not keep symbols
3. 添加新词到已存在的分词器中
# new_user_word(): add new words (with POS tags, default "n") to an existing segmenter.
new_user_word(worker, words, tags = rep("n", length(words)))
# segment(): tokenize the text `code` using the segmenter `jiebar`.
segment(code, jiebar, mod = NULL)
参数:
# tagging(): segment `code` and attach POS tags in one step.
tagging(code, jiebar)
分词+标记
# vector_tag(): attach POS tags to an already-segmented character vector.
vector_tag(string, jiebar)
对已经分好的词进行标记
# Segment a sentence, then tag the resulting tokens with a POS tagger.
words <- "我爱北京天安门"
cutter <- worker()
result <- segment(words, cutter)

tagger <- worker("tag")     # POS tagging worker
vector_tag(result, tagger)  # tag tokens that were segmented beforehand
#> r v ns ns
#> "我" "爱" "北京" "天安门"
# keywords(): segment `code` with `jiebar` and extract the top keywords.
keywords(code, jiebar)
提取关键词
# vector_keywords(): extract keywords from an already-segmented token vector.
vector_keywords(segment, jiebar)
# Keyword extractor that keeps only the single highest-scoring word.
key <- worker("keywords", topn = 1)
keywords("我爱北京天安门", key)
#> 8.9954
#> "天安门"

# Extract keywords from text that has already been tokenized.
cutter <- worker()
result <- segment("我爱北京天安门", cutter)
vector_keywords(result, key)
#> 8.9954
#> "天安门"
# Simhash worker that keeps the top 2 keywords.
simhasher <- worker("simhash", topn = 2)
simhash("江州市长江大桥参加了长江大桥的通车仪式", simhasher)
# Segment both texts, then compute the simhash distance between them.
distance("hello world!", "江州市长江大桥参加了长江大桥的通车仪式", simhasher)

# The vector_* variants take pre-segmented tokens instead of raw text.
tokens <- c("今天", "天气", "真的", "十分", "不错", "的", "感觉")
vector_simhash(tokens, simhasher)
vector_distance(tokens, tokens, simhasher)  # distance between token vectors
# freq(): count word frequencies; `x` is a segmentation result.
freq(x)
x为分词后的结果
cidian 包用来把搜狗细胞词库(.scel)转换为 jiebaR 可用的文本词典。
Windows 下需先安装 RTools 并设置好对应的环境变量,才能从源码编译安装 cidian。
# Install the cidian package from GitHub (requires devtools and a build toolchain).
library(devtools)
install_github("qinwf/cidian")
# Convert a Sogou .scel cell dictionary into a plain-text dictionary file.
decode_scel(scel = "细胞词库路径",output = "输出文件路径")
# cidian dictionary-manipulation API (arguments shown as placeholder strings):
load_user_dict(filePath = "用户词典路径", default_tag = "默认标记")## read a user dictionary
load_sys_dict(filePath = "系统词典路径")## read a system dictionary
add_user_words(dict = "load_user_dict 读取的词典", words = "UTF-8 编码文本向量", tags = "标记")## add words to a user dictionary
add_sys_words(dict = "load_sys_dict 读取的词典", words = "UTF-8 编码文本向量", freq = "词频", tags = "标记")## add words to a system dictionary
remove_words(dict = "load_user_dict 或 load_sys_dict 读取的词典", words = "UTF-8 编码文本向量")## remove words from a dictionary
write_dict(dict = "load_user_dict 或 load_sys_dict 读取的词典", output = "输出路径")## write the dictionary back to disk
# Round-trip example: add a word, then remove words, reloading and
# auto-printing the dictionary after each write (outer parens print).
(userd <- load_user_dict(jiebaR::USERPATH))
userd <- add_user_words(userd, enc2utf8("测试"), "v")
write_dict(userd, jiebaR::USERPATH)
(userd <- load_user_dict(jiebaR::USERPATH))  # the new word is now present
userd <- remove_words(userd, enc2utf8(c("测试", "蓝翔")))
write_dict(userd, jiebaR::USERPATH)
(userd <- load_user_dict(jiebaR::USERPATH))  # verify the removal