对于一个软件来讲,如果是开源的,其发展速度往往很快。R 中的 jiebaR 分词包在去年年底就发布了;记得上学那会儿,jieba 包最先出现在 Python 中,没想到在 R 中也能用到。前几天接了点私活,用这个包帮对方做了点东西,没想到做到最后对方不愿意给钱,无良奸商……不过也正好借此熟悉了一下 R 中的 jiebaR 分词包。总体来讲,这个包还是蛮强大的:中文分词很准确,能提取关键字,也能快速上手。下面直接上代码,并与 Python 中的 jieba 包做个对比,看看吧:
library(jiebaRD)
library(jiebaR)
# Attaching jiebaR with library() does not start any segmentation engine;
# an engine is started simply by assigning the result of worker().
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter <- worker()
cutter <= x

# Show the dictionary path.
show_dictpath()

# Edit the user dictionary to add custom words; the engine has to be
# re-created with worker() before the new words take effect.
edit_dict()

cutter <= "D:\\Users\\xuguoxuan694\\Desktop\\新建文本文档.txt"

# Shows the default dictionary path.
show_dictpath()

# Same functionality as `cutter <= x` above.
segment(code = x, jiebar = cutter)
# Re-create the engine with type = "tag".
cutter <- worker(type = "tag")

# ---- Word-frequency analysis script ----------------------------------------

# Attach a package, installing it first when it is not available.
#
# pkg:      package name as a string.
# fail_msg: message printed when the package still cannot be attached
#           after the installation attempt.
#
# Returns (invisibly) whatever library() returns, or the printed message
# when attaching failed.
load_or_install <- function(pkg, fail_msg) {
  # requireNamespace() only reports availability (TRUE/FALSE); it does not
  # attach the package, so library() below remains the single attach point.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  invisible(tryCatch(
    library(pkg, character.only = TRUE),
    error = function(e) print(fail_msg)
  ))
}

load_or_install("jiebaR", "请重新安装jiebaR包")
load_or_install("ggplot2", "请重新安装ggplot2包")
load_or_install("wordcloud", "请重新安装wordcloud包")
# Load the quality records and preview the first rows.
result <- read.csv("C:\\Users\\Administrator\\Desktop\\质量记录.csv")
head(result)
# edit_dict()

# Start a segmentation engine and run it once on a sample sentence.
cutter <- worker()
x <- "众筹项目成功了,众筹绑卡成功了,一切都很顺利"
cutter <= x

# Build the cleaned description column QUALITYDESC_d by deleting, in this
# order: ASCII letters and digits, then three boilerplate phrases.
# gsub() is vectorised, so each pattern is applied to the whole column at
# once; the patterns stay sequential because removing one phrase can expose
# a match for a later one.
stopifnot("QUALITYDESC" %in% names(result))
noise_patterns <- c("[a-zA-Z0-9]", "客户反应", "客户", "反映")
cleaned_desc <- as.character(result$QUALITYDESC)
for (noise_pattern in noise_patterns) {
  cleaned_desc <- gsub(noise_pattern, "", cleaned_desc)
}
result$QUALITYDESC_d <- cleaned_desc
# Normalise synonymous fault wordings so they are counted as one term.
#
# Replacements, applied in this order:
#   "全身没气压" -> "全身气压"
#   "全身不充气" -> "全身充气"
#   "断了"       -> "断裂"
#   "响声"       -> "异响"
#
# x: character vector of descriptions (any length, including zero).
# Returns a character vector of the same length; NA stays NA.
clear_stopwords <- function(x) {
  from <- c("全身没气压", "全身不充气", "断了", "响声")
  to <- c("全身气压", "全身充气", "断裂", "异响")
  # gsub() leaves strings without a match untouched, so no grepl() guard is
  # needed; dropping the guard is also what makes the function vectorised.
  for (i in seq_along(from)) {
    x <- gsub(from[i], to[i], x, fixed = TRUE)
  }
  x
}
# Normalise every description. clear_stopwords() is called one element at a
# time, and vapply() pins the result to a plain character vector.
result$QUALITYDESC_d <- vapply(
  result$QUALITYDESC_d, clear_stopwords, character(1),
  USE.NAMES = FALSE
)

# Segment every description and pool all tokens into one vector. Collecting
# per-description results in a list and flattening once avoids re-copying a
# growing vector on every iteration.
jieba_result <- unlist(
  lapply(
    result$QUALITYDESC_d,
    function(desc) segment(code = desc, jiebar = cutter)
  ),
  use.names = FALSE
)

# Tokens excluded from the frequency table. Entries must match tokens
# exactly, so they must not carry surrounding whitespace.
stopword <- unique(c(
  "不能", "不", "了", "有", "在", "没", "少", "一个", "都", "也", "时", "来",
  "用", "会", "上", "后", "是", "腿", "走", "无", "左", "大", "没有", "就",
  "到", "右", "坏", "部", "不会", "两个",
  "加", "一", "小", "个", "才", "去", "能", "对", "只", "还", "和", "需要",
  "过", "倒", "的", "跟", "已", "掉", "让", "可以", "停", "拨", "亮", "一下",
  "下",
  "其他", "下去", "时候", "使用", "问题", "正常", "一边", "一直", "工作",
  "响", "说", "好", "买", "但是", "一样", "不行", "时有", "夹"
))

# Token frequencies (columns: jieba_result, Freq) with stop words removed.
jjj_result <- as.data.frame(table(jieba_result))
jj_result <- jjj_result[!jjj_result$jieba_result %in% stopword, ]
# Draw the word cloud on a light-yellow background, then put the previous
# graphical parameters back.
op <- par(bg = "lightyellow")
wordcloud(
  jj_result$jieba_result,
  jj_result$Freq,
  scale = c(5, 1),
  min.freq = 4,
  max.words = Inf,
  random.order = FALSE,
  colors = rainbow(length(jj_result$Freq))
)
par(op)
# Frequency table sorted by ascending Freq, so the most frequent tokens are
# in the last rows.
last <- jj_result[order(jj_result$Freq), ]

# Rows of `freq_tbl` holding ranks `from`..`to`, where rank 1 is the most
# frequent token.
#
# freq_tbl: data frame sorted by ascending Freq.
# from, to: first and last rank (inclusive, from <= to).
# Returns the matching rows; fewer rows (possibly none) when the table has
# fewer than `to` entries.
rank_window <- function(freq_tbl, from, to) {
  n <- nrow(freq_tbl)
  if (n < from) {
    return(freq_tbl[0, , drop = FALSE])
  }
  # Rank r lives in row n - r + 1; clamp the lower edge to the first row.
  freq_tbl[seq(max(n - to + 1, 1), n - from + 1), , drop = FALSE]
}

# Horizontal bar chart of token frequencies for ranks `from`..`to`, with the
# frequency written in red next to each bar.
#
# Returns a ggplot object, or NULL (with a message) when the window is empty.
plot_rank_window <- function(freq_tbl, from, to, title) {
  window <- rank_window(freq_tbl, from, to)
  if (nrow(window) == 0) {
    message("No tokens ranked ", from, " to ", to, "; plot skipped.")
    return(NULL)
  }
  ggplot(window, aes(x = reorder(jieba_result, Freq), y = Freq)) +
    geom_bar(stat = "identity", fill = "blue", width = 0.5) +
    geom_text(aes(label = Freq), colour = "red", hjust = 0, vjust = 0.5) +
    coord_flip() +
    theme(
      axis.text.x = element_text(angle = 90, colour = "black"),
      panel.background = element_rect(fill = "transparent", color = "gray")
    ) +
    xlab("分词词汇") +
    ylab("出现频率") +
    ggtitle(title)
}

p <- plot_rank_window(
  last, 1, 30,
  "天凰国际按摩椅质检报告频率最高前30名分析报告分析"
)
p1 <- plot_rank_window(
  last, 31, 60,
  "天凰国际按摩椅质检报告频率最高前30到60名分析报告分析"
)
p2 <- plot_rank_window(
  last, 61, 90,
  "天凰国际按摩椅质检报告频率最高前60到90名分析报告分析"
)

# Print explicitly so the charts are also drawn when the script is source()d.
for (chart in list(p, p1, p2)) {
  if (!is.null(chart)) {
    print(chart)
  }
}
jiebaR 在线分词试用:https://qinwf.shinyapps.io/jiebaR-shiny/
附上 Python 中关键词提取及 LDA 模型的代码:
#encoding:utf-8
'''
Created on 2015年10月25日
@author: Administrator
'''
import pandas as pd
import re
import jieba
import nltk
import jieba.posseg as pseg
from gensim import corpora, models, similarities
# Load the quality records; the CSV is read as GBK.
df = pd.read_csv(u'C:\\Users\\Administrator\\Desktop\\质量记录.csv', encoding='gbk')

# Clean the description column: drop the boilerplate phrases first, then
# ASCII letters, digits and dots. Missing cells become empty strings so that
# re.sub() always receives text.
cont = (
    df['QUALITYDESC']
    .fillna(u'')
    .map(lambda x: re.sub(u'客户反应|客户|反映', '', x))
    .map(lambda x: re.sub(r'[a-zA-Z0-9\.]', '', x))
)

# Load the user-supplied dictionary of additional words.
jieba.load_userdict(u'C:\\Users\\Administrator\\Desktop\\分词.txt')
# One token list per description. Only tokens flagged 'n', 'v' or 'a'
# (presumably noun / verb / adjective in jieba's tag set; confirm) that are
# longer than one character are kept.
nwordall = []
for t in cont:
    words = pseg.cut(t)
    # NOTE(review): every document starts with an empty-string token, as in
    # the original; confirm whether that placeholder is intentional.
    nword = ['']
    for w in words:
        if w.flag in ('n', 'v', 'a') and len(w.word) > 1:
            nword.append(w.word)
    nwordall.append(nword)
# Build a dictionary from the selected tokens.
dictionary = corpora.Dictionary(nwordall)
# print(dictionary.token2id)

# Build the bag-of-words corpus.
corpus = [dictionary.doc2bow(text) for text in nwordall]

# TF-IDF weighting.
tfidf = models.TfidfModel(corpus)
# print(tfidf.dfs)
# print(tfidf.idfs)
corpus_tfidf = tfidf[corpus]

# 4. LDA topic model (can also be used for dimensionality reduction).
# Online training: chunks of 10000 documents, 50 topics, one pass.
lda = models.ldamodel.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=50,
    update_every=1,
    chunksize=10000,
    passes=1,
)

# Print the first 20 topics. print_topic(i) formats topic number i, whereas
# print_topics(i) would request a selection of i topics.
for i in range(0, 20):
    print(lda.print_topic(i))

# Batch training on all data with 100 topics:
# lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=100, update_every=0, passes=20)
# Infer topics for new text with the trained model:
# doc_lda = lda[corpus_tfidf]
# 5. word2vec: word vectors, usable for comparing word similarity, finding
# correspondences and clustering words.
# sentences = models.word2vec.LineSentence(nwordall)
# size: vector dimensionality; window: context window; min_count: words seen
# fewer than 5 times are ignored; workers: number of threads.
# NOTE(review): these calls follow the gensim API this script was written
# for; newer gensim releases may need `vector_size=` and `model.wv[...]` /
# `model.wv.most_similar(...)`. Confirm against the installed version.
model = models.word2vec.Word2Vec(nwordall, size=100, window=5, min_count=5, workers=4)

# Vector representation of one word.
print(model[u'指示灯'])

# Words closest to the given positive examples.
sim = model.most_similar(positive=[u'指示灯', u'灯不亮'])
for s in sim:
    print("word:%s,similar:%s " % (s[0], s[1]))