library(jiebaR)
library(sqldf)
# Load the source table and split every BAK_TXT entry into words with jiebaR.
# Results left in the workspace for the following steps:
#   TA    - the table read from R/table-A.csv, with BAK_TXT as character
#   txtdf - the BAK_TXT column as a character vector
#   wkr   - the jiebaR segmentation worker (default settings)
#   words - every segment from every row, concatenated in row order
TA <- read.csv("R/table-A.csv", header = TRUE, sep = ",")

# Convert before taking the copy so that txtdf is always a plain character
# vector (read.csv may hand back a factor, depending on the R version).
TA$BAK_TXT <- as.character(TA$BAK_TXT)
txtdf <- TA$BAK_TXT

wkr <- worker()

# Segment each text once and flatten at the end; appending to `words` with
# c() inside a loop re-copies the whole vector on every iteration.
words <- unlist(lapply(txtdf, function(txt) segment(txt, wkr)))
# Count the segmented words and keep only those that contain no ASCII
# letters or digits.
# Results left in the workspace:
#   freqrs - jiebaR frequency table of `words` (not used again in the visible
#            part of this script)
#   rs     - table() of word counts
#   rsdf   - `rs` as a data frame with columns `words` (character) and `Freq`
#   chrs   - rows of `rsdf` whose word has no [a-z], [A-Z] or [0-9] character
#   nwords - number of rows in `chrs`
freqrs <- freq(words)

# Sort and count.
rs <- table(words)

# Convert to a data frame; make sure the word column is character, not factor.
rsdf <- as.data.frame(rs)
rsdf$words <- as.character(rsdf$words)

# Debugging aids: Encoding(rsdf$words) shows the declared encoding and
# nchar(chrs$words) the length of each retained word.

# Use a logical mask rather than negative row numbers: when no word contains
# a letter or digit, grep() returns integer(0) and rsdf[-integer(0), ] selects
# zero rows, which would silently discard every word.
has_alnum <- grepl("[a-zA-Z0-9]", rsdf$words)
chrs <- rsdf[!has_alnum, ]

nwords <- nrow(chrs)
# For every retained word in chrs$words, append a column to TA that is named
# after the word and holds "Y" where the row's BAK_TXT contains the word and
# "N" otherwise; then write the widened table to rs-words.csv.
target_words <- as.character(chrs$words)
texts <- as.character(TA$BAK_TXT)

# seq_along()/length guards keep the empty cases safe: 1:0 would iterate over
# c(1, 0) and index a non-existent word.
if (length(target_words) > 0) {
  # Preallocate every flag as "N", one column per word, so nothing is grown
  # inside the loop and TA is extended with a single cbind() afterwards.
  flag_mat <- matrix(
    "N",
    nrow = length(texts),
    ncol = length(target_words),
    dimnames = list(NULL, target_words)
  )
  for (k in seq_along(target_words)) {
    # grepl() is vectorised over the texts. fixed = TRUE matches the word
    # literally, so a segment containing regex metacharacters cannot raise an
    # error or match the wrong rows.
    hit <- grepl(target_words[k], texts, fixed = TRUE)
    flag_mat[hit, k] <- "Y"
  }
  # cbind() on data frames keeps the column names exactly as given.
  TA <- cbind(TA, as.data.frame(flag_mat))
}

write.csv(TA, file = "rs-words.csv")