indexReader.docFreq(new Term(FIELD, "中国"))
indexReader.maxDoc()
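These two values are exactly what the IDF part of TF-IDF needs; a minimal sketch (the smoothed IDF formula below is one common variant chosen for illustration, not something prescribed by Lucene) is:
// Hedged sketch: derive a smoothed IDF for one term from docFreq() and maxDoc().
int df = indexReader.docFreq(new Term(FIELD, "中国")); // number of documents containing the term
int n = indexReader.maxDoc();                           // total number of documents in the index
double idf = Math.log((double) n / (df + 1)) + 1.0;     // one common smoothed IDF variant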
// Read the term vector of one document and collect per-term frequencies (Lucene 4.x API)
Terms terms = indexReader.getTermVector(docID, TEXT_FIELD);
TermsEnum termsEnum = terms.iterator(null);
BytesRef thisTerm = null;
Map<String, Integer> textTf = new HashMap<String, Integer>();
while ((thisTerm = termsEnum.next()) != null) {
    String termText = thisTerm.utf8ToString();
    DocsEnum docsEnum = termsEnum.docs(null, null);
    while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // freq() is the frequency of termText within this document
        textTf.put(termText, docsEnum.freq());
    }
}
Sort by term frequency:
// Convert the map entries to a list and sort by frequency in descending order
List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(textTf.entrySet());
list.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));
for (Map.Entry<String, Integer> entry : list) {
    System.out.println(entry.getKey() + ": " + entry.getValue());
}
<dependency>
    <groupId>com.huaban</groupId>
    <artifactId>jieba-analysis</artifactId>
    <version>1.0.2</version>
</dependency>
JiebaSegmenter segmenter = new JiebaSegmenter();
// sentenceProcess() returns the segmented words as a list of strings
System.out.println(segmenter.sentenceProcess("中华人民共和国"));
// process() returns SegToken objects that carry each word together with its offsets
for (SegToken token : segmenter.process("中华人民共和国", JiebaSegmenter.SegMode.SEARCH)) {
    System.out.println(token.word);
}
SEARCH mode: used to segment the user's query string.
INDEX mode: used to segment documents at indexing time.
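To see what the two modes actually produce, a minimal comparison sketch (the sample sentence and the loop structure are my own; it assumes the same jieba-analysis classes as above) is:
// Hedged sketch: print the segmentation of the same sentence under INDEX and SEARCH modes.
// Assumes com.huaban.analysis.jieba.JiebaSegmenter and com.huaban.analysis.jieba.SegToken are imported.
JiebaSegmenter segmenter = new JiebaSegmenter();
String sentence = "中华人民共和国"; // arbitrary sample text
for (JiebaSegmenter.SegMode mode : new JiebaSegmenter.SegMode[] {
        JiebaSegmenter.SegMode.INDEX, JiebaSegmenter.SegMode.SEARCH}) {
    StringBuilder sb = new StringBuilder();
    for (SegToken token : segmenter.process(sentence, mode)) {
        sb.append(token.word).append(" / ");
    }
    System.out.println(mode + ": " + sb);
}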
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.7.1</version>
</dependency>
HanLP.Config.ShowTermNature = false; // turn off part-of-speech tags in the output
List<Term> termList = HanLP.segment(string); // string: the text to segment
List<String> wordList = new ArrayList<String>();
for (Term term : termList) {
    wordList.add(term.toString());
}
wordList.removeAll(stopword); // stopword: a collection of stop words to drop
Segmentation: '/'.join(jieba.cut(sentence))
Keyword extraction: jieba.analyse.extract_tags(text, topK=10, withWeight=True)
Stop word removal: jieba.analyse.set_stop_words("./StopWords.txt"), then jieba.analyse.extract_tags(text, topK=10, withWeight=True)
You can use sklearn, gensim, or jieba; below is the sklearn approach.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Whitespace-separated (already segmented) corpus
corpus = ["我 来到 北京 清华大学",
          "他 来到 了 网易 杭研 大厦",
          "小明 硕士 毕业 与 中国 科学院",
          "我 爱 北京 天安门"]

# Approach 1: count term frequencies, then convert the counts to TF-IDF
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
count = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(count)
print(vectorizer.vocabulary_)
print(" ".join([k for k, v in vectorizer.vocabulary_.items()]))
print(count.toarray())
print(tfidf.toarray())

# Approach 2: TfidfVectorizer does both steps in one pass;
# token_pattern keeps single-character tokens, max_df drops overly common terms
tfidf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_df=0.6, stop_words=["我"])
tfidf_matrix = tfidf_vec.fit_transform(corpus)
print(tfidf_vec.vocabulary_)
num_word = {v: k for k, v in tfidf_vec.vocabulary_.items()}  # index -> word
print(" ".join([k for k, v in tfidf_vec.vocabulary_.items()]))
for i in tfidf_matrix.toarray():
    for j in range(len(i)):
        if i[j] != 0.0:
            print(num_word[j], i[j])