GitHub地址:https://github.com/yanyiwu/cppjieba
CppJieba是"结巴(Jieba)"中文分词的C++版本
git clone https://github.com/yanyiwu/cppjieba.git
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba$ ls
appveyor.yml build ChangeLog.md CMakeLists.txt deps dict include README_EN.md README.md test
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba$ cd include/
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba/include$ tree
.
└── cppjieba
├── DictTrie.hpp
├── FullSegment.hpp
├── HMMModel.hpp
├── HMMSegment.hpp
├── Jieba.hpp
├── KeywordExtractor.hpp
├── limonp
│ ├── ArgvContext.hpp
│ ├── BlockingQueue.hpp
│ ├── BoundedBlockingQueue.hpp
│ ├── BoundedQueue.hpp
│ ├── Closure.hpp
│ ├── Colors.hpp
│ ├── Condition.hpp
│ ├── Config.hpp
│ ├── FileLock.hpp
│ ├── ForcePublic.hpp
│ ├── LocalVector.hpp
│ ├── Logging.hpp
│ ├── Md5.hpp
│ ├── MutexLock.hpp
│ ├── NonCopyable.hpp
│ ├── StdExtension.hpp
│ ├── StringUtil.hpp
│ ├── Thread.hpp
│ └── ThreadPool.hpp
├── MixSegment.hpp
├── MPSegment.hpp
├── PosTagger.hpp
├── PreFilter.hpp
├── QuerySegment.hpp
├── SegmentBase.hpp
├── SegmentTagged.hpp
├── TextRankExtractor.hpp
├── Trie.hpp
└── Unicode.hpp
2 directories, 35 files
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba$ ls
appveyor.yml build ChangeLog.md CMakeLists.txt deps dict include README_EN.md README.md test
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba$ cd test
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba/test$ ls
CMakeLists.txt demo.cpp load_test.cpp testdata unittest
### 重要!test 目录下 demo.cpp 的内容如下:
1 #include "../include/cppjieba/Jieba.hpp"
2
3 using namespace std;
4
5 const char* const DICT_PATH = "../dict/jieba.dict.utf8";
6 const char* const HMM_PATH = "../dict/hmm_model.utf8";
7 const char* const USER_DICT_PATH = "../dict/user.dict.utf8";
8 const char* const IDF_PATH = "../dict/idf.utf8";
9 const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
10
11 int main(int argc, char** argv) {
12 cppjieba::Jieba jieba(DICT_PATH,
13 HMM_PATH,
14 USER_DICT_PATH,
15 IDF_PATH,
16 STOP_WORD_PATH);
17 vector<string> words;
18 vector<cppjieba::Word> jiebawords;
19 string s;
20 string result;
21
22 s = "他来到了网易杭研大厦";
23 cout << s << endl;
24 cout << "[demo] Cut With HMM" << endl;
25 jieba.Cut(s, words, true);
26 cout << limonp::Join(words.begin(), words.end(), "/") << endl;
27
28 cout << "[demo] Cut Without HMM " << endl;
29 jieba.Cut(s, words, false);
30 cout << limonp::Join(words.begin(), words.end(), "/") << endl;
31
32 s = "我来到北京清华大学";
33 cout << s << endl;
34 cout << "[demo] CutAll" << endl;
35 jieba.CutAll(s, words);
36 cout << limonp::Join(words.begin(), words.end(), "/") << endl;
37
38 s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
39 cout << s << endl;
40 cout << "[demo] CutForSearch" << endl;
41 jieba.CutForSearch(s, words);
42 cout << limonp::Join(words.begin(), words.end(), "/") << endl;
43
44 cout << "[demo] Insert User Word" << endl;
45 jieba.Cut("男默女泪", words);
46 cout << limonp::Join(words.begin(), words.end(), "/") << endl;
47 jieba.InsertUserWord("男默女泪");
48 jieba.Cut("男默女泪", words);
49 cout << limonp::Join(words.begin(), words.end(), "/") << endl;
50
51 cout << "[demo] CutForSearch Word With Offset" << endl;
52 jieba.CutForSearch(s, jiebawords, true);
53 cout << jiebawords << endl;
54
55 cout << "[demo] Lookup Tag for Single Token" << endl;
56 const int DemoTokenMaxLen = 32;
57 char DemoTokens[][DemoTokenMaxLen] = {"拖拉机", "CEO", "123", "。"};
58 vector<pair<string, string> > LookupTagres(sizeof(DemoTokens) / DemoTokenMaxLen);
59 vector<pair<string, string> >::iterator it;
60 for (it = LookupTagres.begin(); it != LookupTagres.end(); it++) {
61 it->first = DemoTokens[it - LookupTagres.begin()];
62 it->second = jieba.LookupTag(it->first);
63 }
64 cout << LookupTagres << endl;
65
66 cout << "[demo] Tagging" << endl;
67 vector<pair<string, string> > tagres;
68 s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
69 jieba.Tag(s, tagres);
70 cout << s << endl;
71 cout << tagres << endl;
72
73 cout << "[demo] Keyword Extraction" << endl;
74 const size_t topk = 5;
75 vector<cppjieba::KeywordExtractor::Word> keywordres;
76 jieba.extractor.Extract(s, keywordres, topk);
77 cout << s << endl;
78 cout << keywordres << endl;
79 return EXIT_SUCCESS;
80 }
jieba.Cut(s, words, true) 各参数的含义:
s:需要分词的目标字符串
words:将分好词的数据放到words里面
true:是否使用HMM算法(true 表示使用,false 表示不使用)
实际使用时,s 为传进来的网页数据,分好的词存放在 words 里面,之后在 words 上进行统计。
dict 是 cppjieba 库的词典目录,进入 dict 目录,文件后缀名代表的是词典的编码方式。比如 filename.utf8 是 utf8 编码方式,filename.gbk 是 gbk 编码方式。
guanhj@guanhj-virtual-machine:~/cppjb/cppjieba/dict$ ls
hmm_model.utf8 idf.utf8 jieba.dict.utf8 pos_dict README.md stop_words.utf8 user.dict.utf8
jieba.dict.utf8:词典文件,分词的时候会用到
4 C# 3 nz
5 c++ 3 nz
6 C++ 3 nz
7 T恤 4 n
8 一 217830 m
9 一一 1670 m
10 一一二 11 m
11 一一例 3 m
12 一一分 8 m
13 一一列举 34 i
14 一一对 9 m
15 一一对应 43 l
16 一一记 2 m
17 一一道来 4 l
18 一丁 18 d
19 一丁不识 3 i
20 一丁点 3 m
21 一丁点儿 24 m
22 一七 22 m
23 一七八不 3 l
24 一万 442 m
25 一万一千 4 m
26 一万一千五百二十颗 2 m
27 一万一千八百八十斤 2 m
28 一万一千多间 2 m
idf.utf8:权重
1 劳动防护 13.900677652
2 生化学 13.900677652
3 奥萨贝尔 13.900677652
4 考察队员 13.900677652
5 岗上 11.5027823792
6 倒车档 12.2912397395
7 编译 9.21854642485
8 蝶泳 11.1926274509
9 外委 11.8212361103
10 故作高深 11.9547675029
11 尉遂成 13.2075304714
12 心源性 11.1926274509
13 现役军人 10.642581114
14 杜勃留 13.2075304714
15 包天笑 13.900677652
16 贾政陪 13.2075304714
17 托尔湾 13.900677652
18 多瓦 12.5143832909
19 多瓣 13.900677652
20 巴斯特尔 11.598092559
21 刘皇帝 12.8020653633
22 亚历山德罗夫 13.2075304714
23 社会公众 8.90346537821
24 五百份 12.8020653633
25 两点阈 12.5143832909
26 多瓶 13.900677652
27 冰天 12.2912397395
28 库布齐 11.598092559
hmm_model.utf8 :用到的相关的算法
(作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
对于 MixSegment(混合 MPSegment 和 HMMSegment 两者),则同时使用 jieba.dict.utf8 和 hmm_model.utf8 这两个词典。)
8 #E:-3.14e+100
9 #M:-3.14e+100
10 #S:-1.4652633398537678
11 #prob_start
12 -0.26268660809250016 -3.14e+100 -3.14e+100 -1.4652633398537678
13 #prob_trans 4x4 matrix
14 -3.14e+100 -0.510825623765990 -0.916290731874155 -3.14e+100
15 -0.5897149736854513 -3.14e+100 -3.14e+100 -0.8085250474669937
16 -3.14e+100 -0.33344856811948514 -1.2603623820268226 -3.14e+100
17 -0.7211965654669841 -3.14e+100 -3.14e+100 -0.6658631448798212
18 #B
19 #E:-0.510825623765990,M:-0.916290731874155
20 #E
21 #B:-0.5897149736854513,S:-0.8085250474669937
22 #M
23 #E:-0.33344856811948514,M:-1.2603623820268226
24 #S
25 #B:-0.7211965654669841,S:-0.6658631448798212
26 #prob_emit 4 lines
27 #B
28 耀:-10.460283,蘄:-11.015514,涉:-8.766406,谈:-8.039065,伊:-7.682602,預:-7.177710,亟:-11.905136,洞:-8.668696,咣:-13.327149,猢:-12.749514,礦:-7.891761,鶩
29 #E
stop_words.utf8 :停用词词典,处理时发现这些词会将其剔除掉(在关键词抽取 KeywordExtractor 中使用)
1 "
2 .
3 。
4 ,
5 、
6 !
7 ?
8 :
9 ;
10 `
11 ﹑
12 •
13 "
14 ^
15 …
16 ‘
17 ’
18 “
19 ”
20 〝
21 〞
22 ~
23
24 ∕
25 |
idf.utf8:
IDF(Inverse Document Frequency)
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。