LibMMSeg 是 Coreseek.com 为 Sphinx 全文搜索引擎设计的中文分词软件包,采用 Chih-Hao Tsai 的 MMSEG 算法,可以参见这里。
下面是为 Python 所做的模块代码,根据 mmseg_main 做了部分修改:
char** segment(const char* dict_path, const char* file_path, int *nseg)
{
char **segs = NULL;
*nseg = 0;
istream *is = new std::ifstream(file_path, ios::in | ios::binary);
auto_ptr<istream> pauto_is(is);
if (! *is) {
return NULL;
}
Segmenter *seg = NULL;
SegmenterManager* mgr = new SegmenterManager();
auto_ptr<SegmenterManager> pauto_mgr(mgr);
if(dict_path) {
if ( mgr->init(dict_path) == 0 )
seg = mgr->getSegmenter();
}
if (!seg) return NULL;
std::string line;
int n = 0;
unsigned long srch,str;
str = currentTimeMillis();
//load data.
int length;
is->seekg (0, ios::end);
length = is->tellg();
is->seekg (0, ios::beg);
char* buffer = new char [length+1];
is->read (buffer,length);
buffer[length] = 0;
//begin seg
seg->setBuffer((u1*)buffer,length);
u2 len = 0, symlen = 0;
//check 1st token.
char txtHead[3] = {239,187,191};
char* tok = (char*)seg->peekToken(len, symlen);
seg->popToken(len);
if(len == 3 && memcmp(tok,txtHead,sizeof(char)*3) == 0){
//check is 0xFEFF
//do nothing
}
else{
//printf("%*.*s/X ",symlen,symlen,tok);
segs = (char**)realloc(segs, (*nseg + 1) * sizeof(char*));
segs[*nseg] = (char*)calloc(symlen, sizeof(char));
memcpy(segs[*nseg], tok, symlen);
++(*nseg);
}
while(1){
len = 0;
char* tok = (char*)seg->peekToken(len,symlen);
if(!tok || !*tok || !len)
break;
seg->popToken(len);
if(*tok == '/r')
continue;
if(*tok == '/n'){
//printf("/n");
continue;
}
//printf("%*.*s/X ",symlen,symlen,tok);
segs = (char**)realloc(segs, (*nseg + 1) * sizeof(char*));
segs[*nseg] = (char*)calloc(symlen, sizeof(char));
memcpy(segs[*nseg], tok, symlen);
++(*nseg);
}
srch = currentTimeMillis() - str;
//printf("/n/nWord Splite took: %d ms./n", srch);
//found out the resultreturn segs;
}
#ifndef NOPYTHON
PyObject* seg_text(PyObject *self, PyObject *args)
{
char *dict_path = NULL;
char *file_path = NULL;
int ok = PyArg_ParseTuple(args, "ss", &dict_path, &file_path);
fprintf(stderr, "Get Dict[%s], File[%s]./n", dict_path, file_path);
PyObject *List = PyList_New(0);
int nseg;
char **segs = segment(dict_path, file_path, &nseg);
int i;
for (i = 0; i < nseg; i++) {
if (segs[i]) {
PyList_Append(List, Py_BuildValue("s", segs[i]));
free(segs[i]);
}
}
if (segs) free(segs);
return List;
}static PyMethodDef Methods[] = {
{ "seg", seg_text, METH_VARARGS, "Seg Text" },
{ NULL, NULL, 0, NULL }
};void initLibPyMMSeg()
{
PyObject *m;
m = Py_InitModule("LibPyMMSeg", Methods);
}