mmseg分词算法

云建木

2023-12-01

不想为了面试而面试，找实习的事还是顺其自然，每天刷刷题就行，这样整天都在看水题效率极低，也水不了几题。还是得学点有用的东西：

9-8日目标：搞清楚mmseg算法，分别用python和C++实现。

https://github.com/jdeng/mmseg

mmseg算法简介

其关键是：

1.匹配3个词得到的词组长度尽量要长
2.每个词也要尽可能长
3.每个词要尽可能长度接近
4.单个词的词频也要较为接近

python实现：词频直接用dict来保存，词典用Trie树，直接简单粗暴的用树结构。

对照着代码，勉强调通了程序，但有些地方还没理解透彻：

# -*- coding: UTF-8 -*-

import codecs
import sys
from math import log
from collections import defaultdict


class Trie(object):
    class TrieNode(object):
        def __init__(self):
            self.val = 0
            self.trans = {}

    def __init__(self):
        self.root = Trie.TrieNode()

    def add(self, word, value=1):
        curr_node = self.root
        for ch in word:
            if ch in curr_node.trans:
                curr_node = curr_node.trans[ch]
            else:
                curr_node.trans[ch] = Trie.TrieNode()
                curr_node = curr_node.trans[ch]
        curr_node.val = value

        # print len(self.root.trans)

    def __walk(self, trie_node, ch):
        if ch in trie_node.trans:
            trie_node = trie_node.trans[ch]
            return trie_node, trie_node.val
        else:
            return None, 0

    def match_all(self, word):
        """
        返回trie树中的匹配
        """
        ret = []
        curr_node = self.root

        # print word
        for ch in word:

            curr_node, val = self.__walk(curr_node, ch)
            if not curr_node:
                break

            # print ch, val
            if val:
                ret.append(val)
        # print "match: " + str(ret)
        return ret


class Dict(Trie):
    def __init__(self, fname):
        super(Dict, self).__init__()
        self.load(fname)

    def load(self, fname):
        with codecs.open(fname, 'r', 'utf-8') as f:
            for line in f:
                word = line.strip()
                self.add(word, word)
                # print word
                # print_root_ch(self.root)
                # break

class CharFreqs(defaultdict):
    def __init__(self, fname):
        super(CharFreqs, self).__init__(lambda: 1)  # 词频初始为0
        self.load(fname)

    def load(self, fname):
        with codecs.open(fname, 'r', 'utf-8') as f:
            for line in f:
                ch, freq = line.strip().split()
                self[ch] = freq


class MMSeg(object):
    class Chunk(object):
        """
        词组
        """

        def __init__(self, words, chs):
            self.words = words
            # 每个词的长度
            self.lens = map(lambda x: len(x), words)
            # 次的个数
            self.length = sum(self.lens)
            # 平均长度
            self.mean = float(self.length) / len(words)
            # 方差
            self.var = sum(map(lambda x: (x - self.mean) ** 2, self.lens)) / len(words)
            # 单字词频
            self.degree = sum([log(float(chs[x])) for x in words if len(x) == 1])

        def __str__(self):
            return ' '.join(self.words).encode('UTF-8') + "(%f %f %f %f)" % \
                   (self.length, self.mean, self.var, self.degree)

        def __lt__(self, other):
            return (self.length, self.mean, -self.var, self.degree) < \
                   (other.length, other.mean, -other.var, other.degree)

    def __init__(self, dic, chs):
        self.dic = dic
        self.chs = chs

    def __get_chunks(self, s, depth=3):
        ret = []

        def __get_chunks_it(s, num, segs):
            if (num == 0 or not s) and segs:
                ret.append(MMSeg.Chunk(segs, self.chs))
            else:
                m = self.dic.match_all(s)
                # print "s: " + s
                # print m
                if not m:
                    __get_chunks_it(s[1:], num - 1, segs + [s[0]])
                for w in m:
                    __get_chunks_it(s[len(w):], num - 1, segs + [w])

        __get_chunks_it(s, depth, [])
        return ret

    def segment(self, s):
        while s:
            chunks = self.__get_chunks(s)
            best = max(chunks)
            yield best.words[0]
            s = s[len(best.words[0]):]


def test():
    words = ['字段', '找到', '知道']
    lens = map(lambda x: len(x), words)
    print lens
    length = sum(lens)
    print length


def print_root_ch(root):
    for k in root.trans.keys():
        print k,
    print ''

def check_exist(root, c):
    if c in root.trans:
        print u'存在：' + c

def debug_helper(dic):
    print_root_ch(dic.root)
    check_exist(dic.root, u'一')
    check_exist(dic.root, u'简')
    root = dic.root.trans[u'简']
    print_root_ch(root)
    check_exist(root, u'单')
    rt = root.trans[u'单']
    print rt.val

if __name__ == '__main__':
    chs = CharFreqs('dict/data/chars.dic')
    dic = Dict('dict/data/words.dic')
    mmseg = MMSeg(dic, chs)
    # debug_helper(dic)
    # 每次分一句话:
    str1 = u"简单的正向匹配"
    for w in mmseg.segment(str1):
        print w
        # test()

mmseg分词算法

相关阅读

相关文章

相关问答

相关文档