不想为了面试而面试,找实习的事还是顺其自然,每天刷刷题就行,这样整天都在看水题效率极低,也水不了几题。还是得学点有用的东西:
9-8日目标:搞清楚mmseg算法,分别用python和C++实现。
https://github.com/jdeng/mmseg
mmseg算法简介
其关键是:
1.匹配3个词得到的词组长度尽量要长
2.每个词也要尽可能长
3.每个词要尽可能长度接近
4.单个词的词频也要较为接近
python实现:词频直接用dict来保存,词典用Trie树,直接简单粗暴的用树结构。
对照着代码,勉强调通了程序,但有些地方还没理解透彻:
# -*- coding: UTF-8 -*-
import codecs
import sys
from math import log
from collections import defaultdict
class Trie(object):
class TrieNode(object):
def __init__(self):
self.val = 0
self.trans = {}
def __init__(self):
self.root = Trie.TrieNode()
def add(self, word, value=1):
curr_node = self.root
for ch in word:
if ch in curr_node.trans:
curr_node = curr_node.trans[ch]
else:
curr_node.trans[ch] = Trie.TrieNode()
curr_node = curr_node.trans[ch]
curr_node.val = value
# print len(self.root.trans)
def __walk(self, trie_node, ch):
if ch in trie_node.trans:
trie_node = trie_node.trans[ch]
return trie_node, trie_node.val
else:
return None, 0
def match_all(self, word):
"""
返回trie树中的匹配
"""
ret = []
curr_node = self.root
# print word
for ch in word:
curr_node, val = self.__walk(curr_node, ch)
if not curr_node:
break
# print ch, val
if val:
ret.append(val)
# print "match: " + str(ret)
return ret
class Dict(Trie):
def __init__(self, fname):
super(Dict, self).__init__()
self.load(fname)
def load(self, fname):
with codecs.open(fname, 'r', 'utf-8') as f:
for line in f:
word = line.strip()
self.add(word, word)
# print word
# print_root_ch(self.root)
# break
class CharFreqs(defaultdict):
def __init__(self, fname):
super(CharFreqs, self).__init__(lambda: 1) # 词频初始为0
self.load(fname)
def load(self, fname):
with codecs.open(fname, 'r', 'utf-8') as f:
for line in f:
ch, freq = line.strip().split()
self[ch] = freq
class MMSeg(object):
class Chunk(object):
"""
词组
"""
def __init__(self, words, chs):
self.words = words
# 每个词的长度
self.lens = map(lambda x: len(x), words)
# 次的个数
self.length = sum(self.lens)
# 平均长度
self.mean = float(self.length) / len(words)
# 方差
self.var = sum(map(lambda x: (x - self.mean) ** 2, self.lens)) / len(words)
# 单字词频
self.degree = sum([log(float(chs[x])) for x in words if len(x) == 1])
def __str__(self):
return ' '.join(self.words).encode('UTF-8') + "(%f %f %f %f)" % \
(self.length, self.mean, self.var, self.degree)
def __lt__(self, other):
return (self.length, self.mean, -self.var, self.degree) < \
(other.length, other.mean, -other.var, other.degree)
def __init__(self, dic, chs):
self.dic = dic
self.chs = chs
def __get_chunks(self, s, depth=3):
ret = []
def __get_chunks_it(s, num, segs):
if (num == 0 or not s) and segs:
ret.append(MMSeg.Chunk(segs, self.chs))
else:
m = self.dic.match_all(s)
# print "s: " + s
# print m
if not m:
__get_chunks_it(s[1:], num - 1, segs + [s[0]])
for w in m:
__get_chunks_it(s[len(w):], num - 1, segs + [w])
__get_chunks_it(s, depth, [])
return ret
def segment(self, s):
while s:
chunks = self.__get_chunks(s)
best = max(chunks)
yield best.words[0]
s = s[len(best.words[0]):]
def test():
words = ['字段', '找到', '知道']
lens = map(lambda x: len(x), words)
print lens
length = sum(lens)
print length
def print_root_ch(root):
for k in root.trans.keys():
print k,
print ''
def check_exist(root, c):
if c in root.trans:
print u'存在:' + c
def debug_helper(dic):
print_root_ch(dic.root)
check_exist(dic.root, u'一')
check_exist(dic.root, u'简')
root = dic.root.trans[u'简']
print_root_ch(root)
check_exist(root, u'单')
rt = root.trans[u'单']
print rt.val
if __name__ == '__main__':
chs = CharFreqs('dict/data/chars.dic')
dic = Dict('dict/data/words.dic')
mmseg = MMSeg(dic, chs)
# debug_helper(dic)
# 每次分一句话:
str1 = u"简单的正向匹配"
for w in mmseg.segment(str1):
print w
# test()