Fixing the tokenizer.texts_to_sequences() encoding problem on the test set
- The corpus is very noisy, so after word segmentation many test-set words never appear in the vocab built from the training set. If we encode with tokenizer.texts_to_sequences(), those unseen words are silently skipped and a lot of information is lost. This note improves on that.
For example:
# training-set vocab:
{1: '了', 2: ',', 3: '~', 4: '么', 5: '气死', 6: '姐姐', 7: '快二是', 8: '阵亡', 9: '吗', 10: '尼玛', 11: '一个半', 12: '小时', 13: '过去', 14: '也', 15: '没', 16: '上车', 17: '妞妞', 18: '啊', 19: '今天', 20: '又', 21: '承办', 22: '一个', 23: '发文', 24: '登记', 25: '文号', 26: '是', 27: '126', 28: '嘻', 29: '哒', 30: '晚安', 31: '哟'}
# training-set sentence:
['气死', '姐姐', '了', ',', '快二是', '阵亡', '了', '吗', ',', '尼玛', ',', '一个半', '小时', '过去', '了', '也', '没', '上车']
# encoded training-set sentence:
[ 5 6 1 2 7 8 1 9 2 10 2 11 12 13 1 14 15 16 0 0 0 0]
# test-set sentence:
['宿舍', '要民汉合宿', '了', '为', '毛', '都', '大三', '了', '还要', '折腾', '我']
# encoded test-set sentence:
[1, 1, 0, 0]
# as you can see, a lot of important information is simply dropped
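For reference, this behaviour is easy to reproduce with the Keras Tokenizer itself. A minimal sketch, where the toy corpora are made up purely for illustration:

from keras.preprocessing.text import Tokenizer

# hypothetical toy corpora (already segmented into word lists)
train_sents = [['气死', '姐姐', '了'], ['晚安', '哟']]
test_sents = [['宿舍', '了', '折腾', '我']]

tok = Tokenizer()
tok.fit_on_texts(train_sents)               # vocab is built from the training set only
print(tok.texts_to_sequences(test_sents))   # e.g. [[3]] -- only '了' survives, unseen words are dropped

Passing oov_token='<UNK>' to Tokenizer keeps unseen words as a single UNK id instead of dropping them, but they still all collapse into one id, which is exactly what the approach below tries to avoid.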
- Solution:
We can enlarge the training-set vocab (up to 100,000 entries) and reserve the remaining slots for words that only appear in the test set. That means tokenizer.texts_to_sequences() can no longer be used, so we build our own tokenizer class:
import os
from keras.preprocessing.sequence import pad_sequences


class WordToken(object):
    def __init__(self, vocab_path, train_corpus, max_vocab_len=100000, max_len=140):
        self.vocab_path = vocab_path
        self.vocab_list = set([wd for sen in train_corpus for wd in sen])
        # build the vocab file from the training corpus, then load word -> id
        self._init_vocab(self.vocab_path, self.vocab_list)
        self.word2id = self._load_vocab(self.vocab_path)
        self.vocab_len = len(self.word2id)
        self.max_len = max_len
        self.max_vocab_len = max_vocab_len

    def _init_vocab(self, vocab_path, vocab_list):
        if not os.path.exists(vocab_path):
            with open(vocab_path, 'w') as f:
                # bilm-tf expects special symbols at the top of the vocab;
                # <UNK> is written first so that unknown words map to id 0
                f.write('<UNK>')
                f.write('\n')
                for word in vocab_list:
                    f.write(word)
                    f.write('\n')

    def _load_vocab(self, vocab_path):
        word2id = {}
        with open(vocab_path, 'r') as f:
            for index, token in enumerate(f.readlines()):
                token = token.rstrip('\n')
                word2id[token] = index
        return word2id

    def sen2token_train(self, train_corpus):
        train_seq = []
        for sen in list(train_corpus):
            train_seq.append([self.word2id[wd] for wd in sen])
        train_seq = pad_sequences(train_seq, maxlen=self.max_len, padding='post')
        return train_seq

    def _update_vocab(self, vocab_path, update_vocab):
        if self.vocab_len < self.max_vocab_len:
            if self.vocab_len + len(update_vocab) > self.max_vocab_len:
                # keep only as many new words as the vocab budget allows
                update_vocab = update_vocab[:self.max_vocab_len - self.vocab_len]
            with open(vocab_path, 'a+') as f:
                for word in update_vocab:
                    f.write(word)
                    f.write('\n')
            print('update vocab!')

    def sen2token_test(self, test_corpus):
        # append test-set words that are missing from the vocab, then reload word2id
        update_vocab = list(set([wd for sen in test_corpus for wd in sen if wd not in self.word2id]))
        self._update_vocab(self.vocab_path, update_vocab)
        self.word2id = self._load_vocab(self.vocab_path)
        test_seq = []
        for sen in list(test_corpus):
            # words still missing (vocab budget exhausted) fall back to 0, i.e. <UNK>
            test_seq.append([self.word2id[wd] if wd in self.word2id else 0 for wd in sen])
        test_seq = pad_sequences(test_seq, maxlen=self.max_len, padding='post')
        return test_seq
# build train_seq and test_seq directly
vocab_path = 'data/vocab.txt'
tk = WordToken(vocab_path, train_data)
train_seq = tk.sen2token_train(train_data)
test_seq = tk.sen2token_test(test_data)
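A quick sanity check, assuming train_data and test_data are lists of token lists like the examples above (the check itself is only illustrative):

# hypothetical sanity check: previously unseen test words now keep their own ids
id2word = {idx: wd for wd, idx in tk.word2id.items()}
first = test_seq[0]
print([id2word[i] for i in first if i != 0])   # recovers the original test tokens, e.g. '宿舍', '折腾'
print(len(tk.word2id))                         # grows toward max_vocab_len as test words are appended

Since the vocab file keeps growing across calls, any downstream embedding layer should be sized to max_vocab_len (here 100,000) rather than to the training vocab alone.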