Fixing the tokenizer.texts_to_sequences() encoding problem on the test set
- The corpus is very noisy, so after word segmentation many test-set words never appear in the vocab built from the training set. If we encode with tokenizer.texts_to_sequences(), those unseen words are silently skipped and a lot of information is lost. This note improves on that.
For example:
# training-set vocab:
{1: '了', 2: ',', 3: '~', 4: '么', 5: '气死', 6: '姐姐', 7: '快二是', 8: '阵亡', 9: '吗', 10: '尼玛', 11: '一个半', 12: '小时', 13: '过去', 14: '也', 15: '没', 16: '上车', 17: '妞妞', 18: '啊', 19: '今天', 20: '又', 21: '承办', 22: '一个', 23: '发文', 24: '登记', 25: '文号', 26: '是', 27: '126', 28: '嘻', 29: '哒', 30: '晚安', 31: '哟'}
# training-set sentence:
['气死', '姐姐', '了', ',', '快二是', '阵亡', '了', '吗', ',', '尼玛', ',', '一个半', '小时', '过去', '了', '也', '没', '上车']
# encoded training-set sentence:
[ 5 6 1 2 7 8 1 9 2 10 2 11 12 13 1 14 15 16 0 0 0 0]
# test-set sentence:
['宿舍', '要民汉合宿', '了', '为', '毛', '都', '大三', '了', '还要', '折腾', '我']
# encoded test-set sentence:
[1, 1, 0, 0]
# as you can see, a lot of important information is simply dropped
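For reference, this behaviour is easy to reproduce with the Keras Tokenizer itself. A minimal sketch, where the toy corpora are made up purely for illustration:

from keras.preprocessing.text import Tokenizer

# hypothetical toy corpora (already segmented into word lists)
train_sents = [['气死', '姐姐', '了'], ['晚安', '哟']]
test_sents = [['宿舍', '了', '折腾', '我']]

tok = Tokenizer()
tok.fit_on_texts(train_sents)               # vocab is built from the training set only
print(tok.texts_to_sequences(test_sents))   # e.g. [[3]] -- only '了' survives, unseen words are dropped

Passing oov_token='<UNK>' to Tokenizer keeps unseen words as a single UNK id instead of dropping them, but they still all collapse into one id, which is exactly what the approach below tries to avoid.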
- Solution:
We can enlarge the training-set vocab (up to 100,000 entries) and reserve the remaining slots for words that only appear in the test set. That means tokenizer.texts_to_sequences() can no longer be used, so we build our own tokenizer class:
import os
from keras.preprocessing.sequence import pad_sequences


class WordToken(object):
    def __init__(self, vocab_path, train_corpus, max_vocab_len=100000, max_len=140):
        self.vocab_path = vocab_path
        self.vocab_list = set([wd for sen in train_corpus for wd in sen])
        # build the vocab file from the training corpus, then load word -> id
        self._init_vocab(self.vocab_path, self.vocab_list)
        self.word2id = self._load_vocab(self.vocab_path)
        self.vocab_len = len(self.word2id)
        self.max_len = max_len
        self.max_vocab_len = max_vocab_len

    def _init_vocab(self, vocab_path, vocab_list):
        if not os.path.exists(vocab_path):
            with open(vocab_path, 'w') as f:
                # bilm-tf expects special symbols at the top of the vocab;
                # <UNK> is written first so that unknown words map to id 0
                f.write('<UNK>')
                f.write('\n')
                for word in vocab_list:
                    f.write(word)
                    f.write('\n')

    def _load_vocab(self, vocab_path):
        word2id = {}
        with open(vocab_path, 'r') as f:
            for index, token in enumerate(f.readlines()):
                token = token.rstrip('\n')
                word2id[token] = index
        return word2id

    def sen2token_train(self, train_corpus):
        train_seq = []
        for sen in list(train_corpus):
            train_seq.append([self.word2id[wd] for wd in sen])
        train_seq = pad_sequences(train_seq, maxlen=self.max_len, padding='post')
        return train_seq

    def _update_vocab(self, vocab_path, update_vocab):
        if self.vocab_len < self.max_vocab_len:
            if self.vocab_len + len(update_vocab) > self.max_vocab_len:
                # keep only as many new words as the vocab budget allows
                update_vocab = update_vocab[:self.max_vocab_len - self.vocab_len]
            with open(vocab_path, 'a+') as f:
                for word in update_vocab:
                    f.write(word)
                    f.write('\n')
            print('update vocab!')

    def sen2token_test(self, test_corpus):
        # append test-set words that are missing from the vocab, then reload word2id
        update_vocab = list(set([wd for sen in test_corpus for wd in sen if wd not in self.word2id]))
        self._update_vocab(self.vocab_path, update_vocab)
        self.word2id = self._load_vocab(self.vocab_path)
        test_seq = []
        for sen in list(test_corpus):
            # words still missing (vocab budget exhausted) fall back to 0, i.e. <UNK>
            test_seq.append([self.word2id[wd] if wd in self.word2id else 0 for wd in sen])
        test_seq = pad_sequences(test_seq, maxlen=self.max_len, padding='post')
        return test_seq
# build train_seq and test_seq directly
vocab_path = 'data/vocab.txt'
tk = WordToken(vocab_path, train_data)
train_seq = tk.sen2token_train(train_data)
test_seq = tk.sen2token_test(test_data)
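A quick sanity check, assuming train_data and test_data are lists of token lists like the examples above (the check itself is only illustrative):

# hypothetical sanity check: previously unseen test words now keep their own ids
id2word = {idx: wd for wd, idx in tk.word2id.items()}
first = test_seq[0]
print([id2word[i] for i in first if i != 0])   # recovers the original test tokens, e.g. '宿舍', '折腾'
print(len(tk.word2id))                         # grows toward max_vocab_len as test words are appended

Since the vocab file keeps growing across calls, any downstream embedding layer should be sized to max_vocab_len (here 100,000) rather than to the training vocab alone.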