# If you don't create a dedicated token such as "<UNK>" for words that are not in the corpus, the following can happen:
when seed_text is "I went to dublin", the token list has length 4; when seed_text is "Lawrence went to dublin", it has length 3, because "Lawrence" is not in the dictionary.
seed_text = "I went to dublin"
next_words = 2
for _ in range(next_words):
token_list = tokenizer.texts_to_sequences([seed_text])[0]
print("="*30)
print(len(token_list))
#当seed_text为"I went to dublin"时,长度为4;当seed_text为"Lawrence went to dublin"时,长度为3,因为Lawrence不再字典中
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
predicted = model.predict_classes(token_list, verbose=0)
output_word = ""
for word, index in tokenizer.word_index.items():
if index == predicted:
output_word = word
break
seed_text += " " + output_word
print(seed_text)
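Note that predict_classes only exists on Sequential models and was removed in recent TensorFlow releases. If the call above fails on your version, an equivalent (a minimal sketch, assuming the model ends in a softmax layer) is to take the argmax over model.predict:

import numpy as np

# Take the index of the highest-probability word from the softmax output
predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]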
To avoid this, pass an out-of-vocabulary token such as "<UNK>" via oov_token when initializing the Tokenizer:
oov_tok = "<UNK>"#添加<UNK>
vocab_size = 100 #词典大小
tokenizer = Tokenizer(num_wods = vocab_size,oov_token = oov_tok)
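With oov_token set, unknown words are mapped to the <UNK> index (always index 1 in word_index) instead of being silently dropped, so both seed texts now tokenize to the same length. A quick check (using a hypothetical toy corpus, for illustration only):

corpus = ["I went to dublin"]  # hypothetical toy corpus, for illustration only
tokenizer.fit_on_texts(corpus)

print(tokenizer.word_index[oov_tok])                                # 1: the OOV token always gets index 1
print(tokenizer.texts_to_sequences(["I went to dublin"])[0])        # [2, 3, 4, 5], length 4
print(tokenizer.texts_to_sequences(["Lawrence went to dublin"])[0]) # [1, 3, 4, 5], "Lawrence" -> <UNK>, still length 4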