# Run script to generate the reverse toy dataset
# The generated data is stored in data/toy_reverse by default
scripts/toy.sh
Inspecting toy.sh: it runs the Python generation script, which calls e.g. generate_dataset(toy_dir, 'test', 1000) to create the test data (each line pairs a sequence with its reverse, e.g. source 1 2 4, target 4 2 1).
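A minimal sketch (not the repository's actual script) of how such a reverse toy set could be generated, assuming the generate_dataset(dir, name, size) signature mentioned above:

import os
import random

def generate_dataset(root_dir, name, size):
    # Write `size` lines of "sequence<TAB>reversed sequence" to root_dir/name/data.txt
    path = os.path.join(root_dir, name)
    if not os.path.exists(path):
        os.makedirs(path)
    with open(os.path.join(path, 'data.txt'), 'w') as f:
        for _ in range(size):
            seq = [str(random.randint(0, 9)) for _ in range(random.randint(1, 10))]
            f.write(' '.join(seq) + '\t' + ' '.join(reversed(seq)) + '\n')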
TRAIN_PATH=data/toy_reverse/train/data.txt
DEV_PATH=data/toy_reverse/dev/data.txt
# Start training
python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH
Running `from torch.optim.lr_scheduler import StepLR` failed with: No module named lr_scheduler.
Changing it to `from torch.optim import lr_scheduler` still failed: cannot import lr_scheduler. Both errors suggest the installed PyTorch release simply does not ship torch.optim.lr_scheduler.
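A version-tolerant workaround (a sketch, not from the original code) is to guard the import, since the scheduler is optional in this example:

try:
    from torch.optim.lr_scheduler import StepLR  # present in newer PyTorch releases
except ImportError:
    StepLR = None  # older PyTorch: fall back to training without a learning-rate scheduler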
In the end, changing the import to `from torch import optim` (and not using the scheduler) ran successfully; the training log follows:
2017-11-05 18:02:38,303 root INFO Namespace(dev_path='data/toy_reverse/dev/data.txt', expt_dir='./experiment', load_checkpoint=None, log_level='info', resume=False, train_path='data/toy_reverse/train/data.txt')
2017-11-05 18:02:46,009 seq2seq.trainer.supervised_trainer INFO Optimizer: <torch.optim.adam.Adam object at 0x3521c10>, Scheduler: None
2017-11-05 18:02:49,285 seq2seq.trainer.supervised_trainer INFO Progress: 1%, Train Perplexity: 13.2855 ...
2017-11-05 18:03:01,961 seq2seq.trainer.supervised_trainer INFO Finished epoch 1: Train Perplexity: 7.0578, Dev Perplexity: 156.0297, Accuracy: 0.4440
2017-11-05 18:03:02,061 seq2seq.trainer.supervised_trainer INFO Progress: 17%, Train Perplexity: 771.4534 ...
2017-11-05 18:03:12,004 seq2seq.trainer.supervised_trainer INFO Progress: 33%, Train Perplexity: 1.0049
2017-11-05 18:03:12,811 seq2seq.trainer.supervised_trainer INFO Finished epoch 2: Train Perplexity: 30.4321, Dev Perplexity: 40.3084, Accuracy: 0.5819
2017-11-05 18:03:12,867 seq2seq.trainer.supervised_trainer INFO Progress: 33%, Train Perplexity: 81.9282
2017-11-05 18:03:12,980 seq2seq.trainer.supervised_trainer INFO Progress: 34%, Train Perplexity: 5.6954 ...
2017-11-05 18:03:23,701 seq2seq.trainer.supervised_trainer INFO Progress: 49%, Train Perplexity: 1.0036
2017-11-05 18:03:24,764 seq2seq.trainer.supervised_trainer INFO Finished epoch 3: Train Perplexity: 5.5954, Dev Perplexity: 1.0226, Accuracy: 0.9917
2017-11-05 18:03:24,781 seq2seq.trainer.supervised_trainer INFO Progress: 50%, Train Perplexity: 1.1207
2017-11-05 18:03:24,908 seq2seq.trainer.supervised_trainer INFO Progress: 50%, Train Perplexity: 1.1238 ...
2017-11-05 18:04:05,345 seq2seq.trainer.supervised_trainer INFO Progress: 99%, Train Perplexity: 1.0001
2017-11-05 18:04:05,830 seq2seq.trainer.supervised_trainer INFO Progress: 99%, Train Perplexity: 1.0001
2017-11-05 18:04:07,324 seq2seq.trainer.supervised_trainer INFO Finished epoch 6: Train Perplexity: 1.0521, Dev Perplexity: 339.5657, Accuracy: 0.5387
It will take about 3 minutes to train on CPU and less than 1 minute with a Tesla K80. Once training is complete, you will be prompted to enter a new sequence to translate and the model will print out its prediction (use ctrl-C to terminate). Try the example below!
Input: 1 3 5 7 9
Expected output: 9 7 5 3 1 EOS
② examples/sample.py
import os
import argparse
import logging
import torch
from torch.optim.lr_scheduler import StepLR # learning-rate scheduler from torch.optim.lr_scheduler
import torchtext
import seq2seq # the seq2seq package directory
from seq2seq.trainer import SupervisedTrainer # seq2seq/trainer/__init__.py contains: from .supervised_trainer import SupervisedTrainer
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq # three separate .py files
from seq2seq.loss import Perplexity # perplexity, computed from the loss [http://blog.csdn.net/jiaqiang_ruan/article/details/77989459]
from seq2seq.optim import Optimizer
from seq2seq.dataset import SourceField, TargetField
from seq2seq.evaluator import Predictor
from seq2seq.util.checkpoint import Checkpoint
try:
raw_input # Python 2: the built-in raw_input reads a line from the console (defined here so the script works on both Python versions)
except NameError:
raw_input = input # Python 3
# Sample usage:
# # training
# python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH --expt_dir $EXPT_PATH
# # resume from the latest checkpoint of an experiment
# python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH --expt_dir $EXPT_PATH --resume
# # resume from a specific checkpoint
# python examples/sample.py --train_path $TRAIN_PATH --dev_path $DEV_PATH --expt_dir $EXPT_PATH --load_checkpoint $CHECKPOINT_DIR
parser = argparse.ArgumentParser()
parser.add_argument('--train_path', action='store', dest='train_path',
help='Path to train data') # no default, so $TRAIN_PATH and $DEV_PATH must be supplied on the command line
parser.add_argument('--dev_path', action='store', dest='dev_path',
help='Path to dev data') # path to the dev data
parser.add_argument('--expt_dir', action='store', dest='expt_dir', default='./experiment',
help='Path to experiment directory. If load_checkpoint is True, then path to checkpoint directory has to be provided') # experiment directory
parser.add_argument('--load_checkpoint', action='store', dest='load_checkpoint',
help='The name of the checkpoint to load, usually an encoded time string') # name of the checkpoint to load
parser.add_argument('--resume', action='store_true', dest='resume', default=False,
help='Indicates if training has to be resumed from the latest checkpoint') # whether to resume training from the latest checkpoint
parser.add_argument('--log-level', dest='log_level', default='info',
help='Logging level.')
opt = parser.parse_args() # build the parser, add the arguments, then parse the command line into opt
LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=getattr(logging, opt.log_level.upper()))
logging.info(opt)
if opt.load_checkpoint is not None: # if a checkpoint name is given, load the model from it
logging.info("loading checkpoint from {}".format( os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint )))
checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)
checkpoint = Checkpoint.load(checkpoint_path)
seq2seq = checkpoint.model
input_vocab = checkpoint.input_vocab
output_vocab = checkpoint.output_vocab
else: # otherwise, prepare the dataset
src = SourceField() # from field.py in seq2seq/dataset ③; each line of train/data.txt has the form "1 2 3<TAB>3 2 1", i.e. a source sequence and its target
tgt = TargetField() # the two instantiated Field objects
max_len = 50
def len_filter(example):
return len(example.src) <= max_len and len(example.tgt) <= max_len
train = torchtext.data.TabularDataset( # https://github.com/pytorch/text/blob/master/torchtext/data/dataset.py
path=opt.train_path, format='tsv', # build a tabular dataset from the file at the given path (in the given format)
fields=[('src', src), ('tgt', tgt)], # (name, Field) pairs describing the columns
filter_pred=len_filter # predicate returning a bool; examples longer than max_len are dropped
)
dev = torchtext.data.TabularDataset( # same construction for the dev set
path=opt.dev_path, format='tsv',
fields=[('src', src), ('tgt', tgt)],
filter_pred=len_filter
)
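A quick look at one parsed example (a sketch; the actual tokens depend on data.txt, and the tgt field already carries the <sos>/<eos> markers added by TargetField, see ③ below):

ex = train.examples[0]
print(ex.src)  # e.g. ['1', '2', '3']
print(ex.tgt)  # e.g. ['<sos>', '3', '2', '1', '<eos>']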
src.build_vocab(train, max_size=50000) # build the vocabulary objects ④ https://github.com/pytorch/text/blob/master/torchtext/data/field.py
tgt.build_vocab(train, max_size=50000) # 50000 is only passed through to the Vocab constructor as max_size
input_vocab = src.vocab # ④ the Vocab built in the previous step
output_vocab = tgt.vocab
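To check what build_vocab produced, the Vocab objects can be inspected directly (a sketch; the exact indices depend on token frequencies):

print(len(input_vocab))       # vocabulary size: the ten digits plus special tokens such as <unk> and <pad>
print(input_vocab.stoi['1'])  # stoi maps a token string to its integer index
print(input_vocab.itos[:5])   # itos is the reverse list: index -> token string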
# seq2seq.src_field_name = 'src' # seq2seq.tgt_field_name = 'tgt'
# Prepare the loss function
weight = torch.ones(len(tgt.vocab)) # a vector of ones, one weight per target-vocabulary token
pad = tgt.vocab.stoi[tgt.pad_token] # index of the padding token
loss = Perplexity(weight, pad) # seq2seq.loss.Perplexity ⑤
if torch.cuda.is_available(): # if CUDA is available, do the loss computation on the GPU
loss.cuda()
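For reference, perplexity is just the exponential of the average per-token negative log-likelihood, which is why values near 1.0 in the log above mean the model has essentially solved the toy task. A worked example with a made-up NLL value:

import math
avg_nll = 1.95            # hypothetical average negative log-likelihood per token
print(math.exp(avg_nll))  # ~7.03, in the range of the early "Train Perplexity" values above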
seq2seq = None
optimizer = None
if not opt.resume: # not resuming from a checkpoint, so build a fresh model
# Initialize the model
hidden_size=128
bidirectional = True # bidirectional encoder
encoder = EncoderRNN(len(src.vocab), max_len, hidden_size, bidirectional=bidirectional, variable_lengths=True) # ⑥ variable_lengths: use a variable-length (packed) RNN
decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size * 2 if bidirectional else hidden_size,
dropout_p=0.2, use_attention=True, bidirectional=bidirectional, eos_id=tgt.eos_id, sos_id=tgt.sos_id)
seq2seq = Seq2seq(encoder, decoder) # wrap the encoder and decoder into one model
if torch.cuda.is_available(): seq2seq.cuda()
for param in seq2seq.parameters():
param.data.uniform_(-0.08, 0.08)
# The optimizer and learning-rate scheduler can be customized by constructing the objects explicitly and passing them to the trainer:
# optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
# scheduler = StepLR(optimizer.optimizer, 1)
# optimizer.set_scheduler(scheduler)
# train
t = SupervisedTrainer(loss=loss, batch_size=32, checkpoint_every=50, print_every=10, expt_dir=opt.expt_dir)
seq2seq = t.train(seq2seq, train, num_epochs=6, dev_data=dev, optimizer=optimizer, teacher_forcing_ratio=0.5, resume=opt.resume)
predictor = Predictor(seq2seq, input_vocab, output_vocab)
while True: # after training finishes, keep reading sequences until interrupted with Ctrl-C
seq_str = raw_input("Type in a source sequence:")
seq = seq_str.strip().split() # strip() removes surrounding whitespace, split() tokenizes on whitespace
print(predictor.predict(seq))
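The same call can also be made non-interactively; a sketch matching the README example above (the exact end-of-sequence token string is an assumption):

print(predictor.predict("1 3 5 7 9".split()))  # expected something like ['9', '7', '5', '3', '1', '<eos>']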
③ field.py under the seq2seq/dataset folder
import logging
import torchtext
class SourceField(torchtext.data.Field):
"""Wrapper class of torchtext.data.Field that forces batch_first and include_lengths to be True."""
def __init__(self, **kwargs):
logger = logging.getLogger(__name__) # set up a logger; the name argument is optional
if kwargs.get('batch_first') is False:
logger.warning("Option batch_first has to be set to use pytorch-seq2seq. Changed to True.")
kwargs['batch_first'] = True
if kwargs.get('include_lengths') is False:
logger.warning("Option include_lengths has to be set to use pytorch-seq2seq. Changed to True.")
kwargs['include_lengths'] = True
super(SourceField, self).__init__(**kwargs) # delegate to torchtext.data.Field
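A small sketch of what the wrapper enforces (assuming, as in the library, that the two kwargs assignments run unconditionally): even if the caller asks for batch_first=False, the option is overridden and a warning is logged.

src_field = SourceField(batch_first=False)                # logs the warning above
print(src_field.batch_first, src_field.include_lengths)  # -> True True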
class TargetField(torchtext.data.Field):
""" batch_first = True # prepend <sos> & append <eos> to sequences in preprocessing step. sos_id: index of the start of sentence symbol"""
SYM_SOS = '<sos>'
SYM_EOS = '<eos>'
def __init__(self, **kwargs):
logger = logging.getLogger(__name__)
if kwargs.get('batch_first') == False:
logger.warning("Option batch_first has to be set to use pytorch-seq2seq. Changed to True.")
kwargs['batch_first'] = True
if kwargs.get('preprocessing') is None:
kwargs['preprocessing'] = lambda seq: [self.SYM_SOS] + seq + [self.SYM_EOS] # anonymous function wrapping each sequence with <sos>/<eos>
else:
func = kwargs['preprocessing']
kwargs['preprocessing'] = lambda seq: [self.SYM_SOS] + func(seq) + [self.SYM_EOS]
self.sos_id = None
self.eos_id = None
super(TargetField, self).__init__(**kwargs)
def build_vocab(self, *args, **kwargs):
super(TargetField, self).build_vocab(*args, **kwargs)
self.sos_id = self.vocab.stoi[self.SYM_SOS] # stoi: the vocabulary's string-to-index mapping
self.eos_id = self.vocab.stoi[self.SYM_EOS]
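A small usage sketch (hypothetical input) of the preprocessing installed above: Field.preprocess tokenizes the raw string and then applies the lambda, so every target sequence is wrapped in <sos>/<eos>.

tgt_field = TargetField()
print(tgt_field.preprocess("3 2 1"))  # -> ['<sos>', '3', '2', '1', '<eos>']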
⑤ seq2seq.loss // Perplexity, used above as loss = Perplexity(weight, pad)
⑥ seq2seq.models // EncoderRNN, DecoderRNN, Seq2seq (three separate .py files)
class EncoderRNN(BaseRNN):
r"""
Applies a multi-layer RNN to an input sequence.
Args:
vocab_size (int): size of the vocabulary
max_len (int): a maximum allowed length for the sequence to be processed # 50 in this example
hidden_size (int): the number of features in the hidden state `h` # 128 in this example
input_dropout_p (float, optional): dropout probability for the input sequence (default: 0) # dropout guards against overfitting
dropout_p (float, optional): dropout probability for the output sequence (default: 0)
n_layers (int, optional): number of recurrent layers (default: 1) # number of stacked layers
bidirectional (bool, optional): if True, becomes a bidirectional encoder (default: False) # whether the RNN is bidirectional
rnn_cell (str, optional): type of RNN cell (default: gru) # RNN cell type, GRU by default
variable_lengths (bool, optional): if use variable length RNN (default: False) # whether to use a variable-length (packed) RNN
Inputs: inputs, input_lengths
- **inputs**: a list of sequences whose length is the batch size; each sequence in the list is a list of token IDs.
- **input_lengths** (list of int, optional): list that contains the lengths of sequences in the mini-batch, it must be provided when using variable length RNN (default: `None`)
Outputs: output, hidden
- **output** (batch, seq_len, hidden_size): tensor containing the encoded features of the input sequence
- **hidden** (num_layers * num_directions, batch, hidden_size): tensor containing the features in the hidden state `h`
Examples::
>>> encoder = EncoderRNN(input_vocab, max_seq_length, hidden_size)
>>> output, hidden = encoder(input)
"""
def __init__(self, vocab_size, max_len, hidden_size, input_dropout_p=0, dropout_p=0,n_layers=1, bidirectional=False, rnn_cell='gru', variable_lengths=False):
super(EncoderRNN, self).__init__(vocab_size, max_len, hidden_size, input_dropout_p, dropout_p, n_layers, rnn_cell)
self.variable_lengths = variable_lengths
self.embedding = nn.Embedding(vocab_size, hidden_size) # here: (len(src.vocab), hidden_size=128)
self.rnn = self.rnn_cell(hidden_size, hidden_size, n_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout_p)
def forward(self, input_var, input_lengths=None):
"""
Applies a multi-layer RNN to an input sequence.
Args:
input_var (batch, seq_len): tensor containing the features of the input sequence.
input_lengths (list of int, optional): A list that contains the lengths of sequences in the mini-batch
Returns: output, hidden
- **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence
- **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h
"""
embedded = self.embedding(input_var) # the input is a tensor of token indices; the output is their embeddings
embedded = self.input_dropout(embedded) # randomly zero some elements to reduce overfitting
if self.variable_lengths: # if using a variable-length RNN, pack the padded batch
embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first=True)
output, hidden = self.rnn(embedded)
if self.variable_lengths:
output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
return output, hidden
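A sketch of calling the encoder on a padded mini-batch (hypothetical indices and shapes; with variable_lengths=True the sequences must be sorted by length, longest first, and on the old PyTorch used here the tensors would additionally be wrapped in Variable):

batch = torch.LongTensor([[4, 2, 7, 3, 5],
                          [6, 8, 1, 1, 1]])  # (batch=2, seq_len=5); 1 is assumed to be the <pad> index
lengths = [5, 2]                             # true lengths, sorted longest first
output, hidden = encoder(batch, lengths)
# output: (2, 5, hidden_size * 2) for the bidirectional GRU; hidden: (n_layers * num_directions, 2, hidden_size)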
A walkthrough of a similar seq2seq model (TensorFlow rather than PyTorch, but the principle is the same), very clearly explained: http://www.sohu.com/a/155746637_505915
Hyperparameter settings used there:
Number of Epochs: epochs = 60 # how many passes over the training data
Batch Size: batch_size = 128 # how many samples are processed at a time (padded so every sample in a batch has the same sequence length)
RNN Size: rnn_size = 50 # number of hidden units in each RNN cell
Number of Layers: num_layers = 2 # number of stacked RNN layers
Embedding Size: encoding_embedding_size = decoding_embedding_size = 15 # size of the embeddings
Learning Rate: learning_rate = 0.001
Analysis of what rising or falling train loss and test loss indicate: http://blog.csdn.net/smf0504/article/details/71698354
train loss: the error between the model's outputs and the targets on the training set