本文为参加 Datawhale 2021 年 9 月组队学习(情感分析)时整理的学习笔记。
原学习文档地址:https
"""1.set seed"""
# One fixed seed is shared by every PyTorch random number generator so that
# repeated runs of this script are comparable.
SEED = 1234

# Ask cuDNN to restrict itself to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Seed the default generator, then the current CUDA device, then all CUDA devices.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
"""2.load data"""
# Tokenise the reviews with spaCy, using the small English model.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
LABEL = data.LabelField(dtype=torch.float)

# Load IMDB exactly once. If an earlier run already downloaded the data it is
# reused from the `.data` folder instead of being downloaded again; the data
# can also be fetched by hand (the URL is inside the dataset class) and
# placed in `.data` under the current working directory.
# Author's note: the data was first saved to `.data` next to the notebook;
# running a .py file from another directory only created a new, empty `.data`
# folder without downloading anything, so the previously downloaded data was
# copied into that folder.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Seed Python's RNG and hand its state to split() so the train/validation
# partition is the same on every run. `random.seed()` returns None, so it must
# not be passed as `random_state` itself; the state tuple from
# `random.getstate()` is what the parameter is documented to take.
# IMDB must not be reloaded after this point: doing so would replace the 80%
# training split with the full training set and leak validation examples into
# training and into the vocabulary.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())
# print(f'Number of training examples: {len(train_data)}')  # 20000
# print(f'Number of validation examples: {len(valid_data)}')  # 5000
# print(f'Number of testing examples: {len(test_data)}')  # 25000

MAX_VOCAB_SIZE = 25000
# Build the vocabularies from the training split only, so that the validation
# and test sets cannot influence the model in any way.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
# build_vocab assigns an integer index to every token.
# The resulting vocab object exposes stoi (string to int) and itos (int to string).
# print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")  # 25002: 25000 words plus the unknown and padding tokens
# print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")  # 2
"""4.iterator"""
# Number of examples (sentences) per batch.
BATCH_SIZE = 64

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator groups examples of similar length into the same batch, which
# keeps the amount of padding per batch small. One iterator is created per
# split, in the same order as the datasets are given.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)
class RNN(nn.Module):
    """Sequence classifier: embedding layer -> single RNN layer -> linear layer.

    Args:
        input_dim: Number of entries in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map a batch of token indices to one output vector per sequence.

        Args:
            text: Integer tensor of token indices, [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output dim].
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # all_steps:   [sent len, batch size, hid dim], hidden state at every time step
        # final_state: [1, batch size, hid dim], hidden state after the last time step
        all_steps, final_state = self.rnn(token_vectors)

        # Drop the leading dimension of size 1: [batch size, hid dim]
        last_hidden = final_state.squeeze(0)

        # Sanity check: the last time step of the full output is the final hidden state.
        assert torch.equal(all_steps[-1], last_hidden)

        # [batch size, hid dim] -> [batch size, output dim]
        return self.fc(last_hidden)
tensor([[ 492, 0, 105, ..., 3896, 892, 11848],
[ 219, 18147, 3, ..., 1765, 2, 11801],
[ 3, 0, 152, ..., 20, 62, 435],
...,
[ 1, 1, 1, ..., 1, 1, 1],
[ 1, 1, 1, ..., 1, 1, 1],
[ 1, 1, 1, ..., 1, 1, 1]])
后面的1就都是padding,数量是由788对64的余数决定的:788÷64=12余20,所以padding数量应该是20个。
(更正)上面那样理解不对:查看text[767]后发现,这一行基本上还是全1。
所以这里的batch_size=64指的是一批有64个句子,而不是一个句子的64个单词;text的形状是[788, 64],即[句子长度, 批大小],那么text[0]
就表示这64个句子各自的第一个单词。不要被上面tensor里末尾几行的全1迷惑了,那些省略号里还有不是1的值。
接下来text进embedding:text[788, 64] -> embedded[788, 64, 100]。按道理输入应该是one-hot向量,维数是字典中词的个数25002;实际上nn.Embedding直接接收词的整数索引,按索引查表取出对应的向量,效果等价于用one-hot向量乘以嵌入矩阵,所以不需要显式转换成one-hot。经过embedding后每个词变为100维,embedded[0]就是所有句子的第一个单词的词嵌入。
再往后embedded进rnn输出output和hidden,hidden再进全连接层输出。
"""5.define the model"""
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)  # 25002
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # one logit per review

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
# Print the model structure:
# print(model)
# Print the number of trainable parameters
# (count_parameters is not defined in the visible part of this file):
# print(f'The model has {count_parameters(model):,} trainable parameters')
"""6.define the func"""
# Move the model to the target device BEFORE building the optimizer, as the
# PyTorch optimizer documentation recommends, so the optimizer is guaranteed
# to hold the parameters that are actually trained.
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# BCEWithLogitsLoss takes raw logits: it combines the sigmoid and the binary
# cross-entropy in a single step.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
def binary_accuracy(preds, y):
    """Return the fraction of correct binary predictions in a batch.

    The result is a ratio, not a count: 8 correct out of 10 gives 0.8, not 8.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of target labels (0 or 1), comparable element-wise with `preds`.

    Returns:
        Scalar tensor: number of matches divided by the length of the first
        dimension.
    """
    # Squash the logits into (0, 1), then round to the nearest class label.
    probabilities = torch.sigmoid(preds)
    hits = torch.round(probabilities).eq(y).float()
    return hits.sum() / len(hits)
# Measure how long one epoch took.
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: Timestamp in seconds taken when the epoch started.
        end_time: Timestamp in seconds taken when the epoch finished.

    Returns:
        Tuple `(minutes, seconds)` of ints, both truncated towards zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds
"""Train func"""
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean batch loss and accuracy.

    Args:
        model: Module applied to `batch.text`; its output must have a second
            dimension that `squeeze(1)` can drop.
        iterator: Sized iterable of batches exposing `.text` and `.label`.
        optimizer: Optimizer that updates the parameters of `model`.
        criterion: Loss function called as `criterion(predictions, labels)`.

    Returns:
        Tuple `(mean_loss, mean_accuracy)`, averaged over the batches
        (not over the individual examples).
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # Gradients accumulate across backward() calls, so reset them per batch.
        optimizer.zero_grad()

        # With a single output unit the model returns [batch size, 1];
        # squeeze(1) turns that into [batch size] to line up with batch.label.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches
"""7.Train"""
# evaluate() is defined ahead of the epoch loop because the loop calls it.
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss and accuracy of `model` over `iterator`.

    Mirrors `train` but never updates parameters: the model is put in
    evaluation mode and gradient tracking is switched off.

    Args:
        model: Module applied to `batch.text`; its output must have a second
            dimension that `squeeze(1)` can drop.
        iterator: Sized iterable of batches exposing `.text` and `.label`.
        criterion: Loss function called as `criterion(predictions, labels)`.

    Returns:
        Tuple `(mean_loss, mean_accuracy)`, averaged over the batches
        (not over the individual examples).
    """
    epoch_loss = 0
    epoch_acc = 0
    # Evaluation mode: layers such as dropout and batch normalisation switch
    # to their inference behaviour.
    model.eval()
    # No parameter update happens here, so no gradients are needed.
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# Train for several epochs; each epoch is one full pass over the training set
# followed by one full pass over the validation set.
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep a checkpoint of the parameters whenever the validation loss is the
    # best seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc:{valid_acc*100:.2f}%')
"""8.Evaluate"""
# Restore the checkpoint written to 'tut1-model.pt' and score it on the test set.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'\t Test Loss: {test_loss:.3f} | Test Acc:{test_acc*100:.2f}%')