使用 cross-entropy 训练时,模型会朝着正负标签 logit 差值无限增大的方向学习;但过大的 logit 差值会使模型缺乏适应性。实际上有些标注数据不一定准确,在训练数据不足的情况下,这种影响更大。可以用 label smoothing 做缓解处理。
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
构造示例数据
# Example data: random logits plus one integer class label per sample.
logits = torch.randn(3, 5, requires_grad=True) # 3 samples, 5 classes
labels = torch.randint(5, (3,), dtype=torch.int64)
print(logits)
print(labels)
# Baseline: plain cross-entropy without label smoothing.
loss = F.cross_entropy(logits, labels)
print(loss)
打印结果:
tensor([[ 1.2215, -0.9342, -0.1349, -0.5035, 0.4492],
[-0.1989, 0.2589, 2.1556, -0.5734, 0.8668],
[ 0.1215, 0.5060, 0.5376, -1.7609, -0.6566]], requires_grad=True)
tensor([2, 3, 2])
tensor(2.1185, grad_fn=<NllLossBackward0>)
验证:CEloss的计算 等价于 NLL_loss(log_softmax(pred), labels)
# Check: cross-entropy equals nll_loss applied to log_softmax(logits).
# `dim` is passed explicitly; leaving it out relies on a deprecated implicit choice.
print(F.nll_loss(F.log_softmax(logits, dim=-1), labels)) # tensor(2.1185, grad_fn=<NllLossBackward0>)
# Same loss with built-in label smoothing; printed so the value also shows outside a notebook.
print(F.cross_entropy(logits, labels, label_smoothing=0.1))
结果:
tensor(2.1038, grad_fn=<AddBackward0>)
阅读 huggingface transformers 中的相关源码:trainer 在计算 loss 时采用 `loss = self.label_smoother(outputs, labels)`,下面具体分析其实现。
# from huggingface transformers/trainer_pt_utils.py
@dataclass
class LabelSmoother:
    """
    Adds label-smoothing on a pre-computed output from a Transformers model.

    Declared as a dataclass so both fields can be set through the constructor,
    e.g. `LabelSmoother(epsilon=0.2)`.

    Args:
        epsilon (`float`, *optional*, defaults to 0.1):
            The label smoothing factor.
        ignore_index (`int`, *optional*, defaults to -100):
            The index in the labels to ignore when computing the loss.
    """

    epsilon: float = 0.1
    ignore_index: int = -100

    def __call__(self, model_output, labels):
        """
        Compute the label-smoothed loss.

        Args:
            model_output: Either a dict holding the logits under the key `"logits"`,
                or an indexable whose first element is the logits tensor.
            labels: Integer class indices; entries equal to `ignore_index` are skipped.

        Returns:
            A scalar tensor: `(1 - epsilon) * nll_loss + epsilon * smoothed_loss`.
            If every label equals `ignore_index`, the divisions below are by zero.
        """
        logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
        # Negated log-softmax: every entry is the negative log-probability of one class.
        log_probs = -nn.functional.log_softmax(logits, dim=-1)
        # Give labels a trailing axis so they can be used as a gather index.
        if labels.dim() == log_probs.dim() - 1:
            labels = labels.unsqueeze(-1)

        padding_mask = labels.eq(self.ignore_index)
        # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
        # will ignore them in any case.
        labels = torch.clamp(labels, min=0)
        # Negative log-probability of the labelled class at every position.
        nll_loss = log_probs.gather(dim=-1, index=labels)
        # works for fp16 input tensor too, by internally upcasting it to fp32
        smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)

        # Zero out the contribution of ignored positions.
        nll_loss.masked_fill_(padding_mask, 0.0)
        smoothed_loss.masked_fill_(padding_mask, 0.0)

        # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
        num_active_elements = padding_mask.numel() - padding_mask.long().sum()
        nll_loss = nll_loss.sum() / num_active_elements
        smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
        return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
# Step-by-step replay of LabelSmoother.__call__ on the example data.
# Negated log-softmax: every entry is the negative log-probability of one class.
log_probs = -F.log_softmax(logits, dim=-1)
# Give labels a trailing axis when they have one dimension fewer than log_probs,
# so they can later be used as a gather index.
if labels.dim() == log_probs.dim() - 1:
    labels = labels.unsqueeze(-1)
print(log_probs)
print(labels)
结果为:
tensor([[0.6999, 2.8556, 2.0563, 2.4249, 1.4721],
[2.8157, 2.3578, 0.4612, 3.1901, 1.7499],
[1.5253, 1.1407, 1.1092, 3.4077, 2.3033]], grad_fn=<NegBackward0>)
tensor([[2],
[3],
[2]])
# Mark positions whose label equals the ignore index (-100).
padding_mask = labels.eq(-100)
# Clamp negative labels to 0 so they are valid indices for the gather that follows.
labels = torch.clamp(labels, min=0)
print(padding_mask)
print(labels)
tensor([[False],
[False],
[False]])
tensor([[2],
[3],
[2]])
# gather picks values by index: https://pytorch.org/docs/stable/generated/torch.gather.html#torch.gather
nll_loss = log_probs.gather(dim=-1, index=labels) # take the entry of log_probs at each sample's label index
print(nll_loss)
tensor([[2.0563],
[3.1901],
[1.1092]], grad_fn=<GatherBackward0>)
$$smoothed\_loss = \cfrac{sum(log\_probs)}{样本数 * 标签数}$$
# Per sample, sum the negative log-probabilities over all classes (accumulated in float32).
smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)
print(smoothed_loss)
tensor([[ 9.5088],
[10.5747],
[ 9.4863]], grad_fn=<SumBackward1>)
# Number of positions that contribute to the loss (total minus the masked ones).
num_active_elements = padding_mask.numel() - padding_mask.long().sum()
print(num_active_elements) # tensor(3)
# Mean NLL over the active positions.
nll_loss = nll_loss.sum() / num_active_elements
print(nll_loss) # tensor(2.1185, grad_fn=<DivBackward0>)
# Mean over active positions and over the class dimension.
smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
print(smoothed_loss) # tensor(1.9713, grad_fn=<DivBackward0>)
# Interpretation: the loss penalises the gap between the predicted and the true distribution,
# plus the gap between the predicted distribution and a uniform prior.
print((1 - 0.1) * nll_loss + 0.1 * smoothed_loss) # tensor(2.1038, grad_fn=<AddBackward0>)
最终结果和直接使用 cross_entropy 中的 label_smoothing 参数一致。
$$y_k^{LS} = y_k(1-\alpha) + \alpha/K$$
其中 K 为标签数,α 即上述实现中的 epsilon。
例如 K=5、α=0.1,样本标签为 2(one-hot 为 [0,0,1,0,0])时,
平滑后的标签为 [0.02, 0.02, 0.92, 0.02, 0.02]。
# Equivalent check: smooth the one-hot targets by hand and pass the resulting
# class probabilities to cross_entropy.
# Flatten first: the walkthrough above may have left `labels` with shape (N, 1).
label_ids = labels.reshape(-1)
print(label_ids)
# Size the one-hot matrix from `logits` instead of hard-coding (3, 5).
labels_onehot = torch.zeros_like(logits).scatter_(1, label_ids.unsqueeze(-1), 1)
print(labels_onehot)
# y_k * (1 - epsilon) + epsilon / K, with epsilon = 0.1 and K classes.
labels_smoothed = labels_onehot * (1 - 0.1) + 0.1 / labels_onehot.shape[-1]
print(labels_smoothed)
print(F.cross_entropy(logits, labels_smoothed))
tensor([2, 3, 2])
tensor([[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 1., 0., 0.]])
tensor([[0.0200, 0.0200, 0.9200, 0.0200, 0.0200],
[0.0200, 0.0200, 0.0200, 0.9200, 0.0200],
[0.0200, 0.0200, 0.9200, 0.0200, 0.0200]])
tensor(2.1038, grad_fn=<DivBackward1>)