# Fully annotated walkthrough of the CIFAR-10 search and retraining code (DARTS).
class Network(nn.Module):
    """DARTS search network for CIFAR-10: a stem followed by a stack of searchable cells."""

    def __init__(self, C, num_classes, layers, criterion, steps=4,
                 multiplier=4, stem_multiplier=3):
        # C: initial channel count (16); layers: number of cells (8).
        super(Network, self).__init__()
        self._C = C
        self._num_classes = num_classes
        self._layers = layers
        self._criterion = criterion
        self._steps = steps  # 4 intermediate nodes per cell whose operations are searched
        self._multiplier = multiplier

        C_curr = stem_multiplier * C  # stem output channels: 48 when C=16
        # Stem: expand the 3 RGB input channels to C_curr with a 3x3 conv + BN.
        self.stem = nn.Sequential(
            nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
            nn.BatchNorm2d(C_curr)
        )

        C_prev_prev, C_prev, C_curr = C_curr, C_curr, C  # 48, 48, 16
        self.cells = nn.ModuleList()
        reduction_prev = False  # whether the previous cell was a reduction cell
        for i in range(layers):
            if i in [layers // 3, 2 * layers // 3]:
                # Reduction cells sit at 1/3 and 2/3 of the depth and double the channels.
                C_curr *= 2
                reduction = True
            else:
                reduction = False  # all other positions are normal cells
            cell = Cell(steps, multiplier, C_prev_prev, C_prev, C_curr,
                        reduction, reduction_prev)
            reduction_prev = reduction
            self.cells += [cell]
            # The cell concatenates `multiplier` node outputs, hence C_prev scales by it.
            C_prev_prev, C_prev = C_prev, multiplier * C_curr

        self.global_pooling = nn.AdaptiveAvgPool2d(1)  # average-pool to 1x1
        self.classifier = nn.Linear(C_prev, num_classes)
        self._initialize_alphas()

    def new(self):
        """Return a fresh Network whose architecture parameters are copied from this one."""
        model_new = Network(self._C, self._num_classes, self._layers, self._criterion).cuda()
        for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
            x.data.copy_(y.data)
        return model_new

    def forward(self, input):
        s0 = s1 = self.stem(input)  # the stem output feeds both cell inputs
        for i, cell in enumerate(self.cells):
            # Softmax over ops turns the alphas into mixing weights, shared per cell type.
            if cell.reduction:
                weights = F.softmax(self.alphas_reduce, dim=-1)
            else:
                weights = F.softmax(self.alphas_normal, dim=-1)
            # Cell k takes the outputs of cells k-1 and k-2 as its two inputs.
            s0, s1 = s1, cell(s0, s1, weights)
        out = self.global_pooling(s1)
        logits = self.classifier(out.view(out.size(0), -1))
        return logits

    def _loss(self, input, target):
        """Cross-entropy loss of a forward pass (self(...) calls forward via __call__)."""
        logits = self(input)
        return self._criterion(logits, target)

    def _initialize_alphas(self):
        """Initialize the architecture parameters (alphas) for both cell types."""
        # 14 edge rows per cell: intermediate node i has 2+i incoming edges.
        k = sum(1 for i in range(self._steps) for n in range(2 + i))
        num_ops = len(PRIMITIVES)  # candidate operations per edge
        self.alphas_normal = Variable(1e-3 * torch.randn(k, num_ops).cuda(), requires_grad=True)
        self.alphas_reduce = Variable(1e-3 * torch.randn(k, num_ops).cuda(), requires_grad=True)
        self._arch_parameters = [
            self.alphas_normal,
            self.alphas_reduce,
        ]

    def arch_parameters(self):
        return self._arch_parameters

    def genotype(self):
        """Discretize the current alphas into a Genotype.

        For each intermediate node, keep the two incoming edges with the largest
        non-'none' weight, and on each kept edge keep its strongest non-'none' op.
        """

        def _parse(weights):  # weights: softmaxed [14 x num_ops] numpy array
            gene = []
            n = 2
            start = 0  # first edge row belonging to the current node
            for i in range(self._steps):  # self._steps == 4
                end = start + n  # node i's edge rows: [0,2), [2,5), [5,9), [9,14)
                W = weights[start:end].copy()  # rows for all of node i's incoming edges
                # Pick the two predecessors with the largest non-'none' weight
                # (sorted() is ascending, hence the negated key).
                edges = sorted(
                    range(i + 2),
                    key=lambda x: -max(W[x][k] for k in range(len(W[x]))
                                       if k != PRIMITIVES.index('none'))
                )[:2]
                for j in edges:
                    # For each kept edge, find the strongest op, skipping 'none'.
                    k_best = None
                    for k in range(len(W[j])):
                        if k != PRIMITIVES.index('none'):
                            if k_best is None or W[j][k] > W[j][k_best]:
                                k_best = k
                    gene.append((PRIMITIVES[k_best], j))  # (op name, predecessor index)
                start = end
                n += 1  # each subsequent node has one more incoming edge
            return gene

        gene_normal = _parse(F.softmax(self.alphas_normal, dim=-1).data.cpu().numpy())
        gene_reduce = _parse(F.softmax(self.alphas_reduce, dim=-1).data.cpu().numpy())
        concat = range(2 + self._steps - self._multiplier, self._steps + 2)  # nodes [2, 6)
        genotype = Genotype(
            normal=gene_normal, normal_concat=concat,
            reduce=gene_reduce, reduce_concat=concat
        )
        return genotype
# model_search.py -> class Cell
class Cell(nn.Module):
    """One searchable DARTS cell: 2 fixed input nodes plus `steps` intermediate nodes."""

    def __init__(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev):
        super(Cell, self).__init__()
        self.reduction = reduction
        # Input-node preprocessing is fixed and not searched. If the previous
        # cell was a reduction cell, s0 has twice the spatial resolution of s1
        # and must be downsampled to match.
        if reduction_prev:
            self.preprocess0 = FactorizedReduce(C_prev_prev, C, affine=False)
        else:
            # s0 is the output of cell k-2, with C_prev_prev channels.
            self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0, affine=False)
        # s1 is the output of cell k-1, with C_prev channels.
        self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0, affine=False)
        self._steps = steps  # 4 intermediate nodes with searchable connections
        self._multiplier = multiplier

        self._ops = nn.ModuleList()
        self._bns = nn.ModuleList()
        for i in range(self._steps):
            for j in range(2 + i):  # node i has 2+i predecessor nodes
                # In a reduction cell, only edges from the two input nodes use stride 2.
                stride = 2 if reduction and j < 2 else 1
                op = MixedOp(C, stride)  # mixed op on the edge j -> node i
                self._ops.append(op)  # 2+3+4+5 = 14 mixed ops in total

    def forward(self, s0, s1, weights):
        s0 = self.preprocess0(s0)
        s1 = self.preprocess1(s1)
        states = [s0, s1]  # outputs of all predecessor nodes computed so far
        offset = 0
        for i in range(self._steps):
            # Node i's output: apply each incoming edge's MixedOp (its __call__
            # invokes MixedOp.forward) to the corresponding predecessor, then sum.
            s = sum(self._ops[offset + j](h, weights[offset + j])
                    for j, h in enumerate(states))
            offset += len(states)  # first edge row of the next node
            states.append(s)  # node i's output becomes a predecessor for later nodes
        # Concatenate the last `multiplier` node outputs along the channel dim.
        return torch.cat(states[-self._multiplier:], dim=1)
# model_search.py -> class MixedOp
class MixedOp(nn.Module):
    """Weighted mixture of every candidate operation on a single edge."""

    def __init__(self, C, stride):
        super(MixedOp, self).__init__()
        self._ops = nn.ModuleList()
        for primitive in PRIMITIVES:  # candidate op names (defined in genotypes.py)
            op = OPS[primitive](C, stride, False)  # OPS maps name -> op constructor
            if 'pool' in primitive:
                # Pooling ops get a BatchNorm appended after them.
                op = nn.Sequential(op, nn.BatchNorm2d(C, affine=False))
            self._ops.append(op)

    def forward(self, x, weights):
        # Output is the alpha-weighted sum of every candidate op applied to x.
        return sum(w * op(x) for w, op in zip(weights, self._ops))
# train_search.py -> def main
def main():
    """Run the DARTS architecture search on CIFAR-10."""
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    # An 8-layer search network: init_channels=16, CIFAR_CLASSES=10, layers=8.
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    # SGD optimizes the network weights w (the alphas are handled by Architect).
    optimizer = torch.optim.SGD(
        model.parameters(),
        args.learning_rate,
        momentum=args.momentum,          # momentum factor (0.9)
        weight_decay=args.weight_decay)  # L2 regularization

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)

    # Split the 50k CIFAR-10 training images: the first `train_portion` trains
    # the weights w, the remainder trains the architecture parameters alpha.
    num_train = len(train_data)  # 50000 for CIFAR-10
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    train_queue = torch.utils.data.DataLoader(  # weight-training split
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=2)

    valid_queue = torch.utils.data.DataLoader(  # alpha-training ("validation") split
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=2)

    # Cosine-annealed learning rate, from "SGDR: Stochastic Gradient Descent
    # with Warm Restarts" (used here without restarts).
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    # Architect performs the bilevel update of the architecture parameters
    # (the core contribution of the DARTS paper).
    architect = Architect(model, args)

    for epoch in range(args.epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]  # learning rate used this epoch
        logging.info('epoch %d lr %e', epoch, lr)

        # Log the current discretized architecture (2 input edges per node)
        # and the softmaxed alpha matrices of both cell types.
        genotype = model.genotype()
        logging.info('genotype = %s', genotype)
        print(F.softmax(model.alphas_normal, dim=-1))
        print(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

        utils.save(model, os.path.join(args.save, 'weights.pt'))
# utils.py -> class Cutout
class Cutout(object):
    """Cutout data augmentation: zero out one random square patch of the image."""

    def __init__(self, length):
        # length: side length of the (unclipped) square patch.
        self.length = length

    def __call__(self, img):
        # assumes img is a CHW tensor — TODO confirm against the transform pipeline
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)  # start with an all-ones mask
        y = np.random.randint(h)  # patch center, sampled uniformly
        x = np.random.randint(w)
        # Clip the patch bounds to the image, so border patches shrink.
        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)
        mask[y1: y2, x1: x2] = 0.  # zero inside the patch
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)  # broadcast the HxW mask to all channels
        img *= mask  # NOTE: modifies the input tensor in place
        return img
def _data_transforms_cifar10(args):
    """Build the CIFAR-10 train/valid torchvision transform pipelines.

    args must provide `cutout` (bool) and, when it is true, `cutout_length`.
    Returns (train_transform, valid_transform).
    """
    # Per-channel mean/std of the CIFAR-10 training set.
    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),   # random 32x32 crop after 4px padding
        transforms.RandomHorizontalFlip(),      # random left-right flip
        transforms.ToTensor(),                  # PIL Image / ndarray -> tensor
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    if args.cutout:
        # Optional Cutout augmentation, appended after normalization.
        train_transform.transforms.append(Cutout(args.cutout_length))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    return train_transform, valid_transform
# utils.py -> def create_exp_dir
def create_exp_dir(path, scripts_to_save=None):
    """Create an experiment directory and optionally snapshot source files into it.

    path: directory to create (only if it does not already exist; the parent
        directory must exist, since os.mkdir is non-recursive).
    scripts_to_save: optional list of file paths copied into path/scripts,
        keeping only each file's basename.
    """
    if not os.path.exists(path):
        os.mkdir(path)
    print('Experiment dir : {}'.format(path))

    if scripts_to_save is not None:
        os.mkdir(os.path.join(path, 'scripts'))
        for script in scripts_to_save:
            dst_file = os.path.join(path, 'scripts', os.path.basename(script))
            shutil.copyfile(script, dst_file)
# architect.py -> class Architect
class Architect(object):
    """Bilevel optimizer for the architecture parameters alpha (DARTS, Sec. 2.3).

    NOTE(review): this file uses the old positional-alpha overloads
    `tensor.add_(scalar, tensor)` / `tensor.sub_(scalar, tensor)` throughout,
    which are deprecated in modern PyTorch; they are kept here to preserve
    behavior on the PyTorch version the original code targets.
    """

    def __init__(self, model, args):
        self.network_momentum = args.momentum
        self.network_weight_decay = args.weight_decay
        self.model = model
        # Adam optimizes only the architecture parameters, never the weights.
        self.optimizer = torch.optim.Adam(
            self.model.arch_parameters(),
            lr=args.arch_learning_rate, betas=(0.5, 0.999),
            weight_decay=args.arch_weight_decay)

    def _compute_unrolled_model(self, input, target, eta, network_optimizer):
        """Build a model copy with one virtual SGD step applied:
        w' = w - eta * (momentum-term + dw Ltrain(w, alpha) + weight_decay * w).
        """
        loss = self.model._loss(input, target)  # Ltrain(w, alpha)
        theta = _concat(self.model.parameters()).data  # all weights flattened into one vector
        try:
            # Reuse the network optimizer's momentum buffers so the virtual
            # step matches what SGD-with-momentum would actually do.
            moment = _concat(network_optimizer.state[v]['momentum_buffer']
                             for v in self.model.parameters()).mul_(self.network_momentum)
        except Exception:
            moment = torch.zeros_like(theta)  # no momentum buffers yet (first step)
        # dw Ltrain(w, alpha) plus the weight-decay term.
        dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data \
            + self.network_weight_decay * theta
        # w' = w - eta * (moment + dtheta)
        unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment + dtheta))
        return unrolled_model

    def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled):
        """Perform one alpha update. eta is the current weight learning rate."""
        self.optimizer.zero_grad()  # clear gradients left over from the last step
        if unrolled:
            # Second-order approximation from the paper.
            self._backward_step_unrolled(input_train, target_train,
                                         input_valid, target_valid,
                                         eta, network_optimizer)
        else:
            # First-order approximation: just dalpha Lval(w, alpha).
            self._backward_step(input_valid, target_valid)
        self.optimizer.step()

    def _backward_step(self, input_valid, target_valid):
        loss = self.model._loss(input_valid, target_valid)
        loss.backward()

    def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer):
        # w' = w - eta * dw Ltrain(w, alpha)
        unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer)
        # Lval(w', alpha)
        unrolled_loss = unrolled_model._loss(input_valid, target_valid)
        unrolled_loss.backward()
        # dalpha Lval(w', alpha)
        dalpha = [v.grad for v in unrolled_model.arch_parameters()]
        # dw' Lval(w', alpha)
        vector = [v.grad.data for v in unrolled_model.parameters()]
        # Finite-difference Hessian-vector product:
        # (dalpha Ltrain(w+, alpha) - dalpha Ltrain(w-, alpha)) / (2 * epsilon)
        implicit_grads = self._hessian_vector_product(vector, input_train, target_train)
        # Combine: dalpha Lval(w', alpha) - eta * implicit_grads (Eq. 6 minus Eq. 8).
        for g, ig in zip(dalpha, implicit_grads):
            g.data.sub_(eta, ig.data)
        # Write the combined gradient back onto the real model's alphas.
        for v, g in zip(self.model.arch_parameters(), dalpha):
            if v.grad is None:
                v.grad = Variable(g.data)
            else:
                v.grad.data.copy_(g.data)

    def _construct_model_from_theta(self, theta):
        """Clone the model and load the flattened weight vector theta into it."""
        model_new = self.model.new()  # shares this model's architecture parameters
        model_dict = self.model.state_dict()  # full state of the module
        params, offset = {}, 0
        for k, v in self.model.named_parameters():
            v_length = np.prod(v.size())  # number of elements in this parameter
            params[k] = theta[offset: offset + v_length].view(v.size())
            offset += v_length
        assert offset == len(theta)
        model_dict.update(params)
        model_new.load_state_dict(model_dict)
        return model_new.cuda()

    def _hessian_vector_product(self, vector, input, target, r=1e-2):
        """Finite-difference approximation of the Hessian-vector product.

        vector = dw' Lval(w', alpha). Returns
        (dalpha Ltrain(w+, alpha) - dalpha Ltrain(w-, alpha)) / (2 * epsilon)
        where w± = w ± epsilon * vector and epsilon = r / ||vector||.
        The model's weights are restored to w before returning.
        """
        R = r / _concat(vector).norm()  # epsilon
        # w+ = w + epsilon * vector
        for p, v in zip(self.model.parameters(), vector):
            p.data.add_(R, v)
        loss = self.model._loss(input, target)
        grads_p = torch.autograd.grad(loss, self.model.arch_parameters())
        # w- = w+ - 2 * epsilon * vector = w - epsilon * vector
        for p, v in zip(self.model.parameters(), vector):
            p.data.sub_(2 * R, v)
        loss = self.model._loss(input, target)
        grads_n = torch.autograd.grad(loss, self.model.arch_parameters())
        # Restore the original weights: w = w- + epsilon * vector
        for p, v in zip(self.model.parameters(), vector):
            p.data.add_(R, v)
        return [(x - y).div_(2 * R) for x, y in zip(grads_p, grads_n)]