import collections
import os
import shutil
import tqdm

import numpy as np
import PIL.Image
import torch
import torchvision
Basic configuration
Check the PyTorch version
torch.__version__               # PyTorch version
torch.version.cuda              # Corresponding CUDA version
torch.backends.cudnn.version()  # Corresponding cuDNN version
torch.cuda.get_device_name(0)   # GPU type
torch.nonzero(tensor)               # Indices of non-zero elements
torch.nonzero(tensor == 0)          # Indices of zero elements
torch.nonzero(tensor).size(0)       # Number of non-zero elements
torch.nonzero(tensor == 0).size(0)  # Number of zero elements
Tensor expansion
# Expand a tensor of shape 64*512 to shape 64*512*7*7.
torch.reshape(tensor, (64, 512, 1, 1)).expand(64, 512, 7, 7)
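A related note, not from the original snippet: expand() returns a view without allocating new memory (the expanded dimensions get stride 0), whereas repeat() copies the data. A minimal sketch, assuming a 64*512 tensor named tensor:

tensor = torch.rand(64, 512)
expanded = torch.reshape(tensor, (64, 512, 1, 1)).expand(64, 512, 7, 7)
print(expanded.stride())  # (512, 1, 0, 0) -- no extra memory allocated
repeated = torch.reshape(tensor, (64, 512, 1, 1)).repeat(1, 1, 7, 7)
print(repeated.stride())  # (25088, 49, 7, 1) -- a full contiguous copy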
# Common practice for initialization.
for layer in model.modules():
    if isinstance(layer, torch.nn.Conv2d):
        torch.nn.init.kaiming_normal_(layer.weight, mode='fan_out',
                                      nonlinearity='relu')
        if layer.bias is not None:
            torch.nn.init.constant_(layer.bias, val=0.0)
    elif isinstance(layer, torch.nn.BatchNorm2d):
        torch.nn.init.constant_(layer.weight, val=1.0)
        torch.nn.init.constant_(layer.bias, val=0.0)
    elif isinstance(layer, torch.nn.Linear):
        torch.nn.init.xavier_normal_(layer.weight)
        if layer.bias is not None:
            torch.nn.init.constant_(layer.bias, val=0.0)
# Initialization with a given tensor.
layer.weight = torch.nn.Parameter(tensor)
def forward(self, x):
    with torch.no_grad():
        conv_representation = []
        for name, layer in self._model.named_children():
            x = layer(x)
            if name in self._layers_to_extract:
                conv_representation.append(x)
        return conv_representation
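For context, the forward() above assumes a wrapper module that holds the backbone and the names of the layers to tap. A minimal sketch of such a wrapper; the class name FeatureExtractor and its constructor are assumptions, only _model and _layers_to_extract come from the snippet:

class FeatureExtractor(torch.nn.Module):
    """Collect the outputs of selected top-level children of a backbone."""
    def __init__(self, model, layers_to_extract):
        super().__init__()
        self._model = model
        self._layers_to_extract = set(layers_to_extract)

    def forward(self, x):
        with torch.no_grad():
            conv_representation = []
            for name, layer in self._model.named_children():
                x = layer(x)
                if name in self._layers_to_extract:
                    conv_representation.append(x)
            return conv_representation

This only works when the backbone's top-level children can be chained in order; models such as torchvision ResNets need the flatten between avgpool and fc handled separately.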
model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.fc = torch.nn.Linear(512, 100)  # Replace the last fc layer.
optimizer = torch.optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9,
                            weight_decay=1e-4)
Fine-tune the fully connected layer with a larger learning rate and the convolutional layers with a smaller learning rate
model = torchvision.models.resnet18(pretrained=True)
finetuned_parameters = list(map(id, model.fc.parameters()))
conv_parameters = (p for p in model.parameters() if id(p) not in finetuned_parameters)
parameters = [{'params': conv_parameters, 'lr': 1e-3},
              {'params': model.fc.parameters()}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)
for t in range(80):
    for images, labels in tqdm.tqdm(train_loader, desc='Epoch %3d' % (t + 1)):
        images, labels = images.cuda(), labels.cuda()
        scores = model(images)
        loss = loss_function(scores, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Label smoothing
for images, labels in train_loader:
    images, labels = images.cuda(), labels.cuda()
    N = labels.size(0)
    # C is the number of classes.
    smoothed_labels = torch.full(size=(N, C), fill_value=0.1 / (C - 1)).cuda()
    smoothed_labels.scatter_(dim=1, index=torch.unsqueeze(labels, dim=1), value=0.9)

    score = model(images)
    log_prob = torch.nn.functional.log_softmax(score, dim=1)
    loss = -torch.sum(log_prob * smoothed_labels) / N
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
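On recent PyTorch releases (1.10 and later), cross-entropy supports label smoothing directly, which can replace the manual construction above; a minimal sketch (note the built-in smoothing formula differs slightly from the 0.9 / 0.1-per-wrong-class scheme used above):

criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
for images, labels in train_loader:
    images, labels = images.cuda(), labels.cuda()
    loss = criterion(model(images), labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()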
Mixup
beta_distribution = torch.distributions.beta.Beta(alpha, alpha)
for images, labels in train_loader:
    images, labels = images.cuda(), labels.cuda()
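    # The source snippet is truncated here; the rest of the loop below is an
    # assumed completion following the standard mixup recipe: mix pairs of
    # examples within the batch and interpolate the two losses.
    lambda_ = beta_distribution.sample().item()
    index = torch.randperm(images.size(0)).cuda()
    mixed_images = lambda_ * images + (1 - lambda_) * images[index, :]
    label_a, label_b = labels, labels[index]

    # Mixup loss.
    scores = model(mixed_images)
    loss = (lambda_ * loss_function(scores, label_a)
            + (1 - lambda_) * loss_function(scores, label_b))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()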
# L1 regularization: penalize the sum of absolute parameter values.
l1_regularization = torch.nn.L1Loss(reduction='sum')
loss = ...  # Standard cross-entropy loss
for param in model.parameters():
    loss += l1_regularization(param, torch.zeros_like(param))
loss.backward()
Do not apply L2 regularization / weight decay to bias terms
bias_list = (param for name, param in model.named_parameters() if name[-4:] == 'bias')
others_list = (param for name, param in model.named_parameters() if name[-4:] != 'bias')
parameters = [{'params': bias_list, 'weight_decay': 0},
              {'params': others_list}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)
# If there is one global learning rate (which is the common case).
lr = next(iter(optimizer.param_groups))['lr']

# If there are multiple learning rates for different layers.
all_lr = []
for param_group in optimizer.param_groups:
    all_lr.append(param_group['lr'])
Learning rate decay
# Reduce the learning rate when validation accuracy plateaus.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                       patience=5, verbose=True)
for t in range(0, 80):
    train(...); val(...)
    scheduler.step(val_acc)
# Cosine annealing learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80)
# Reduce the learning rate by a factor of 10 at the given epochs.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 70], gamma=0.1)
for t in range(0, 80):
    train(...); val(...)
    # Since PyTorch 1.1, scheduler.step() should be called after the epoch's
    # optimizer updates.
    scheduler.step()
# Learning rate warmup over the first 10 epochs.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda t: t / 10)
for t in range(0, 10):
    # step() is called at the start of each epoch here, so the multiplier
    # ramps from 0.1 up to 1.0 over the 10 warmup epochs.
    scheduler.step()
    train(...); val(...)
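On newer PyTorch releases, warmup can also be chained with a main schedule via SequentialLR; a sketch assuming a 10-epoch linear warmup followed by cosine annealing (the factors and epoch counts here are illustrative, not from the original):

warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=10)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=70)
scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine],
                                                  milestones=[10])
for t in range(0, 80):
    train(...); val(...)
    scheduler.step()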
# data['label'] and data['prediction'] are the ground-truth label and the
# prediction for each image, respectively.
accuracy = np.mean(data['label'] == data['prediction']) * 100
# Compute precision and recall for each class.
for c in range(num_classes):
    tp = np.dot((data['label'] == c).astype(int),
                (data['prediction'] == c).astype(int))
    tp_fp = np.sum(data['prediction'] == c)
    tp_fn = np.sum(data['label'] == c)
    precision = tp / tp_fp * 100
    recall = tp / tp_fn * 100