PyTorch provides three functions related to saving and loading models (a minimal usage sketch follows the list):
1. torch.save: serializes an object and saves it to disk, using Python's pickle utility for the serialization. It can be used to save models, tensors, and dictionaries of all kinds of objects.
2. torch.load: uses pickle's unpickling facilities to deserialize a pickled object file into memory, i.e. it loads whatever torch.save stored.
3. torch.nn.Module.load_state_dict: loads a model's parameter dictionary from a deserialized state_dict. Note that this means its argument must be a state_dict, i.e. the object returned by torch.load.
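As a minimal sketch of how the three functions relate (model stands for any existing nn.Module instance, and model.pth is just a placeholder file name):

import torch

# torch.save: pickle an object (here, the model's parameter dictionary) to disk
torch.save(model.state_dict(), "model.pth")

# torch.load: unpickle the file back into an ordinary dictionary of tensors
state_dict = torch.load("model.pth")

# torch.nn.Module.load_state_dict: copy those tensors into the model's parameters
model.load_state_dict(state_dict)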
Model definition:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define model
class TheModelClass(nn.Module):
    def __init__(self):
        super(TheModelClass, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize model
model = TheModelClass()

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])
output:
Model's state_dict:
conv1.weight torch.Size([6, 3, 5, 5])
conv1.bias torch.Size([6])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
fc1.weight torch.Size([120, 400])
fc1.bias torch.Size([120])
fc2.weight torch.Size([84, 120])
fc2.bias torch.Size([84])
fc3.weight torch.Size([10, 84])
fc3.bias torch.Size([10])
Optimizer's state_dict:
state {}
param_groups [{'lr': 0.001, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [...]}]
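For inference, the usual workflow is to save only the model's state_dict shown above and load it back into a freshly constructed model. A minimal sketch, using the placeholder file name model_weights.pth:

# save only the learnable parameters (the recommended approach for inference)
torch.save(model.state_dict(), "model_weights.pth")

# later: rebuild the model and copy the saved parameters into it
model = TheModelClass()
model.load_state_dict(torch.load("model_weights.pth"))
model.eval()  # put dropout/batch-norm layers into evaluation mode before inference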
Example: resuming training
The functions for saving and loading a checkpoint are as follows:
import os

def save_checkpoint_state(dir, epoch, model, optimizer):
    # save the model, optimizer state and current epoch as one checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    if not os.path.isdir(dir):
        os.mkdir(dir)
    torch.save(checkpoint, os.path.join(dir, 'checkpoint-epoch%d.tar' % (epoch)))

def get_checkpoint_state(dir, ckp_name, device, model, optimizer):
    # restore the training state from the last checkpoint
    print("Resume from checkpoint...")
    checkpoint = torch.load(os.path.join(dir, ckp_name), map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    epoch = checkpoint['epoch']
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    print('successfully recovered from the last state')
    return model, epoch, optimizer
If an lr_scheduler is used, its state_dict also has to be added to the checkpoint.
Putting it to use:
# imports omitted

# function to save a checkpoint
def save_checkpoint_state(epoch, model, optimizer, scheduler, running_loss):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict()
    }
    torch.save(checkpoint, "checkpoint-epoch%d-loss%d.tar" % (epoch, running_loss))

# function to load a checkpoint
def load_checkpoint_state(path, device, model, optimizer, scheduler):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    epoch = checkpoint["epoch"]
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    return model, epoch, optimizer, scheduler
# whether to resume training (set to True when resuming)
resume = False  # True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train():
    trans = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomResizedCrop(512),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(90),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # get training dataset
    leafDiseaseCLS = CustomDataSet(images_path, is_to_ls, trans)
    data_loader = DataLoader(leafDiseaseCLS,
                             batch_size=16,
                             num_workers=0,
                             shuffle=True,
                             pin_memory=False)
    # get model
    model = EfficientNet.from_pretrained("efficientnet-b3")
    # extract the input size of the fully connected layer
    fc_features = model._fc.in_features
    # modify the number of classes
    model._fc = nn.Linear(fc_features, 5)
    model.to(device)
    # optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=0.001,
                          momentum=0.9,
                          weight_decay=5e-4)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[6, 10], gamma=1/3.)
    # loss
    # loss_func = nn.CrossEntropyLoss()
    loss_func = FocalCosineLoss()
    start_epoch = -1
    if resume:
        model, start_epoch, optimizer, scheduler = load_checkpoint_state("../path/to/checkpoint.tar",
                                                                         device,
                                                                         model,
                                                                         optimizer,
                                                                         scheduler)
    model.train()
    epochs = 3
    for epoch in range(start_epoch + 1, epochs):
        running_loss = 0.0
        print("Epoch {}/{}".format(epoch, epochs))
        for step, train_data in tqdm(enumerate(data_loader)):
            x_train, y_train = train_data
            x_train = x_train.to(device)
            y_train = y_train.to(device)
            # forward
            prediction = model(x_train)
            optimizer.zero_grad()
            loss = loss_func(prediction, y_train)
            running_loss += loss.item()
            # backward
            loss.backward()
            optimizer.step()
        # step the scheduler once per epoch (the milestones are measured in epochs)
        scheduler.step()
        # saving model: raw weights plus a full checkpoint every epoch
        torch.save(model.state_dict(), str(int(running_loss)) + "_" + str(epoch) + ".pth")
        save_checkpoint_state(epoch, model, optimizer, scheduler, running_loss)
        print("Loss:{}".format(running_loss))

if __name__ == "__main__":
    train()
Loading part of a pretrained model
Most of the time we need to adapt the model to our own task, so it is hard to guarantee that it is exactly the same as the published one; yet the pretrained parameters really do help training accuracy. To get the best of both, we load only the part of the pretrained model that matches ours.
pretrained_dict = torch.load("model_data/yolo_weights.pth", map_location=device)
model_dict = model.state_dict()
# drop the keys in pretrained_dict that do not exist in model_dict
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# optionally also require matching shapes:
# pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
# update the current model_dict
model_dict.update(pretrained_dict)
# load the state_dict we actually need
model.load_state_dict(model_dict)
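A shorter alternative worth knowing (a sketch, not part of the original snippet) is to pass strict=False, which makes load_state_dict ignore missing and unexpected keys and report them instead of raising an error; note that keys present in both the checkpoint and the model must still have matching shapes:

# load whatever overlaps by name; missing/unexpected keys are reported, not fatal
incompatible = model.load_state_dict(
    torch.load("model_data/yolo_weights.pth", map_location=device),
    strict=False)
print(incompatible.missing_keys)      # parameters the checkpoint did not provide
print(incompatible.unexpected_keys)   # checkpoint entries the model has no use for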
Saving/loading models across devices (CPU and GPU)
Model saved on a GPU, loaded on a CPU
- Save:
torch.save(model.state_dict(), PATH)
- Load:
device = torch.device('cpu')
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH, map_location=device))
Model saved on a GPU, loaded on a GPU
- Save:
torch.save(model.state_dict(), PATH)
- Load:
device = torch.device("cuda")
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH))
model.to(device)
# Make sure to call input = input.to(device) on any input tensors that you feed to the model
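For completeness, a sketch of the remaining common case, a model saved on a CPU and loaded onto a GPU (the string "cuda:0" assumes a single-GPU machine):

device = torch.device("cuda")
model = TheModelClass(*args, **kwargs)
# map the CPU-saved tensors onto the GPU while loading
model.load_state_dict(torch.load(PATH, map_location="cuda:0"))
# the model itself must be moved to the GPU as well
model.to(device)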
Key point: restoring the epoch
When saving, the epoch needs to be saved as well.
Code: save the model every N epochs:
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 30, 40, 50], gamma=0.1)
start_epoch = 9
# print(lr_schedule)

if RESUME:
    path_checkpoint = "./model_parameter/test/ckpt_best_50.pth"  # checkpoint path
    checkpoint = torch.load(path_checkpoint)  # load the checkpoint
    model.load_state_dict(checkpoint['net'])  # restore the model's learnable parameters
    optimizer.load_state_dict(checkpoint['optimizer'])  # restore the optimizer state
    start_epoch = checkpoint['epoch']  # set the starting epoch
    lr_schedule.load_state_dict(checkpoint['lr_schedule'])

for epoch in range(start_epoch + 1, 80):
    optimizer.zero_grad()
    optimizer.step()
    lr_schedule.step()
    if epoch % 10 == 0:
        print('epoch:', epoch)
        print('learning rate:', optimizer.state_dict()['param_groups'][0]['lr'])
        checkpoint = {
            "net": model.state_dict(),
            'optimizer': optimizer.state_dict(),
            "epoch": epoch,
            'lr_schedule': lr_schedule.state_dict()
        }
        if not os.path.isdir("./model_parameter/test"):
            os.makedirs("./model_parameter/test")
        torch.save(checkpoint, './model_parameter/test/ckpt_best_%s.pth' % (str(epoch)))
Setting the random seed so that training results are reproducible
In PyTorch, if you want to fix the result of every training run on the GPU or CPU by setting a random seed, add the following code at the start of the program:
import random

import numpy as np
import torch

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

# set the random seed
setup_seed(20)

# preprocess the data and train the model
# ...
# ...
With the seed fixed and the program's parameters unchanged, two training runs will produce identical results.
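One extra point, not covered by the snippet above: cuDNN's auto-tuner picks convolution algorithms by benchmarking, which can also vary between runs, so it is commonly disabled together with the deterministic flag. A minimal sketch:

torch.backends.cudnn.deterministic = True  # force deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False     # disable cuDNN algorithm auto-tuning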