PyTorch provides three functions related to saving and loading models (a minimal usage sketch follows the list):
1. torch.save: serializes an object and saves it to disk, using Python's pickle utility for the serialization. It can be used to save models, tensors, and dictionaries of all kinds of objects.
2. torch.load: uses pickle's unpickling facilities to deserialize a pickled object file into memory, i.e. it loads whatever torch.save stored.
3. torch.nn.Module.load_state_dict: loads a model's parameter dictionary from a deserialized state_dict. Note that this means its argument must be a state_dict, i.e. the object returned by torch.load.
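As a minimal sketch of how the three functions relate (model stands for any existing nn.Module instance, and model.pth is just a placeholder file name):

import torch

# torch.save: pickle an object (here, the model's parameter dictionary) to disk
torch.save(model.state_dict(), "model.pth")

# torch.load: unpickle the file back into an ordinary dictionary of tensors
state_dict = torch.load("model.pth")

# torch.nn.Module.load_state_dict: copy those tensors into the model's parameters
model.load_state_dict(state_dict)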
Model definition:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define model
class TheModelClass(nn.Module):
    def __init__(self):
        super(TheModelClass, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize model
model = TheModelClass()

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])
output:
Model's state_dict:
conv1.weight torch.Size([6, 3, 5, 5])
conv1.bias torch.Size([6])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
fc1.weight torch.Size([120, 400])
fc1.bias torch.Size([120])
fc2.weight torch.Size([84, 120])
fc2.bias torch.Size([84])
fc3.weight torch.Size([10, 84])
fc3.bias torch.Size([10])
Optimizer's state_dict:
state {}
param_groups [{'lr': 0.001, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [...]}]
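For inference, the usual workflow is to save only the model's state_dict shown above and load it back into a freshly constructed model. A minimal sketch, using the placeholder file name model_weights.pth:

# save only the learnable parameters (the recommended approach for inference)
torch.save(model.state_dict(), "model_weights.pth")

# later: rebuild the model and copy the saved parameters into it
model = TheModelClass()
model.load_state_dict(torch.load("model_weights.pth"))
model.eval()  # put dropout/batch-norm layers into evaluation mode before inference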
Example: resuming training
The functions for saving and loading a checkpoint are as follows:
import os

def save_checkpoint_state(dir, epoch, model, optimizer):
    # save the model, optimizer state and current epoch as one checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    if not os.path.isdir(dir):
        os.mkdir(dir)
    torch.save(checkpoint, os.path.join(dir, 'checkpoint-epoch%d.tar' % (epoch)))

def get_checkpoint_state(dir, ckp_name, device, model, optimizer):
    # restore the training state from the last checkpoint
    print("Resume from checkpoint...")
    checkpoint = torch.load(os.path.join(dir, ckp_name), map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    epoch = checkpoint['epoch']
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    print('successfully recovered from the last state')
    return model, epoch, optimizer
If an lr_scheduler is used, its state_dict also has to be added to the checkpoint.
Putting it to use:
# imports omitted

# function to save a checkpoint
def save_checkpoint_state(epoch, model, optimizer, scheduler, running_loss):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict()
    }
    torch.save(checkpoint, "checkpoint-epoch%d-loss%d.tar" % (epoch, running_loss))

# function to load a checkpoint
def load_checkpoint_state(path, device, model, optimizer, scheduler):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    epoch = checkpoint["epoch"]
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    return model, epoch, optimizer, scheduler
# whether to resume training (set to True when resuming)
resume = False  # True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train():
    trans = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomResizedCrop(512),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(90),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # get training dataset
    leafDiseaseCLS = CustomDataSet(images_path, is_to_ls, trans)
    data_loader = DataLoader(leafDiseaseCLS,
                             batch_size=16,
                             num_workers=0,
                             shuffle=True,
                             pin_memory=False)
    # get model
    model = EfficientNet.from_pretrained("efficientnet-b3")
    # extract the input size of the fully connected layer
    fc_features = model._fc.in_features
    # modify the number of classes
    model._fc = nn.Linear(fc_features, 5)
    model.to(device)
    # optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=0.001,
                          momentum=0.9,
                          weight_decay=5e-4)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[6, 10], gamma=1/3.)
    # loss
    # loss_func = nn.CrossEntropyLoss()
    loss_func = FocalCosineLoss()
    start_epoch = -1
    if resume:
        model, start_epoch, optimizer, scheduler = load_checkpoint_state("../path/to/checkpoint.tar",
                                                                         device,
                                                                         model,
                                                                         optimizer,
                                                                         scheduler)
    model.train()
    epochs = 3
    for epoch in range(start_epoch + 1, epochs):
        running_loss = 0.0
        print("Epoch {}/{}".format(epoch, epochs))
        for step, train_data in tqdm(enumerate(data_loader)):
            x_train, y_train = train_data
            x_train = x_train.to(device)
            y_train = y_train.to(device)
            # forward
            prediction = model(x_train)
            optimizer.zero_grad()
            loss = loss_func(prediction, y_train)
            running_loss += loss.item()
            # backward
            loss.backward()
            optimizer.step()
        # step the scheduler once per epoch (the milestones are measured in epochs)
        scheduler.step()
        # saving model: raw weights plus a full checkpoint every epoch
        torch.save(model.state_dict(), str(int(running_loss)) + "_" + str(epoch) + ".pth")
        save_checkpoint_state(epoch, model, optimizer, scheduler, running_loss)
        print("Loss:{}".format(running_loss))

if __name__ == "__main__":
    train()
Loading part of a pretrained model
Most of the time we need to adapt the model to our own task, so it is hard to guarantee that it is exactly the same as the published one; yet the pretrained parameters really do help training accuracy. To get the best of both, we load only the part of the pretrained model that matches ours.
pretrained_dict = torch.load("model_data/yolo_weights.pth", map_location=device)
model_dict = model.state_dict()
# drop the keys in pretrained_dict that do not exist in model_dict
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
# optionally also require matching shapes:
# pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
# update the current model_dict
model_dict.update(pretrained_dict)
# load the state_dict we actually need
model.load_state_dict(model_dict)
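A shorter alternative worth knowing (a sketch, not part of the original snippet) is to pass strict=False, which makes load_state_dict ignore missing and unexpected keys and report them instead of raising an error; note that keys present in both the checkpoint and the model must still have matching shapes:

# load whatever overlaps by name; missing/unexpected keys are reported, not fatal
incompatible = model.load_state_dict(
    torch.load("model_data/yolo_weights.pth", map_location=device),
    strict=False)
print(incompatible.missing_keys)      # parameters the checkpoint did not provide
print(incompatible.unexpected_keys)   # checkpoint entries the model has no use for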
Saving/loading models across devices (CPU and GPU)
Model saved on a GPU, loaded on a CPU
- Save:
torch.save(model.state_dict(), PATH)
- Load:
device = torch.device('cpu')
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH, map_location=device))
Model saved on a GPU, loaded on a GPU
- Save:
torch.save(model.state_dict(), PATH)
- Load:
device = torch.device("cuda")
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH))
model.to(device)
# Make sure to call input = input.to(device) on any input tensors that you feed to the model
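For completeness, a sketch of the remaining common case, a model saved on a CPU and loaded onto a GPU (the string "cuda:0" assumes a single-GPU machine):

device = torch.device("cuda")
model = TheModelClass(*args, **kwargs)
# map the CPU-saved tensors onto the GPU while loading
model.load_state_dict(torch.load(PATH, map_location="cuda:0"))
# the model itself must be moved to the GPU as well
model.to(device)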
Key point: restoring the epoch
When saving, the epoch needs to be saved as well.
Code: save the model every N epochs:
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 30, 40, 50], gamma=0.1)
start_epoch = 9
# print(lr_schedule)

if RESUME:
    path_checkpoint = "./model_parameter/test/ckpt_best_50.pth"  # checkpoint path
    checkpoint = torch.load(path_checkpoint)  # load the checkpoint
    model.load_state_dict(checkpoint['net'])  # restore the model's learnable parameters
    optimizer.load_state_dict(checkpoint['optimizer'])  # restore the optimizer state
    start_epoch = checkpoint['epoch']  # set the starting epoch
    lr_schedule.load_state_dict(checkpoint['lr_schedule'])

for epoch in range(start_epoch + 1, 80):
    optimizer.zero_grad()
    optimizer.step()
    lr_schedule.step()
    if epoch % 10 == 0:
        print('epoch:', epoch)
        print('learning rate:', optimizer.state_dict()['param_groups'][0]['lr'])
        checkpoint = {
            "net": model.state_dict(),
            'optimizer': optimizer.state_dict(),
            "epoch": epoch,
            'lr_schedule': lr_schedule.state_dict()
        }
        if not os.path.isdir("./model_parameter/test"):
            os.makedirs("./model_parameter/test")
        torch.save(checkpoint, './model_parameter/test/ckpt_best_%s.pth' % (str(epoch)))
Setting the random seed so that training results are reproducible
In PyTorch, if you want to fix the result of every training run on the GPU or CPU by setting a random seed, add the following code at the start of the program:
import random

import numpy as np
import torch

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

# set the random seed
setup_seed(20)

# preprocess the data and train the model
# ...
# ...
With the seed fixed and the program's parameters unchanged, two training runs will produce identical results.
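One extra point, not covered by the snippet above: cuDNN's auto-tuner picks convolution algorithms by benchmarking, which can also vary between runs, so it is commonly disabled together with the deterministic flag. A minimal sketch:

torch.backends.cudnn.deterministic = True  # force deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False     # disable cuDNN algorithm auto-tuning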