#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8

1. The dataset structure of the multi-dataset in this example:

    MnistDataset_torch.zip
    ├── test
    └── train

    checkpoint_epoch1_0.73.zip
    ├── mnist_epoch1_0.73.pkl

2. Due to the adaptability of the A100, before using the training environment,
please use the recommended image of the platform with CUDA 11. Then adjust the
code and submit the image.
The image of this example is:
dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191

In the training environment, the uploaded dataset will be automatically placed
in the /dataset directory.
Note: the paths are different when selecting a single dataset and multiple
datasets.

(1) If it is a single dataset: if MnistDataset_torch.zip is selected, the
dataset directory is /dataset/train, /dataset/test.

The dataset structure of the single dataset in the training image in this
example:

    dataset
    ├── test
    └── train

(2) If multiple datasets are selected, such as MnistDataset_torch.zip and
checkpoint_epoch1_0.73.zip, the dataset directory is
/dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test and
/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl

The dataset structure in the training image for multiple datasets in this
example:

    dataset
    ├── MnistDataset_torch
    |   ├── test
    |   └── train
    └── checkpoint_epoch1_0.73
        ├── mnist_epoch1_0.73.pkl

The model download path is under /model by default. Please specify the model
output location to /model, and the Qizhi platform will provide file downloads
under the /model directory.
'''

import argparse
import os

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.datasets import mnist
from torchvision.transforms import ToTensor

from model import Model

# Directory the trained model files are written to.
MODEL_DIR = '/model'

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/MnistDataset_torch/train",
                    help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/MnistDataset_torch/test",
                    help='path to test dataset')
parser.add_argument('--checkpoint',
                    default="/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl",
                    help='checkpoint file')
parser.add_argument('--epoch_size', type=int, default=1,
                    help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256,
                    help='how much batch_size in epoch')


def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Puts ``model`` in training mode, performs one optimizer step per batch
    and prints the batch loss every 10th batch.
    """
    model.train()
    for idx, (train_x, train_label) in enumerate(loader):
        train_x = train_x.to(device)
        train_label = train_label.to(device)

        optimizer.zero_grad()
        predict_y = model(train_x.float())
        loss = criterion(predict_y, train_label.long())
        if idx % 10 == 0:
            print('idx: {}, loss: {}'.format(idx, loss.item()))
        loss.backward()
        optimizer.step()


def evaluate(model, loader, device):
    """Count correct predictions of ``model`` over ``loader``.

    Returns:
        A ``(correct, total)`` tuple of Python ints.
    """
    correct = 0
    total = 0
    model.eval()
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for test_x, test_label in loader:
            predict_y = model(test_x.to(device).float())
            predicted = predict_y.argmax(dim=-1)
            matches = predicted == test_label.to(device)
            correct += int(matches.sum().item())
            total += matches.shape[0]
    return correct, total


def main():
    """Train on MNIST, then evaluate and save the model after every epoch."""
    # parse_known_args: arguments not declared above are tolerated, not fatal.
    args, _unknown = parser.parse_known_args()

    # log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True,
                                transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False,
                               transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()

    epoch = args.epoch_size
    print('epoch_size is:{}'.format(epoch))

    # Load the trained model (disabled by default):
    # path = args.checkpoint
    # checkpoint = torch.load(path, map_location=device)
    # model.load_state_dict(checkpoint)
    # NOTE(review): this script saves the whole module via torch.save(model, ...);
    # if the checkpoint was produced that way, torch.load returns a module rather
    # than a state_dict -- confirm the checkpoint format before enabling.

    # The save below fails if the output directory is missing, so create it once.
    os.makedirs(MODEL_DIR, exist_ok=True)

    for _epoch in range(epoch):
        print('the {} epoch_size begin'.format(_epoch + 1))
        train_one_epoch(model, train_loader, sgd, cost, device)

        correct, total = evaluate(model, test_loader, device)
        # Guard against an empty test set instead of dividing by zero.
        accuracy = correct / total if total else 0.0
        print('accuracy: {:.2f}'.format(accuracy))

        # The model output location is placed under /model
        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy))


if __name__ == '__main__':
    main()