一、数据集下载
以下为 Google Colab 中的下载方式;也可以直接使用已下载好的资源:https://download.csdn.net/download/qq_37767529/85063986
tr_path = 'covid.train.csv' # path to training data
tt_path = 'covid.test.csv' # path to testing data!gdown --id '19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF' --output covid.train.csv
!gdown --id '1CE240jLm2npU-tdz81-oVKEF3T2yfT1O' --output covid.test.csv
基本信息分布:
二、导入相关包
导入常用的包,并固定随机种子、设置 cuDNN 选项,以保证结果可复现
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Fix every source of randomness so that repeated runs are reproducible.
myseed = 42069  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)
三、封装好的绘图函数
感兴趣可以查看文档研究
def get_device():
    """Return 'cuda' when a CUDA device is available, otherwise 'cpu'."""
    return 'cuda' if torch.cuda.is_available() else 'cpu'


def plot_learning_curve(loss_record, title=''):
    """Plot the train and dev loss curves of a training run.

    Args:
        loss_record: dict with the keys 'train' and 'dev', each a sequence of
            loss values. The dev values are spread evenly over the training
            steps on the x axis.
        title: text appended to the figure title.
    """
    train_losses = loss_record['train']
    dev_losses = loss_record['dev']
    total_steps = len(train_losses)
    x_1 = range(total_steps)

    figure(figsize=(6, 4))
    plt.plot(x_1, train_losses, c='tab:red', label='train')
    if dev_losses:
        # One dev point every `stride` training steps. Guard against an empty
        # dev record (division by zero) and against a zero slice step.
        stride = max(1, total_steps // len(dev_losses))
        x_2 = x_1[::stride]
        plt.plot(x_2, dev_losses, c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('RMSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    """Scatter-plot predictions against ground-truth values.

    Args:
        dv_set: iterable yielding (x, y) batches; only consumed when `preds`
            or `targets` is not supplied.
        model: callable mapping a batch `x` to predictions.
        device: device the batches are moved to before the forward pass.
        lim: upper bound of both axes and of the diagonal reference line.
        preds: optional precomputed predictions.
        targets: optional precomputed ground-truth values.
    """
    if preds is None or targets is None:
        # Run the model over the whole set to collect predictions and targets.
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot([-0.2, lim], [-0.2, lim], c='b')  # y = x reference line
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()
四、处理数据集
from sklearn.model_selection import train_test_split#此处为pytorch中自定义数据集结构的规定写法
class COVID19Dataset(Dataset):
    """Dataset for loading and preprocessing the COVID19 dataset.

    Args:
        path: path of the CSV file. The header row and the first (id) column
            are dropped.
        mode: 'train', 'dev' or 'test'. 'train' and 'dev' are a random 9:1
            split of the same file and yield (features, target) pairs;
            'test' yields features only.
        target_only: when True, use the 40 state columns plus the two
            tested_positive columns (indices 57 and 75) instead of the
            hand-picked feature list.

    Raises:
        ValueError: if `mode` is not one of the three supported values.
    """

    def __init__(self, path, mode='train', target_only=False):
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into numpy arrays
        with open(path, 'r') as fp:
            data = list(csv.reader(fp))
        data = np.array(data[1:])[:, 1:].astype(float)

        if not target_only:
            # feats = list(range(93)) would select every feature.
            # The list below is the result of a prior feature selection.
            feats = [40, 41, 42, 43, 57, 58, 59, 60, 61, 75, 76, 77, 78, 79, 92]
        else:
            # 40 states & 2 tested_positive features (indices = 57 & 75)
            feats = list(range(40)) + [57, 75]

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # Randomly split the rows into training and validation sets (9:1)
            # with sklearn's train_test_split; random_state keeps the split
            # identical for the 'train' and the 'dev' instance.
            train_indices, valid_indices = train_test_split(
                list(range(data.shape[0])), test_size=0.1, random_state=1)

            if mode == 'train':
                chosen = train_indices
            else:
                chosen = valid_indices
            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[chosen])
            self.target = torch.FloatTensor(target[chosen])

        # Normalize features (you may remove this part to see what will happen)
        # NOTE(review): columns from index 40 onwards are standardised. With
        # the 15-column feature selection above this slice is empty, so
        # nothing is normalised; it only takes effect when at least 41
        # columns are selected (e.g. target_only=True). Statistics are
        # computed per split, not shared between train/dev/test.
        self.data[:, 40:] = \
            (self.data[:, 40:] - self.data[:, 40:].mean(dim=0, keepdim=True)) \
            / self.data[:, 40:].std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]

        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        """Return one sample: (features, target) for train/dev, features for test."""
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        # For testing (no target)
        return self.data[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.data)
五、数据集加载器
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    """Build a COVID19Dataset and wrap it in a DataLoader.

    Args:
        path: CSV file passed to COVID19Dataset.
        mode: 'train', 'dev' or 'test'; only the 'train' loader is shuffled.
        batch_size: number of samples per batch.
        n_jobs: number of worker processes used for loading.
        target_only: forwarded to COVID19Dataset.

    Returns:
        The DataLoader; the underlying dataset is reachable as `.dataset`.
    """
    dataset = COVID19Dataset(path, mode=mode, target_only=target_only)  # Construct dataset
    # DataLoader arguments, in order: dataset, batch size, whether to shuffle,
    # whether to drop the last incomplete batch, number of loader workers.
    # Pinned memory only helps host-to-GPU copies, so it is requested only
    # when CUDA is available (avoids a warning on CPU-only machines).
    dataloader = DataLoader(
        dataset, batch_size,
        shuffle=(mode == 'train'), drop_last=False,
        num_workers=n_jobs,
        pin_memory=torch.cuda.is_available())  # Construct dataloader
    return dataloader
六、深度学习网络
采用含 64 个神经元的单隐藏层全连接网络;损失函数采用 RMSE(原始代码采用 MSE)
class NeuralNet(nn.Module):
    """A simple fully-connected network: input_dim -> 64 -> ReLU -> 1.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

        # Mean squared error loss (cal_loss takes its square root)
        self.criterion = nn.MSELoss(reduction='mean')

        # Initialise every parameter (weights and biases) from N(0, 0.01^2).
        for param in self.net.parameters():
            nn.init.normal_(param, mean=0, std=0.01)

    def forward(self, x):
        """Given input of size (batch_size x input_dim), return predictions of size (batch_size,)."""
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        """Return the root mean squared error between `pred` and `target`."""
        # TODO: you may implement L2 regularization here
        return torch.sqrt(self.criterion(pred, target))
七、训练函数
此处参数已经封装在config中
def dev(dv_set, model, device):
    """Return the sample-weighted average loss of `model` over `dv_set`.

    NOTE(review): `train` calls `dev`, but no definition is visible in this
    file, so this helper was reconstructed. It averages `model.cal_loss` over
    the batches, weighting each batch by its size; confirm that this matches
    the intended validation metric.
    """
    model.eval()  # set model to evaluation mode
    total_loss = 0.0
    for x, y in dv_set:
        x, y = x.to(device), y.to(device)
        with torch.no_grad():
            pred = model(x)
            batch_loss = model.cal_loss(pred, y)
        total_loss += batch_loss.item() * len(x)
    return total_loss / len(dv_set.dataset)


def train(tr_set, dv_set, model, config, device):
    """Train `model`, checkpointing whenever the dev loss improves.

    Args:
        tr_set: DataLoader yielding (x, y) training batches.
        dv_set: DataLoader used for validation after every epoch.
        model: module exposing `cal_loss(pred, target)`.
        config: dict with the keys 'n_epochs', 'optimizer' (a class name in
            torch.optim), 'optim_hparas' (keyword arguments for that
            optimizer), 'early_stop' and 'save_path'.
        device: device the batches are moved to.

    Returns:
        (min_mse, loss_record): the best dev loss seen, and a dict holding
        the per-step 'train' losses and the per-epoch 'dev' losses. The
        values are whatever `model.cal_loss` computes.
    """
    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    min_mse = 1000.
    loss_record = {'train': [], 'dev': []}  # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()  # set model to training mode
        for x, y in tr_set:  # iterate through the dataloader
            optimizer.zero_grad()  # set gradient to zero
            x, y = x.to(device), y.to(device)  # move data to device (cpu/cuda)
            pred = model(x)  # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()  # compute gradient (backpropagation)
            optimizer.step()  # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                  .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record
八、测试函数
def test(tt_set, model, device):
    """Run `model` over every batch of `tt_set` and return all predictions.

    Args:
        tt_set: iterable yielding feature batches (no targets).
        model: callable mapping a batch to a tensor of predictions.
        device: device the batches are moved to.

    Returns:
        A numpy array with the predictions of all batches concatenated along
        the first axis.
    """
    model.eval()  # set model to evaluation mode
    preds = []
    for x in tt_set:  # iterate through the dataloader
        x = x.to(device)  # move data to device (cpu/cuda)
        with torch.no_grad():  # disable gradient calculation
            pred = model(x)  # forward pass (compute output)
            preds.append(pred.detach().cpu())  # collect prediction
    # concatenate all predictions and convert to a numpy array
    preds = torch.cat(preds, dim=0).numpy()
    return preds


# The function name starts with "test", which pytest would otherwise try to
# collect as a test case; mark it explicitly as not being one.
test.__test__ = False
九、参数设置
device = get_device() # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True) # The trained model will be saved to ./models/
target_only = False # TODO: Using 40 states & 2 tested_positive features
# TODO: How to tune these hyper-parameters to improve your model's performance?
# Hyper-parameters consumed by the data loaders and by the training loop.
config = {
    'n_epochs': 3000,                 # maximum number of epochs
    'batch_size': 270,                # mini-batch size for dataloader
    'optimizer': 'Adam',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                 # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.001,                  # learning rate
        # When switching 'optimizer' to 'SGD', a 'momentum' entry (e.g. 0.9)
        # can be added here; Adam does not accept that argument.
    },
    'early_stop': 200,                # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'   # your model will be saved here
}
十、训练过程
# Build the three data loaders. The training and dev loaders read the same
# file and differ only in which part of the split they expose.
tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)

model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device

model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)

plot_learning_curve(model_loss_record, title='deep model')

# Discard the last-epoch weights and reload the best checkpoint saved by train().
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)
plot_pred(dv_set, model, device)  # Show prediction on the validation set

# Next step: prediction on the test set.
def save_pred(preds, file):
    """Save predictions to `file` as CSV with the columns id, tested_positive.

    Args:
        preds: iterable of predicted values; the row id is the position of
            the value in the iterable.
        file: path of the CSV file to write (overwritten if it exists).
    """
    print('Saving results to {}'.format(file))
    # newline='' lets the csv module control line endings itself, which
    # prevents blank lines between rows on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])


preds = test(tt_set, model, device)  # predict COVID-19 cases with your model
save_pred(preds, 'pred.csv')         # save prediction file to pred.csv
学习曲线,以及预测值线性关系