COVID-19 Cases Prediction |代码详解及strong baseline修改_综合

一、数据集下载

以下是在 Google Colab 中的下载方式；也可以直接使用已下载好的资源：https://download.csdn.net/download/qq_37767529/85063986

tr_path = 'covid.train.csv'  # path to training data
tt_path = 'covid.test.csv'   # path to testing data!gdown --id '19CCyCgJrUxtvgZF53vnctJiOJ23T5mqF' --output covid.train.csv
!gdown --id '1CE240jLm2npU-tdz81-oVKEF3T2yfT1O' --output covid.test.csv

基本信息分布：

二、导入相关包

导入常用的包，并固定随机种子、将 cuDNN 设置为确定性模式，以保证结果可复现

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

myseed = 42069  # set a random seed for reproducibility
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Seed every random number generator used below (NumPy, PyTorch CPU, PyTorch CUDA).
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

三、封装好的绘图函数

感兴趣可以查看文档研究

def get_device():
    ''' Get device (if GPU is available, use GPU)

    Returns:
        'cuda' when torch.cuda.is_available() is true, otherwise 'cpu'.
    '''
    return 'cuda' if torch.cuda.is_available() else 'cpu'


def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss)

    Args:
        loss_record: dict with the keys 'train' and 'dev', each holding a
            sequence of loss values.
        title: text inserted into the figure title.

    The dev points are spread over the training steps with a stride of
    len(train) // len(dev). The y-axis is fixed to the range [0, 5].
    '''
    train_loss = loss_record['train']
    dev_loss = loss_record['dev']
    total_steps = len(train_loss)
    x_1 = range(total_steps)
    figure(figsize=(6, 4))
    plt.plot(x_1, train_loss, c='tab:red', label='train')
    if dev_loss:
        # Guard against a zero stride, and trim both sequences to a common
        # length so that plt.plot never receives mismatched x/y sizes when
        # len(train) is not an exact multiple of len(dev).
        stride = max(1, total_steps // len(dev_loss))
        x_2 = x_1[::stride][:len(dev_loss)]
        plt.plot(x_2, dev_loss[:len(x_2)], c='tab:cyan', label='dev')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('RMSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Plot prediction of your DNN

    Args:
        dv_set: iterable yielding (x, y) tensor batches; only used when
            preds or targets is missing.
        model: callable applied to each batch x; put into eval mode first.
        device: device the batches are moved to before the forward pass.
        lim: upper bound of both axes and of the diagonal reference line.
        preds: optional precomputed predictions.
        targets: optional precomputed ground-truth values.

    If either preds or targets is None, both are recomputed from dv_set.
    '''
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    # Diagonal: points on this line are perfect predictions.
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()

四、处理数据集

from sklearn.model_selection import train_test_split#此处为pytorch中自定义数据集结构的规定写法
class COVID19Dataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset

    Args:
        path: path to the CSV file. The header row and the first (id)
            column are dropped.
        mode: 'train', 'dev' or 'test'. 'train' and 'dev' read the same file
            and take the two sides of a fixed random 9:1 split; 'test' has
            no target column.
        target_only: if True, use the 40 state columns plus the two
            tested_positive features (indices 57 & 75); otherwise use the
            hand-picked feature subset below.

    Raises:
        ValueError: if mode is not one of 'train', 'dev', 'test'.
    '''

    # Columns 0..39 are the state columns; only columns from this index on
    # are normalized.
    N_STATE_COLS = 40

    def __init__(self, path, mode='train', target_only=False):
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        self.mode = mode

        # Read data into numpy arrays
        with open(path, 'r', newline='') as fp:
            data = list(csv.reader(fp))
        data = np.array(data[1:])[:, 1:].astype(float)

        if not target_only:
            # Feature selection has already been applied here;
            # use list(range(93)) instead to keep every feature.
            feats = [40, 41, 42, 43, 57, 58, 59, 60, 61, 75, 76, 77, 78, 79, 92]
        else:
            # 40 states & 2 tested_positive features (indices = 57 & 75)
            feats = list(range(self.N_STATE_COLS)) + [57, 75]

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # Randomly split the rows into training and validation sets (9:1)
            # with sklearn's train_test_split; random_state is fixed so the
            # 'train' and 'dev' instances see complementary rows.
            train_indices, valid_indices = train_test_split(
                list(range(data.shape[0])), test_size=0.1, random_state=1)
            indices = train_indices if mode == 'train' else valid_indices

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        # Normalize features (you may remove this part to see what will happen)
        # The columns to normalize are located through their ORIGINAL index:
        # after feature selection the tensor may have fewer than 40 columns,
        # in which case a plain `[:, 40:]` slice would be empty and nothing
        # would be normalized.
        # NOTE(review): statistics are computed on this split only, so
        # train/dev/test are each normalized with their own mean/std.
        norm_cols = [i for i, f in enumerate(feats) if f >= self.N_STATE_COLS]
        if norm_cols:
            selected = self.data[:, norm_cols]
            self.data[:, norm_cols] = \
                (selected - selected.mean(dim=0, keepdim=True)) \
                / selected.std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]
        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            return self.data[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)

五、数据集加载器

def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    ''' Generates a dataset, then is put into a dataloader.

    Args:
        path: CSV file handed to COVID19Dataset.
        mode: 'train', 'dev' or 'test'; batches are shuffled only for 'train'.
        batch_size: number of samples per batch.
        n_jobs: number of worker processes used for loading.
        target_only: forwarded to COVID19Dataset.

    Returns:
        A DataLoader over the constructed COVID19Dataset.
    '''
    dataset = COVID19Dataset(path, mode=mode, target_only=target_only)  # Construct dataset
    # Arguments in order: the dataset, the batch size, whether to shuffle,
    # whether to drop a last batch smaller than batch_size, the number of
    # loader worker processes, and whether to use pinned host memory.
    dataloader = DataLoader(
        dataset,
        batch_size,
        shuffle=(mode == 'train'),
        drop_last=False,
        num_workers=n_jobs,
        # Pinned memory only helps host-to-GPU copies; requesting it on a
        # CPU-only machine just triggers a warning.
        pin_memory=torch.cuda.is_available(),
    )                                                                    # Construct dataloader
    return dataloader

六、深度学习网络

采用含一个隐藏层（64 个神经元）的全连接网络；损失函数采用 RMSE（原始代码采用 MSE）

class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network

    Architecture: Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, 1).

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the single hidden layer (default 64).
    '''

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

        # Mean squared error; cal_loss takes its square root, so the loss
        # actually optimized and reported is the RMSE.
        self.criterion = nn.MSELoss(reduction='mean')

        # Initialize every parameter (weights and biases) from N(0, 0.01^2).
        for param in self.net.parameters():
            nn.init.normal_(param, mean=0, std=0.01)

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        # squeeze(1) drops the trailing size-1 output dimension -> (batch_size,)
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss: root of the mean squared error between pred and target '''
        # TODO: you may implement L2 regularization here
        return torch.sqrt(self.criterion(pred, target))

七、训练函数

此处参数已经封装在config中

def train(tr_set, dv_set, model, config, device):
    ''' DNN training

    Args:
        tr_set: iterable of (x, y) training batches.
        dv_set: iterable of (x, y) validation batches.
        model: module exposing cal_loss(pred, target) and state_dict().
        config: dict with the keys 'n_epochs', 'optimizer' (name of a class
            in torch.optim), 'optim_hparas' (keyword arguments for that
            optimizer), 'early_stop' and 'save_path'.
        device: device every batch is moved to.

    Returns:
        (min_mse, loss_record): the best validation loss seen, and a dict
        with the per-step training losses under 'train' and the per-epoch
        validation losses under 'dev'.
    '''
    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    # Start from +inf so the first epoch always produces a checkpoint, even
    # when the loss is larger than any fixed initial threshold.
    min_mse = float('inf')
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()                           # set model to training mode
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            loss = model.cal_loss(pred, y)      # compute loss
            loss.backward()                     # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                  .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record


def dev(dv_set, model, device):
    ''' Validation: average loss of the model over dv_set

    train() calls this after every epoch. Each batch loss is weighted by the
    batch size, and the sum is divided by the number of samples seen.

    Args:
        dv_set: iterable of (x, y) validation batches.
        model: module exposing cal_loss(pred, target).
        device: device every batch is moved to.

    Returns:
        The sample-weighted average loss as a Python float.

    Raises:
        ValueError: if dv_set yields no samples.
    '''
    model.eval()                                # set model to evaluation mode
    total_loss = 0.0
    n_samples = 0
    for x, y in dv_set:                         # iterate through the dataloader
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            loss = model.cal_loss(pred, y)      # compute loss
        total_loss += loss.detach().cpu().item() * len(x)  # accumulate loss
        n_samples += len(x)
    if n_samples == 0:
        raise ValueError('validation set is empty')
    return total_loss / n_samples               # compute averaged loss

八、测试函数

def test(tt_set, model, device):
    ''' Run the model over tt_set and collect its predictions

    Args:
        tt_set: iterable of input batches (no targets).
        model: callable applied to each batch; put into eval mode first.
        device: device every batch is moved to.

    Returns:
        A numpy array with the predictions of all batches concatenated
        along dim 0; an empty array if tt_set yields no batches.
    '''
    model.eval()                                # set model to evaluation mode
    preds = []
    for x in tt_set:                            # iterate through the dataloader
        x = x.to(device)                        # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            preds.append(pred.detach().cpu())   # collect prediction
    if not preds:
        # torch.cat raises on an empty list; return an empty result instead.
        return torch.empty(0).numpy()
    preds = torch.cat(preds, dim=0).numpy()     # concatenate all predictions and convert to a numpy array
    return preds

九、参数设置

device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/
target_only = False                   # TODO: Using 40 states & 2 tested_positive features

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 3000,                # maximum number of epochs
    'batch_size': 270,               # mini-batch size for dataloader
    'optimizer': 'Adam',             # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.001,                 # learning rate
        # 'momentum': 0.9            # momentum for SGD (left out: Adam does not accept it)
    },
    'early_stop': 200,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'  # your model will be saved here
}

十、训练过程

# Build the three dataloaders; 'train' and 'dev' are read from the same file.
tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)

model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)
plot_learning_curve(model_loss_record, title='deep model')

# Discard the model in its final state and rebuild it from the checkpoint
# that train() saved at the best validation loss.
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)
plot_pred(dv_set, model, device)  # Show prediction on the validation set

# Prediction on the test set
def save_pred(preds, file):
    ''' Save predictions to specified file

    Writes a CSV with the header row ['id', 'tested_positive'] followed by
    one row per prediction, where id is the 0-based position in preds.

    Args:
        preds: iterable of predicted values.
        file: path of the CSV file to write (overwritten if it exists).
    '''
    print('Saving results to {}'.format(file))
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])


preds = test(tt_set, model, device)  # predict COVID-19 cases with your model
save_pred(preds, 'pred.csv')

学习曲线，以及预测值与真实值的对比散点图