
A Detailed Walkthrough of YOLO-V3-SPP models.py


Preface

This post links to
YOLO-V3-SPP
If you are interested, see that earlier article for a detailed walkthrough of YOLO-V3-SPP.

model workflow

(based on the source-code version shared by the Bilibili uploader 霹雳吧啦Wz)
[Figure: model workflow diagram]

models.py

For now this post only covers the annotated source code; later I will turn the models.py workflow into a flowchart.
Building on 霹雳吧啦Wz's annotations of the source, I have added my own detailed analysis.
(While reading, please skip over the ONNX-related parts; I have not worked with ONNX export yet and will annotate it in detail once I use it.)
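Before the line-by-line reading, here is a minimal sketch of how the pieces below are meant to fit together. The cfg path and the 512 input size are my own placeholder assumptions, not values from the repo:

import torch
from models import Darknet  # the file annotated below

# Darknet.__init__ parses the cfg and builds the layers through create_modules;
# "cfg/yolov3-spp.cfg" is a placeholder path, adjust it to your checkout
model = Darknet("cfg/yolov3-spp.cfg", img_size=512)
print(len(model.module_list))  # one entry per cfg block
print(model.yolo_layers)       # indices of the three YOLOLayer modules, e.g. [89, 101, 113]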

from build_utils.layers import *
from build_utils.parse_config import *
from build_utils import torch_utils

# math / torch / nn are listed explicitly so this listing stands alone;
# in the repo they are available through the wildcard imports above
import math
import torch
import torch.nn as nn

ONNX_EXPORT = False


# Note: the first parameter carries a ":" annotation; it is only a type hint, not enforced
def create_modules(modules_defs: list, img_size):
    """
    Constructs module list of layer blocks from module configuration in module_defs
    :param modules_defs: list with one entry per layer structure, parsed from the .cfg file
    :param img_size:
    :return:
    """
    img_size = [img_size] * 2 if isinstance(img_size, int) else img_size
    # drop the first entry of the parsed cfg list (the [net] section)
    modules_defs.pop(0)  # cfg training hyperparams (unused)
    # output_filters holds the input channels of every layer; entry 0 is the image's 3 channels
    output_filters = [3]  # input channels
    module_list = nn.ModuleList()
    # record which layers' outputs are consumed by later layers (feature fusion or
    # concatenation), similar to residual connections
    routs = []  # list of layers which route to deeper layers
    yolo_index = -1

    # build every layer in turn; i is the physical index, starting at 0, i.e. the layer number
    for i, mdef in enumerate(modules_defs):
        modules = nn.Sequential()

        if mdef["type"] == "convolutional":
            bn = mdef["batch_normalize"]  # 1 or 0 / use or not
            filters = mdef["filters"]
            k = mdef["size"]  # kernel size
            stride = mdef["stride"] if "stride" in mdef else (mdef["stride_y"], mdef["stride_x"])
            if isinstance(k, int):
                modules.add_module("Conv2d", nn.Conv2d(in_channels=output_filters[-1],
                                                       out_channels=filters,
                                                       kernel_size=k,
                                                       stride=stride,
                                                       padding=k // 2 if mdef["pad"] else 0,
                                                       bias=not bn))
            else:
                raise TypeError("conv2d filter size must be int type.")

            if bn:
                modules.add_module("BatchNorm2d", nn.BatchNorm2d(filters))
            else:
                # a convolution without a BN layer is one of YOLO's predictor layers
                routs.append(i)  # detection output (goes into yolo layer)

            if mdef["activation"] == "leaky":
                modules.add_module("activation", nn.LeakyReLU(0.1, inplace=True))
            else:
                pass

        elif mdef["type"] == "BatchNorm2d":
            pass

        elif mdef["type"] == "maxpool":
            k = mdef["size"]  # kernel size
            stride = mdef["stride"]
            modules = nn.MaxPool2d(kernel_size=k, stride=stride, padding=(k - 1) // 2)

        elif mdef["type"] == "upsample":
            if ONNX_EXPORT:  # explicitly state size, avoid scale_factor
                g = (yolo_index + 1) * 2 / 32  # gain
                modules = nn.Upsample(size=tuple(int(x * g) for x in img_size))
            else:
                modules = nn.Upsample(scale_factor=mdef["stride"])

        # route layers only appear in the SPP and FPN structures
        elif mdef["type"] == "route":  # [-2], [-1,-3,-5,-6], [-1, 61]
            layers = mdef["layers"]
            # If l > 0, add 1: such an l is an absolute layer index, but output_filters counts
            # the input image as its first entry while the cfg numbering does not, so the index
            # must be shifted by one. If l <= 0, it is an offset relative to the current layer,
            # not an absolute index, so no correction is needed.
            filters = sum([output_filters[l + 1 if l > 0 else l] for l in layers])
            # extend appends several values at the end of the list
            routs.extend([i + l if l < 0 else l for l in layers])
            # custom concatenation module
            modules = FeatureConcat(layers=layers)

        # residual module
        elif mdef["type"] == "shortcut":
            # careful: the current layer is the one right after the residual block
            layers = mdef["from"]
            # channel count of the residual block's last conv layer; output_filters[-1]
            # is the previous layer, i.e. the last layer of the block
            filters = output_filters[-1]
            # routs.extend([i + l if l < 0 else l for l in layers])
            # i + layers[0] is the index of the layer preceding the residual module
            routs.append(i + layers[0])
            modules = WeightedFeatureFusion(layers=layers, weight="weights_type" in mdef)

        elif mdef["type"] == "yolo":
            # yolo_index starts at -1
            yolo_index += 1  # which yolo_layer this is: [0, 1, 2]
            stride = [32, 16, 8]  # downsampling factor of each prediction feature map w.r.t. the input image
            # the cfg lists many anchors; keep only the ones selected by mask
            modules = YOLOLayer(anchors=mdef["anchors"][mdef["mask"]],  # anchor list
                                nc=mdef["classes"],  # number of classes
                                img_size=img_size,
                                stride=stride[yolo_index])

            # Initialize preceding Conv2d() bias (https://arxiv.org/pdf/1708.02002.pdf section 3.3)
            # Note: the original annotation flags this block as having no effect; indeed, on
            # recent PyTorch the in-place edits on a view of a leaf Parameter raise an error
            # and fall into the except branch below, so the "smart" init may never take effect
            try:
                j = -1  # the layer right before the YOLOLayer
                # bias of the last module's Conv2d; the last conv has 255 kernels
                bias_ = module_list[j][0].bias  # shape(255,), index 0 is the Conv2d inside the Sequential
                bias = bias_.view(modules.na, -1)  # shape(3, 85)
                bias[:, 4] += -4.5  # obj: column 5 is the objectness bias
                # class-probability biases start at column 6
                bias[:, 5:] += math.log(0.6 / (modules.nc - 0.99))  # cls (sigmoid(p) = 1/nc)
                module_list[j][0].bias = torch.nn.Parameter(bias_, requires_grad=bias_.requires_grad)
            except Exception as e:
                print('WARNING: smart bias initialization failure.', e)
        else:
            print("Warning: Unrecognized Layer Type: " + mdef["type"])

        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)

    # a list of len(modules_defs) False entries
    routs_binary = [False] * len(modules_defs)
    # flag the positions whose outputs must be kept: the layer indices feeding the
    # SPP, FPN, residual and predictor modules
    for i in routs:
        routs_binary[i] = True
    return module_list, routs_binary
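To make the loop above concrete, here is a tiny, hand-written stand-in for what parse_model_cfg would produce: a [net] block (popped and ignored) followed by a single convolutional entry. The dict values are illustrative and only mirror the keys used by create_modules; they are not taken from an actual cfg file:

# hypothetical parsed cfg: [net] plus one conv layer
modules_defs = [
    {"type": "net"},  # removed by modules_defs.pop(0)
    {"type": "convolutional", "batch_normalize": 1, "filters": 32,
     "size": 3, "stride": 1, "pad": 1, "activation": "leaky"},
]
module_list, routs_binary = create_modules(modules_defs, img_size=512)
print(module_list[0])  # Sequential(Conv2d(3, 32, 3, stride=1, padding=1, bias=False), BatchNorm2d(32), LeakyReLU(0.1))
print(routs_binary)    # [False]: no layer output needs to be cached in this toy config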
# The YOLO layer comes right after each of the network's three predictors;
# it is instantiated inside create_modules
class YOLOLayer(nn.Module):
    """Post-processes YOLO's raw output"""
    def __init__(self, anchors, nc, img_size, stride):
        super(YOLOLayer, self).__init__()
        self.anchors = torch.Tensor(anchors)  # convert the numpy anchors to a tensor
        self.stride = stride  # layer stride: step on the original image per feature-map cell, [32, 16, 8]
        self.na = len(anchors)  # number of anchors (3)
        self.nc = nc  # number of classes (80)
        # number of predicted parameters per anchor
        self.no = nc + 5  # number of outputs (85: x, y, w, h, obj, cls1, ...)
        self.nx, self.ny, self.ng = 0, 0, (0, 0)  # initialize number of x, y gridpoints
        # scale the anchors down to grid scale
        self.anchor_vec = self.anchors / self.stride
        # after view, the dims stand for: batch_size, na, grid_h, grid_w, wh;
        # the size-1 dims are not fixed values, broadcasting expands them later, similar to reshape
        self.anchor_wh = self.anchor_vec.view(1, self.na, 1, 1, 2)
        self.grid = None

        if ONNX_EXPORT:
            self.training = False
            self.create_grids((img_size[1] // stride, img_size[0] // stride))  # number x, y grid points

    def create_grids(self, ng=(13, 13), device="cpu"):
        """
        Update the grid info and generate new grid parameters
        :param ng: feature-map size
        :param device:
        :return:
        """
        self.nx, self.ny = ng
        self.ng = torch.tensor(ng, dtype=torch.float)

        # build xy offsets: the anchor xy offset of every cell (on the feature map)
        if not self.training:  # training never regresses to final boxes, so the grid is not needed
            # torch.arange(start=0, end=13) excludes end and has dtype int64,
            # i.e. torch.arange(num) yields the tensor 0..num-1
            # torch.meshgrid([0,...,ny-1], [0,...,nx-1]):
            # yv repeats each element of [0,...,ny-1] across its row (nx copies),
            #    giving a (ny, nx) matrix with the grid's y coordinates
            # xv repeats each element of [0,...,nx-1] down its column (ny copies),
            #    giving a (ny, nx) matrix with the grid's x coordinates
            yv, xv = torch.meshgrid([torch.arange(self.ny, device=device),
                                     torch.arange(self.nx, device=device)])
            # batch_size, na, grid_h, grid_w, wh
            # stacking xv and yv along dim 2 yields a 13x13x2 tensor of grid coordinates;
            # x coordinates follow xv's ordering, y coordinates follow yv's;
            # view prepends two dims for batch_size and na (anchors)
            self.grid = torch.stack((xv, yv), 2).view((1, 1, self.ny, self.nx, 2)).float()

        if self.anchor_vec.device != device:
            self.anchor_vec = self.anchor_vec.to(device)
            self.anchor_wh = self.anchor_wh.to(device)

    def forward(self, p):
        if ONNX_EXPORT:
            bs = 1  # batch size
        else:
            bs, _, ny, nx = p.shape  # batch_size, predict_param(255), grid(13), grid(13)
            if (self.nx, self.ny) != (nx, ny) or self.grid is None:  # fix no grid bug
                self.create_grids((nx, ny), p.device)

        # view: (batch_size, 255, 13, 13) -> (batch_size, 3, 85, 13, 13)
        # permute: (batch_size, 3, 85, 13, 13) -> (batch_size, 3, 13, 13, 85)
        # [bs, anchor, grid, grid, xywh + obj + classes]
        # permute reorders the dims; afterwards p is no longer contiguous in memory,
        # so contiguous() must be called
        p = p.view(bs, self.na, self.no, self.ny, self.nx).permute(0, 1, 3, 4, 2).contiguous()  # prediction

        if self.training:
            # in training mode, return p as is
            return p
        elif ONNX_EXPORT:
            # Avoid broadcasting for ANE operations
            m = self.na * self.nx * self.ny  # 3 * nx * ny
            ng = 1. / self.ng.repeat(m, 1)
            grid = self.grid.repeat(1, self.na, 1, 1, 1).view(m, 2)
            anchor_wh = self.anchor_wh.repeat(1, 1, self.nx, self.ny, 1).view(m, 2) * ng

            p = p.view(m, self.no)
            # xy = torch.sigmoid(p[:, 0:2]) + grid  # x, y
            # wh = torch.exp(p[:, 2:4]) * anchor_wh  # width, height
            # p_cls = torch.sigmoid(p[:, 4:5]) if self.nc == 1 else \
            #     torch.sigmoid(p[:, 5:self.no]) * torch.sigmoid(p[:, 4:5])  # conf
            p[:, :2] = (torch.sigmoid(p[:, 0:2]) + grid) * ng  # x, y
            p[:, 2:4] = torch.exp(p[:, 2:4]) * anchor_wh  # width, height
            p[:, 4:] = torch.sigmoid(p[:, 4:])
            p[:, 5:] = p[:, 5:self.no] * p[:, 4:5]
            return p
        else:  # inference or validation
            # [bs, anchor, grid, grid, xywh + obj + classes]
            io = p.clone()  # inference output
            # clone returns a copy with the same size and dtype as the original tensor;
            # unlike copy_(), it is recorded in the computation graph, so gradients flowing
            # into the clone propagate back to the original
            # grid has shape [batch_size, na, grid_h, grid_w, wh]; it matches the xy slice
            # (first two elements) of io's last dim, so the two can be added
            io[..., :2] = torch.sigmoid(io[..., :2]) + self.grid  # xy on the feature map, the paper's sigmoid(tx) + cx
            # anchor_wh, shape [batch_size, na, grid_h, grid_w, wh], matches the wh slice
            # (elements 3 and 4) of io's last dim, so the two can be multiplied
            io[..., 2:4] = torch.exp(io[..., 2:4]) * self.anchor_wh  # wh yolo method: wh on the feature map
            io[..., :4] *= self.stride  # map back to the original image scale
            # obj and class predictions go through a sigmoid
            torch.sigmoid_(io[..., 4:])
            return io.view(bs, -1, self.no), p  # view [1, 3, 13, 13, 85] as [1, 507, 85]; 3 x 13 x 13 = 507
            # here p keeps the shape [bs, anchor, grid, grid, xywh + obj + classes]
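The decoding in forward is easier to see with concrete numbers. Below is a small sketch, with invented raw outputs, that reproduces the grid construction from create_grids and the sigmoid(tx) + cx / exp(tw) * pw arithmetic on a 2x2 grid (the 116x90 anchor is one of YOLOv3's stride-32 anchors; everything else is made up for illustration):

import torch

ny, nx, stride = 2, 2, 32  # tiny 2x2 feature map at 32x downsampling
yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
grid = torch.stack((xv, yv), 2).view(1, 1, ny, nx, 2).float()
# xv = [[0, 1], [0, 1]] holds each cell's x offset, yv = [[0, 0], [1, 1]] its y offset

anchor = torch.tensor([116., 90.]) / stride  # anchor scaled to grid units, like anchor_vec
txy = torch.tensor([0.2, -0.3])              # invented raw xy outputs for one cell
twh = torch.tensor([0.1, 0.4])               # invented raw wh outputs

cell = grid[0, 0, 1, 1]                      # offsets (cx, cy) = (1, 1) for the bottom-right cell
xy = (torch.sigmoid(txy) + cell) * stride    # box center on the original image, ~(49.6, 45.6)
wh = torch.exp(twh) * anchor * stride        # box size on the original image, ~(128.2, 134.3)
print(xy, wh)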
class Darknet(nn.Module):
    """YOLOv3 spp object detection model"""
    def __init__(self, cfg, img_size=(416, 416), verbose=False):
        # verbose is a print switch, off by default
        super(Darknet, self).__init__()
        # the img_size passed here only matters when exporting an ONNX model;
        # isinstance checks whether img_size is an int and returns a bool (ONNX related)
        self.input_size = [img_size] * 2 if isinstance(img_size, int) else img_size
        # parse the network's .cfg file and return the list of module dicts
        self.module_defs = parse_model_cfg(cfg)
        # build the network layer by layer from the parsed structure via create_modules
        self.module_list, self.routs = create_modules(self.module_defs, img_size)
        # get the indices of all YOLOLayer modules: 89, 101, 113
        self.yolo_layers = get_yolo_layers(self)

        # print the model info; with verbose=True the detailed version is printed
        self.info(verbose) if not ONNX_EXPORT else None  # print model description

    def forward(self, x, verbose=False):
        # x is the input data
        return self.forward_once(x, verbose=verbose)

    def forward_once(self, x, verbose=False):
        # yolo_out collects the output of every yolo_layer;
        # out collects the output of every module that must be cached, i.e. those flagged in routs
        yolo_out, out = [], []
        if verbose:
            print('0', x.shape)
            str = ""

        for i, module in enumerate(self.module_list):
            name = module.__class__.__name__
            if name in ["WeightedFeatureFusion", "FeatureConcat"]:  # sum, concat
                if verbose:
                    # SPP / FPN / residual module classes carry a layers attribute of relative
                    # indices; build the index list l starting from the previous layer i - 1
                    l = [i - 1] + module.layers  # layers
                    # collect the shapes of x and of the layers being fused or concatenated;
                    # note that for WeightedFeatureFusion the fused shape stays unchanged rather
                    # than being "added up", so this debug line is slightly off
                    sh = [list(x.shape)] + [list(out[i].shape) for i in module.layers]  # shapes
                    # each x in the zip is an (l, sh) pair
                    str = ' >> ' + ' + '.join(['layer %g %s' % x for x in zip(l, sh)])
                # these modules receive two arguments, because the forward of the
                # FPN/SPP/residual module classes takes both x and out
                x = module(x, out)  # WeightedFeatureFusion(), FeatureConcat()
            elif name == "YOLOLayer":
                yolo_out.append(module(x))
            else:
                # run module directly, i.e. mtype = 'convolutional', 'upsample', 'maxpool', 'batchnorm2d' etc.
                x = module(x)

            # if routs flags this layer's index, cache the output in out; otherwise append [],
            # so that out stays index-aligned with the network layers
            out.append(x if self.routs[i] else [])
            if verbose:
                print('%g/%g %s -' % (i, len(self.module_list), name), list(x.shape), str)
                str = ''

        if self.training:  # train
            return yolo_out
        elif ONNX_EXPORT:  # export
            # x = [torch.cat(x, 0) for x in zip(*yolo_out)]
            # return x[0], torch.cat(x[1:3], 1)  # scores, boxes: 3780x80, 3780x4
            p = torch.cat(yolo_out, dim=0)

            # # filter out low-probability targets by objectness
            # mask = torch.nonzero(torch.gt(p[:, 4], 0.1), as_tuple=False).squeeze(1)
            # # onnx does not support indexing with more than one dim (pytorch is too flexible)
            # # p = p[mask]
            # p = torch.index_select(p, dim=0, index=mask)
            #
            # # filter out tiny targets: w > 2 and h > 2 pixel
            # # ONNX does not support the bitwise_and and all operations yet
            # mask_s = torch.gt(p[:, 2], 2. / self.input_size[0]) & torch.gt(p[:, 3], 2. / self.input_size[1])
            # mask_s = torch.nonzero(mask_s, as_tuple=False).squeeze(1)
            # p = torch.index_select(p, dim=0, index=mask_s)  # width-height: drop small targets
            #
            # if mask_s.numel() == 0:
            #     return torch.empty([0, 85])

            return p
        else:  # inference or test
            # unzip yolo_out into the lists x and p; yolo_out holds 3 tuples, each with a tensor
            # of shape [bs, grid*grid*anchor, xywh + obj + classes] and one of shape
            # [bs, anchor, grid, grid, xywh + obj + classes]
            x, p = zip(*yolo_out)  # inference output, training output
            # x is now a 3-element list; concatenate its elements along dim 1
            # (e.g. for 20-class training with a 16x16 grid)
            x = torch.cat(x, 1)  # cat yolo outputs
            return x, p

    def info(self, verbose=False):
        """
        Print the model's info
        :param verbose:
        :return:
        """
        torch_utils.model_info(self, verbose)


def get_yolo_layers(self):
    """
    Get the indices of the three "YOLOLayer" modules in the network
    :param self:
    :return:
    """
    # check every module's class; match the class name YOLOLayer
    return [i for i, m in enumerate(self.module_list) if m.__class__.__name__ == 'YOLOLayer']  # [89, 101, 113]
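Finally, to see the training/inference split of forward_once and YOLOLayer.forward end to end, here is a quick hypothetical check (the cfg path and the 512 input size are placeholder assumptions again):

import torch

model = Darknet("cfg/yolov3-spp.cfg", img_size=512)  # placeholder path
x = torch.zeros(1, 3, 512, 512)

model.train()
yolo_out = model(x)  # list of 3 raw maps, each [1, 3, ny, nx, 85] for 80 classes
print([t.shape for t in yolo_out])

model.eval()
with torch.no_grad():
    boxes, raw = model(x)  # decoded boxes plus the raw maps
print(boxes.shape)  # [1, 16128, 85]: 3 anchors * (16*16 + 32*32 + 64*64) cells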