Preface
The source code is the YOLOv3 implementation from Bilibili uploader 霹雳啪啦:
https://github.com/WZMIAOMIAO/deep-learning-for-image-processing
This post walks through three parts of the source: NMS, scale_coords, and draw_box.
I covered the NMS source in a separate post: YoloV3-SPP NMS源码详解.
Prediction module
Source code
```python
import os
import json
import time

import torch
import cv2
import numpy as np
from matplotlib import pyplot as plt

from build_utils import img_utils, torch_utils, utils
from models import Darknet
from draw_box_utils import draw_box


def main():
    img_size = 512  # must be a multiple of 32: [416, 512, 608]
    cfg = "cfg/my_yolov3.cfg"  # path to the generated .cfg file
    weights = "weights/629cls2best.pt"  # path to your trained weights
    json_path = "./WiderPerson/my_yolo_dataset/pedestrian_classes.json"  # json label file
    img_path = "test.jpg"
    assert os.path.exists(cfg), "cfg file {} does not exist.".format(cfg)
    assert os.path.exists(weights), "weights file {} does not exist.".format(weights)
    assert os.path.exists(json_path), "json file {} does not exist.".format(json_path)
    assert os.path.exists(img_path), "image file {} does not exist.".format(img_path)

    json_file = open(json_path, 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}

    input_size = (img_size, img_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = Darknet(cfg, img_size)
    model.load_state_dict(torch.load(weights, map_location=device)["model"])
    model.to(device)

    # disable gradient tracking
    model.eval()
    with torch.no_grad():
        # init: run an empty image through the model once to initialize it
        img = torch.zeros((1, 3, img_size, img_size), device=device)
        model(img)

        img_o = cv2.imread(img_path)  # BGR
        assert img_o is not None, "Image Not Found " + img_path

        # letterbox-resize the input; auto=True pads the missing part of the image
        img = img_utils.letterbox(img_o, new_shape=input_size, auto=True, color=(0, 0, 0))[0]

        # Convert
        # img[:, :, ::-1] flips BGR -> RGB; transpose reorders the channels,
        # turning 416x416x3 into 3x416x416
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to 3x416x416
        # make the array contiguous in memory
        img = np.ascontiguousarray(img)

        # convert the image to a tensor
        img = torch.from_numpy(img).to(device).float()
        img /= 255.0  # scale (0, 255) to (0, 1)
        # add batch dimension
        img = img.unsqueeze(0)

        # forward pass; t2 - t1 is the elapsed time, pred is the raw output
        t1 = torch_utils.time_synchronized()
        pred = model(img)[0]  # only get inference result
        t2 = torch_utils.time_synchronized()
        print(t2 - t1)

        # non-maximum suppression
        pred = utils.non_max_suppression(pred, conf_thres=0.1, iou_thres=0.6, multi_label=True)[0]
        t3 = time.time()
        print(t3 - t2)

        if pred is None:
            print("No target detected.")
            exit(0)

        # process detections:
        # map the predicted bounding boxes back to the original image scale
        pred[:, :4] = utils.scale_coords(img.shape[2:], pred[:, :4], img_o.shape).round()
        print(pred.shape)

        # first 4 columns: box coordinates
        bboxes = pred[:, :4].detach().cpu().numpy()
        # 5th column: confidence
        scores = pred[:, 4].detach().cpu().numpy()
        # np.int was removed in newer NumPy; the builtin int works the same here
        classes = pred[:, 5].detach().cpu().numpy().astype(int) + 1

        img_o = draw_box(img_o[:, :, ::-1], bboxes, classes, scores, category_index)
        plt.imshow(img_o)
        plt.show()

        img_o.save("test_result.jpg")


if __name__ == "__main__":
    main()
```
letterbox: resizing the image
Source code
```python
def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True):
    """
    Resize the image to the specified size.
    :param img: input image as a numpy array
    :param new_shape: shape expected by the network
    :param color: color used for the padding
    :param auto: keep the original aspect ratio (minimum rectangle)
    :param scale_fill: stretch straight to the target size
    :param scale_up: if False, an image whose longest side is smaller than the
                     target size keeps its original width and height
    :return:
    """
    shape = img.shape[:2]  # [h, w]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scale_up:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle: keep the aspect ratio, scale the longest side to the target size
        # the modulo keeps the padded image a multiple of 32 for 416x416,
        # or a multiple of 64 for 512x512
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
    elif scale_fill:  # stretch: resize straight to the target size
        dw, dh = 0, 0
        new_unpad = new_shape
        ratio = new_shape[0] / shape[1], new_shape[1] / shape[0]  # wh ratios

    dw /= 2  # divide padding into 2 sides: top/bottom and left/right
    dh /= 2

    # shape: [h, w]  new_unpad: [w, h]
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))  # top/bottom padding
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))  # left/right padding
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)
```
Walkthrough
```python
def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True):
    shape = img.shape[:2]  # [h, w]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
```
If new_shape comes in as an int, it is expanded to a tuple; this guards against callers passing inconsistent argument types.
```python
# scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not scale_up:  # only scale down, do not scale up (for better test mAP)
    # images larger than the target size are scaled down; smaller ones are left unchanged
    r = min(r, 1.0)
```
To make the flow concrete, suppose I pass in an image with img(h, w, 3) = (762, 1019, 3).
Assume new_shape is (512, 512). Then:
$$r=min\left(\frac{512}{img.h},\frac{512}{img.w}\right)$$
r is the smaller of the height and width ratios between the target shape and the input shape:
$$r=min\left(\frac{512}{img.h},\frac{512}{img.w}\right)=512\ast min\left(\frac{1}{img.h},\frac{1}{img.w}\right)=\frac{512}{max(img.h,img.w)}$$
So r is the ratio of the target size to the longest side of the input image.
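A quick numerical check of this identity with the example image (a minimal sketch; nothing here beyond the numbers above):

```python
h, w = 762, 1019  # example image (h, w)
new_size = 512

r = min(new_size / h, new_size / w)
print(r)                          # ~0.5025
print(r == new_size / max(h, w))  # True: the two forms are equivalent
```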
```python
# compute padding
ratio = r, r  # width, height ratios
# round to the nearest value first, then int drops any remaining fraction
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
```
Written as a formula, new_unpad is:
$$new\_unpad=\left[img.w\ast \frac{512}{max(img.h,img.w)},\ img.h\ast \frac{512}{max(img.h,img.w)}\right]$$
Or, in a form that is easier to read:
$$new\_unpad=\left[512\ast \frac{img.w}{max(img.h,img.w)},\ 512\ast \frac{img.h}{max(img.h,img.w)}\right]$$
Note: new_unpad is the initial target shape (w, h) that the 1019x762 image is scaled to, with the original aspect ratio preserved.
dw and dh, written as formulas:
$$dw=512-512\ast \frac{img.w}{max(img.h,img.w)}$$
$$dh=512-512\ast \frac{img.h}{max(img.h,img.w)}$$
At this point dw and dh are the pixels left unfilled along the target's width and height after the scale-down, i.e. the padding needed on each axis.
For the running example, the debug values are (reproduced in the sketch below):
- target shape (new_shape): (512, 512)
- input image shape (h, w): (762, 1019)
- new_unpad (w, h): (512, 383)
- dw: 0
- dh: 129
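A minimal sketch reproducing those debug values, following the source lines above:

```python
h, w = 762, 1019        # original image (h, w)
new_shape = (512, 512)  # target shape

r = min(new_shape[0] / h, new_shape[1] / w)
new_unpad = int(round(w * r)), int(round(h * r))  # (w, h)
dw = new_shape[1] - new_unpad[0]
dh = new_shape[0] - new_unpad[1]

print(new_unpad)  # (512, 383)
print(dw, dh)     # 0 129
```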
```python
if auto:  # minimum rectangle: keep the aspect ratio, scale the longest side to the target size
    # the modulo keeps the padded image a multiple of 32 for 416x416,
    # or a multiple of 64 for 512x512
    dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
elif scale_fill:  # stretch: resize straight to the target size
    dw, dh = 0, 0
    new_unpad = new_shape
    ratio = new_shape[0] / shape[1], new_shape[1] / shape[0]  # wh ratios
```
With auto=True, the original aspect ratio is preserved:
Taking dw and dh modulo 64 gives dw = 0, dh = 1.
Note: this model predicts at 512x512, where mod 64 is fine (512 mod 64 = 0). But if you predict at 416x416, this mod 64 becomes questionable (416 mod 64 = 32, not 0). Personally I suspect this parameter was designed for 512x512 prediction, perhaps for compute speed or large-scale inference. It deserves attention and must be adjusted along with the model input size; the sketch below makes the arithmetic concrete.
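A quick check of the modulo arithmetic, for the 512 example above and a hypothetical 416x416 run on the same image:

```python
import numpy as np

# 512x512 example: dw, dh = 0, 129 before the modulo
print(np.mod(0, 64), np.mod(129, 64))  # 0 1 -> pads 383 up to 384 = 12 * 32

# hypothetical 416x416 run on the same (762, 1019) image
r = min(416 / 762, 416 / 1019)
new_unpad = int(round(1019 * r)), int(round(762 * r))
print(new_unpad)              # (416, 311)
print(np.mod(416 - 311, 64))  # 41 -> padded height 352, a multiple of 32 but not of 64
```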
```python
dw /= 2  # divide padding into 2 sides: top/bottom and left/right
dh /= 2
```
This gives dw = 0 and dh = 0.5.
```python
# shape: [h, w]  new_unpad: [w, h]
if shape[::-1] != new_unpad:
    img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))  # top/bottom padding
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))  # left/right padding
```
If the image's (w, h) differs from new_unpad, the image is resized from (1019, 762) to (512, 383).
One of the sides may not yet be a multiple of 32: after the modulo, dh is 1, so 383 needs that 1-pixel remainder to reach 384, a multiple of 32 (512 already is one).
The ±0.1 makes the integer padding come out right. There are two cases (demonstrated in the sketch after this list):
- when dh or dw is an integer, the 0.1 has no effect;
- when dh or dw has a fractional part (.5), the 0.1 gives bottom one more pixel of padding than top; likewise for left and right.
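A short demonstration of that behavior, using the same round-based computation as the source:

```python
for d in (0.0, 0.5, 2.0, 2.5):
    top, bottom = int(round(d - 0.1)), int(round(d + 0.1))
    print(d, "->", top, bottom)
# 0.0 -> 0 0   (integer padding: split evenly)
# 0.5 -> 0 1   (.5 padding: bottom/right get the extra pixel)
# 2.0 -> 2 2
# 2.5 -> 2 3
```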
```python
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
return img, ratio, (dw, dh)
```
cv2.copyMakeBorder pads the image border; the prediction code passes color=(0, 0, 0), and these border pixels do not affect the computation of the other pixels during backpropagation.
The function returns img, ratio (the target-to-original ratio of the longest side), and (dw, dh).
In the prediction script, only the first return value, img, is actually used.
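For reference, a minimal usage sketch (letterbox as defined above; shapes assume the (762, 1019) example image):

```python
import cv2

img_o = cv2.imread("test.jpg")  # assume a 762x1019 BGR image
img, ratio, (dw, dh) = letterbox(img_o, new_shape=(512, 512),
                                 auto=True, color=(0, 0, 0))
print(img.shape)  # (384, 512, 3): 383 + 1 pixel of bottom padding
print(ratio)      # (~0.5025, ~0.5025)
print(dw, dh)     # 0.0 0.5
```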
scale_coords: mapping back to the original scale
scale_coords maps the model's post-NMS output back to the original image scale. The flow, in brief: subtract the letterbox padding, divide by the scale gain, then clip to the original image bounds.
Source code
```python
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """
    Map predicted coordinates back to the original image scale
    :param img1_shape: shape of the resized (letterboxed) image
    :param coords: predicted box coordinates
    :param img0_shape: shape of the image before resizing
    :param ratio_pad: the scale ratio and padding used during resizing
    :return:
    """
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = max(img1_shape) / max(img0_shape)  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords


def clip_coords(boxes, img_shape):
    # Clip bounding xyxy bounding boxes to image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2
```
Walkthrough
```python
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None:  # calculate from img0_shape
    gain = max(img1_shape) / max(img0_shape)  # gain = old / new
    pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
```
gain is the ratio between the longest side of the resized image and that of the original. It is also the ratio of the short sides, because the short side was scaled by the same factor. This gain is the same value as r in letterbox.
pad is almost identical to dw, dh in letterbox (the summary below covers the small difference).
```python
coords[:, [0, 2]] -= pad[0]  # x padding
```
Subtract the x padding from the predicted x coordinates.
```python
coords[:, [1, 3]] -= pad[1]  # y padding
```
Subtract the y padding from the predicted y coordinates.
```python
coords[:, :4] /= gain
```
Divide all four coordinates by gain to bring them back to the img0 scale.
```python
clip_coords(coords, img0_shape)
```
Pass the restored coordinates into clip_coords:
```python
def clip_coords(boxes, img_shape):
    # Clip bounding xyxy bounding boxes to image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2
```
This clamps every predicted box coordinate to the image bounds.
```python
return coords
```
Finally, coords is returned.
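To tie it together, a minimal usage sketch of scale_coords with the example shapes (the detection box here is made up for illustration):

```python
import torch

img1_shape = (384, 512)      # letterboxed image (h, w)
img0_shape = (762, 1019, 3)  # original image (h, w, c)

# a made-up detection in letterboxed coordinates: x1, y1, x2, y2
coords = torch.tensor([[100.0, 50.0, 300.0, 200.0]])

mapped = scale_coords(img1_shape, coords, img0_shape)
print(mapped)  # ~[[199.0, 98.4, 597.1, 396.9]] on the original 1019x762 image
```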
scale_coords summary
The source does not pass ratio and pad into scale_coords, although it easily could: letterbox's return values contain everything needed. Instead, gain and pad are re-derived from the resized image, which does work. But careful readers will notice that this re-derivation takes the resized image as its reference after both the letterbox scaling and the padding, whereas the r in letterbox was computed before padding, so the numbers differ by a hair. From my debug run:
In letterbox, pad was (0, 0.5).
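The difference is easy to reproduce with the example shapes (a sketch of the re-derivation inside scale_coords):

```python
img1_shape = (384, 512)   # padded image: 383 + 1 pixel of padding
img0_shape = (762, 1019)  # original image

gain = max(img1_shape) / max(img0_shape)            # 512/1019, same as r in letterbox
pad_w = (img1_shape[1] - img0_shape[1] * gain) / 2  # ~0.0 (up to float rounding)
pad_h = (img1_shape[0] - img0_shape[0] * gain) / 2  # ~0.565, vs letterbox's 0.5

print(gain)          # ~0.5025
print(pad_w, pad_h)  # ~0.0 ~0.565
```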
draw_box
Source code
```python
import collections

import numpy as np
from PIL import Image, ImageDraw, ImageFont

# STANDARD_COLORS is a list of color names defined earlier in draw_box_utils


def filter_low_thresh(boxes, scores, classes, category_index, thresh,
                      box_to_display_str_map, box_to_color_map):
    for i in range(boxes.shape[0]):
        if scores[i] > thresh:
            box = tuple(boxes[i].tolist())  # numpy -> list -> tuple
            if classes[i] in category_index.keys():
                class_name = category_index[classes[i]]
            else:
                class_name = 'N/A'
            display_str = str(class_name)
            display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
            box_to_display_str_map[box].append(display_str)
            box_to_color_map[box] = STANDARD_COLORS[classes[i] % len(STANDARD_COLORS)]
        else:
            break  # the scores are already sorted; once one fails, all later ones fail too


def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)

    filter_low_thresh(boxes, scores, classes, category_index, thresh,
                      box_to_display_str_map, box_to_color_map)

    # Draw all boxes onto image.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    draw = ImageDraw.Draw(image)
    im_width, im_height = image.size
    for box, color in box_to_color_map.items():
        xmin, ymin, xmax, ymax = box
        (left, right, top, bottom) = (xmin * 1, xmax * 1, ymin * 1, ymax * 1)
        draw.line([(left, top), (left, bottom), (right, bottom),
                   (right, top), (left, top)], width=line_thickness, fill=color)
        draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
    return image


def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
    try:
        font = ImageFont.truetype('arial.ttf', 20)
    except IOError:
        font = ImageFont.load_default()

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    # note: font.getsize was removed in Pillow 10; use font.getbbox there
    display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = bottom + total_display_str_height
    # Reverse list and print from bottom to top.
    for display_str in box_to_display_str_map[box][::-1]:
        text_width, text_height = font.getsize(display_str)
        margin = np.ceil(0.05 * text_height)
        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
                        (left + text_width, text_bottom)],
                       fill=color)
        draw.text((left + margin, text_bottom - text_height - margin),
                  display_str,
                  fill='black',
                  font=font)
        text_bottom -= text_height - 2 * margin
```
Walkthrough
```python
def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)
```
- image: the original image img_o
- boxes: the first four coordinates of each predicted box, x1y1x2y2
- classes: the class of each predicted box (class indices start from 1)
- scores: cls_conf, the class confidence
- category_index: the class dictionary
- thresh: the cls_conf threshold
- line_thickness: the box outline width, 3 pixels
```python
filter_low_thresh(boxes, scores, classes, category_index, thresh,
                  box_to_display_str_map, box_to_color_map)
```
This filters out predicted boxes with low class confidence (cls_conf). The step only matters when NMS ran with multi_label=False or in single-class prediction, because with multi_label=True and multiple classes, NMS has already filtered on cls_conf. Note the early break inside filter_low_thresh: it relies on the scores being sorted, as illustrated below.
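A toy illustration (not the library code) of why the early break is safe only because NMS outputs boxes sorted by confidence:

```python
scores = [0.9, 0.6, 0.3, 0.05, 0.4]  # note: 0.4 sits after 0.05
thresh = 0.1

kept = []
for i, s in enumerate(scores):
    if s > thresh:
        kept.append(i)
    else:
        break  # everything after the first failure is skipped

print(kept)  # [0, 1, 2] -- index 4 (0.4) is lost because 0.05 broke the loop early
```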
```python
# Draw all boxes onto image.
if isinstance(image, np.ndarray):
    image = Image.fromarray(image)
```
Converts the ndarray into a PIL Image object.
```python
draw = ImageDraw.Draw(image)
```
Creates the ImageDraw object.
```python
im_width, im_height = image.size
for box, color in box_to_color_map.items():
    xmin, ymin, xmax, ymax = box
    (left, right, top, bottom) = (xmin * 1, xmax * 1, ymin * 1, ymax * 1)
    draw.line([(left, top), (left, bottom), (right, bottom),
               (right, top), (left, top)], width=line_thickness, fill=color)
```
Iterates over box_to_color_map and draws each box outline onto the draw object.
```python
draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
```
For the draw_text code, see: draw_text