Preface
The source code is the YOLOv3 implementation from Bilibili uploader 霹雳啪啦:
https://github.com/WZMIAOMIAO/deep-learning-for-image-processing
This post walks through three parts of the source: NMS, scale_coords, and draw_box.
I covered the NMS source in a separate post: YoloV3-SPP NMS源码详解.
Prediction module
Source code
```python
import os
import json
import time

import torch
import cv2
import numpy as np
from matplotlib import pyplot as plt

from build_utils import img_utils, torch_utils, utils
from models import Darknet
from draw_box_utils import draw_box


def main():
    img_size = 512  # must be a multiple of 32: [416, 512, 608]
    cfg = "cfg/my_yolov3.cfg"  # path to the generated .cfg file
    weights = "weights/629cls2best.pt"  # path to your trained weights
    json_path = "./WiderPerson/my_yolo_dataset/pedestrian_classes.json"  # json label file
    img_path = "test.jpg"
    assert os.path.exists(cfg), "cfg file {} does not exist.".format(cfg)
    assert os.path.exists(weights), "weights file {} does not exist.".format(weights)
    assert os.path.exists(json_path), "json file {} does not exist.".format(json_path)
    assert os.path.exists(img_path), "image file {} does not exist.".format(img_path)

    json_file = open(json_path, 'r')
    class_dict = json.load(json_file)
    category_index = {v: k for k, v in class_dict.items()}

    input_size = (img_size, img_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = Darknet(cfg, img_size)
    model.load_state_dict(torch.load(weights, map_location=device)["model"])
    model.to(device)

    # disable gradient tracking
    model.eval()
    with torch.no_grad():
        # init: run an empty image through the model once to initialize it
        img = torch.zeros((1, 3, img_size, img_size), device=device)
        model(img)

        img_o = cv2.imread(img_path)  # BGR
        assert img_o is not None, "Image Not Found " + img_path

        # letterbox-resize the input; auto=True pads the missing part of the image
        img = img_utils.letterbox(img_o, new_shape=input_size, auto=True, color=(0, 0, 0))[0]

        # Convert
        # img[:, :, ::-1] flips BGR -> RGB; transpose reorders the channels,
        # turning 416x416x3 into 3x416x416
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to 3x416x416
        # make the array contiguous in memory
        img = np.ascontiguousarray(img)

        # convert the image to a tensor
        img = torch.from_numpy(img).to(device).float()
        img /= 255.0  # scale (0, 255) to (0, 1)
        # add batch dimension
        img = img.unsqueeze(0)

        # forward pass; t2 - t1 is the elapsed time, pred is the raw output
        t1 = torch_utils.time_synchronized()
        pred = model(img)[0]  # only get inference result
        t2 = torch_utils.time_synchronized()
        print(t2 - t1)

        # non-maximum suppression
        pred = utils.non_max_suppression(pred, conf_thres=0.1, iou_thres=0.6, multi_label=True)[0]
        t3 = time.time()
        print(t3 - t2)

        if pred is None:
            print("No target detected.")
            exit(0)

        # process detections:
        # map the predicted bounding boxes back to the original image scale
        pred[:, :4] = utils.scale_coords(img.shape[2:], pred[:, :4], img_o.shape).round()
        print(pred.shape)

        # first 4 columns: box coordinates
        bboxes = pred[:, :4].detach().cpu().numpy()
        # 5th column: confidence
        scores = pred[:, 4].detach().cpu().numpy()
        # np.int was removed in newer NumPy; the builtin int works the same here
        classes = pred[:, 5].detach().cpu().numpy().astype(int) + 1

        img_o = draw_box(img_o[:, :, ::-1], bboxes, classes, scores, category_index)
        plt.imshow(img_o)
        plt.show()

        img_o.save("test_result.jpg")


if __name__ == "__main__":
    main()
```
letterbox: resizing the image
Source code
```python
def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True):
    """
    Resize the image to the specified size.
    :param img: input image as a numpy array
    :param new_shape: shape expected by the network
    :param color: color used for the padding
    :param auto: keep the original aspect ratio (minimum rectangle)
    :param scale_fill: stretch straight to the target size
    :param scale_up: if False, an image whose longest side is smaller than the
                     target size keeps its original width and height
    :return:
    """
    shape = img.shape[:2]  # [h, w]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scale_up:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle: keep the aspect ratio, scale the longest side to the target size
        # the modulo keeps the padded image a multiple of 32 for 416x416,
        # or a multiple of 64 for 512x512
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
    elif scale_fill:  # stretch: resize straight to the target size
        dw, dh = 0, 0
        new_unpad = new_shape
        ratio = new_shape[0] / shape[1], new_shape[1] / shape[0]  # wh ratios

    dw /= 2  # divide padding into 2 sides: top/bottom and left/right
    dh /= 2

    # shape: [h, w]  new_unpad: [w, h]
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))  # top/bottom padding
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))  # left/right padding
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)
```
Walkthrough
```python
def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True):
    shape = img.shape[:2]  # [h, w]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
```
If new_shape comes in as an int, it is expanded to a tuple; this guards against callers passing inconsistent argument types.
```python
# scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not scale_up:  # only scale down, do not scale up (for better test mAP)
    # images larger than the target size are scaled down; smaller ones are left unchanged
    r = min(r, 1.0)
```
To make the flow concrete, suppose I pass in an image with img(h, w, 3) = (762, 1019, 3).
Assume new_shape is (512, 512). Then:
$$r=min\left(\frac{512}{img.h},\frac{512}{img.w}\right)$$
r is the smaller of the height and width ratios between the target shape and the input shape:
$$r=min\left(\frac{512}{img.h},\frac{512}{img.w}\right)=512\ast min\left(\frac{1}{img.h},\frac{1}{img.w}\right)=\frac{512}{max(img.h,img.w)}$$
So r is the ratio of the target size to the longest side of the input image.
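A quick numerical check of this identity with the example image (a minimal sketch; nothing here beyond the numbers above):

```python
h, w = 762, 1019  # example image (h, w)
new_size = 512

r = min(new_size / h, new_size / w)
print(r)                          # ~0.5025
print(r == new_size / max(h, w))  # True: the two forms are equivalent
```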
```python
# compute padding
ratio = r, r  # width, height ratios
# round to the nearest value first, then int drops any remaining fraction
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
```
Written as a formula, new_unpad is:
$$new\_unpad=\left[img.w\ast \frac{512}{max(img.h,img.w)},\ img.h\ast \frac{512}{max(img.h,img.w)}\right]$$
Or, in a form that is easier to read:
$$new\_unpad=\left[512\ast \frac{img.w}{max(img.h,img.w)},\ 512\ast \frac{img.h}{max(img.h,img.w)}\right]$$
Note: new_unpad is the initial target shape (w, h) that the 1019x762 image is scaled to, with the original aspect ratio preserved.
dw and dh, written as formulas:
$$dw=512-512\ast \frac{img.w}{max(img.h,img.w)}$$
$$dh=512-512\ast \frac{img.h}{max(img.h,img.w)}$$
At this point dw and dh are the pixels left unfilled along the target's width and height after the scale-down, i.e. the padding needed on each axis.
For the running example, the debug values are (reproduced in the sketch below):
- target shape (new_shape): (512, 512)
- input image shape (h, w): (762, 1019)
- new_unpad (w, h): (512, 383)
- dw: 0
- dh: 129
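A minimal sketch reproducing those debug values, following the source lines above:

```python
h, w = 762, 1019        # original image (h, w)
new_shape = (512, 512)  # target shape

r = min(new_shape[0] / h, new_shape[1] / w)
new_unpad = int(round(w * r)), int(round(h * r))  # (w, h)
dw = new_shape[1] - new_unpad[0]
dh = new_shape[0] - new_unpad[1]

print(new_unpad)  # (512, 383)
print(dw, dh)     # 0 129
```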
```python
if auto:  # minimum rectangle: keep the aspect ratio, scale the longest side to the target size
    # the modulo keeps the padded image a multiple of 32 for 416x416,
    # or a multiple of 64 for 512x512
    dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
elif scale_fill:  # stretch: resize straight to the target size
    dw, dh = 0, 0
    new_unpad = new_shape
    ratio = new_shape[0] / shape[1], new_shape[1] / shape[0]  # wh ratios
```
With auto=True, the original aspect ratio is preserved:
Taking dw and dh modulo 64 gives dw = 0, dh = 1.
Note: this model predicts at 512x512, where mod 64 is fine (512 mod 64 = 0). But if you predict at 416x416, this mod 64 becomes questionable (416 mod 64 = 32, not 0). Personally I suspect this parameter was designed for 512x512 prediction, perhaps for compute speed or large-scale inference. It deserves attention and must be adjusted along with the model input size; the sketch below makes the arithmetic concrete.
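A quick check of the modulo arithmetic, for the 512 example above and a hypothetical 416x416 run on the same image:

```python
import numpy as np

# 512x512 example: dw, dh = 0, 129 before the modulo
print(np.mod(0, 64), np.mod(129, 64))  # 0 1 -> pads 383 up to 384 = 12 * 32

# hypothetical 416x416 run on the same (762, 1019) image
r = min(416 / 762, 416 / 1019)
new_unpad = int(round(1019 * r)), int(round(762 * r))
print(new_unpad)              # (416, 311)
print(np.mod(416 - 311, 64))  # 41 -> padded height 352, a multiple of 32 but not of 64
```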
```python
dw /= 2  # divide padding into 2 sides: top/bottom and left/right
dh /= 2
```
This gives dw = 0 and dh = 0.5.
```python
# shape: [h, w]  new_unpad: [w, h]
if shape[::-1] != new_unpad:
    img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))  # top/bottom padding
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))  # left/right padding
```
If the image's (w, h) differs from new_unpad, the image is resized from (1019, 762) to (512, 383).
One of the sides may not yet be a multiple of 32: after the modulo, dh is 1, so 383 needs that 1-pixel remainder to reach 384, a multiple of 32 (512 already is one).
The ±0.1 makes the integer padding come out right. There are two cases (demonstrated in the sketch after this list):
- when dh or dw is an integer, the 0.1 has no effect;
- when dh or dw has a fractional part (.5), the 0.1 gives bottom one more pixel of padding than top; likewise for left and right.
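A short demonstration of that behavior, using the same round-based computation as the source:

```python
for d in (0.0, 0.5, 2.0, 2.5):
    top, bottom = int(round(d - 0.1)), int(round(d + 0.1))
    print(d, "->", top, bottom)
# 0.0 -> 0 0   (integer padding: split evenly)
# 0.5 -> 0 1   (.5 padding: bottom/right get the extra pixel)
# 2.0 -> 2 2
# 2.5 -> 2 3
```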
```python
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
return img, ratio, (dw, dh)
```
cv2.copyMakeBorder pads the image border; the prediction code passes color=(0, 0, 0), and these border pixels do not affect the computation of the other pixels during backpropagation.
The function returns img, ratio (the target-to-original ratio of the longest side), and (dw, dh).
In the prediction script, only the first return value, img, is actually used.
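For reference, a minimal usage sketch (letterbox as defined above; shapes assume the (762, 1019) example image):

```python
import cv2

img_o = cv2.imread("test.jpg")  # assume a 762x1019 BGR image
img, ratio, (dw, dh) = letterbox(img_o, new_shape=(512, 512),
                                 auto=True, color=(0, 0, 0))
print(img.shape)  # (384, 512, 3): 383 + 1 pixel of bottom padding
print(ratio)      # (~0.5025, ~0.5025)
print(dw, dh)     # 0.0 0.5
```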
scale_coords: mapping back to the original scale
scale_coords maps the model's post-NMS output back to the original image scale. The flow, in brief: subtract the letterbox padding, divide by the scale gain, then clip to the original image bounds.
Source code
```python
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """
    Map predicted coordinates back to the original image scale
    :param img1_shape: shape of the resized (letterboxed) image
    :param coords: predicted box coordinates
    :param img0_shape: shape of the image before resizing
    :param ratio_pad: the scale ratio and padding used during resizing
    :return:
    """
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = max(img1_shape) / max(img0_shape)  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords


def clip_coords(boxes, img_shape):
    # Clip bounding xyxy bounding boxes to image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2
```
Walkthrough
```python
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None:  # calculate from img0_shape
    gain = max(img1_shape) / max(img0_shape)  # gain = old / new
    pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
```
gain is the ratio between the longest side of the resized image and that of the original. It is also the ratio of the short sides, because the short side was scaled by the same factor. This gain is the same value as r in letterbox.
pad is almost identical to dw, dh in letterbox (the summary below covers the small difference).
```python
coords[:, [0, 2]] -= pad[0]  # x padding
```
Subtract the x padding from the predicted x coordinates.
```python
coords[:, [1, 3]] -= pad[1]  # y padding
```
Subtract the y padding from the predicted y coordinates.
```python
coords[:, :4] /= gain
```
Divide all four coordinates by gain to bring them back to the img0 scale.
```python
clip_coords(coords, img0_shape)
```
Pass the restored coordinates into clip_coords:
```python
def clip_coords(boxes, img_shape):
    # Clip bounding xyxy bounding boxes to image shape (height, width)
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2
```
This clamps every predicted box coordinate to the image bounds.
```python
return coords
```
Finally, coords is returned.
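To tie it together, a minimal usage sketch of scale_coords with the example shapes (the detection box here is made up for illustration):

```python
import torch

img1_shape = (384, 512)      # letterboxed image (h, w)
img0_shape = (762, 1019, 3)  # original image (h, w, c)

# a made-up detection in letterboxed coordinates: x1, y1, x2, y2
coords = torch.tensor([[100.0, 50.0, 300.0, 200.0]])

mapped = scale_coords(img1_shape, coords, img0_shape)
print(mapped)  # ~[[199.0, 98.4, 597.1, 396.9]] on the original 1019x762 image
```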
scale_coords summary
The source does not pass ratio and pad into scale_coords, although it easily could: letterbox's return values contain everything needed. Instead, gain and pad are re-derived from the resized image, which does work. But careful readers will notice that this re-derivation takes the resized image as its reference after both the letterbox scaling and the padding, whereas the r in letterbox was computed before padding, so the numbers differ by a hair. From my debug run:
In letterbox, pad was (0, 0.5).
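The difference is easy to reproduce with the example shapes (a sketch of the re-derivation inside scale_coords):

```python
img1_shape = (384, 512)   # padded image: 383 + 1 pixel of padding
img0_shape = (762, 1019)  # original image

gain = max(img1_shape) / max(img0_shape)            # 512/1019, same as r in letterbox
pad_w = (img1_shape[1] - img0_shape[1] * gain) / 2  # ~0.0 (up to float rounding)
pad_h = (img1_shape[0] - img0_shape[0] * gain) / 2  # ~0.565, vs letterbox's 0.5

print(gain)          # ~0.5025
print(pad_w, pad_h)  # ~0.0 ~0.565
```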
draw_box
Source code
```python
import collections

import numpy as np
from PIL import Image, ImageDraw, ImageFont

# STANDARD_COLORS is a list of color names defined earlier in draw_box_utils


def filter_low_thresh(boxes, scores, classes, category_index, thresh,
                      box_to_display_str_map, box_to_color_map):
    for i in range(boxes.shape[0]):
        if scores[i] > thresh:
            box = tuple(boxes[i].tolist())  # numpy -> list -> tuple
            if classes[i] in category_index.keys():
                class_name = category_index[classes[i]]
            else:
                class_name = 'N/A'
            display_str = str(class_name)
            display_str = '{}: {}%'.format(display_str, int(100 * scores[i]))
            box_to_display_str_map[box].append(display_str)
            box_to_color_map[box] = STANDARD_COLORS[classes[i] % len(STANDARD_COLORS)]
        else:
            break  # the scores are already sorted; once one fails, all later ones fail too


def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)

    filter_low_thresh(boxes, scores, classes, category_index, thresh,
                      box_to_display_str_map, box_to_color_map)

    # Draw all boxes onto image.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    draw = ImageDraw.Draw(image)
    im_width, im_height = image.size
    for box, color in box_to_color_map.items():
        xmin, ymin, xmax, ymax = box
        (left, right, top, bottom) = (xmin * 1, xmax * 1, ymin * 1, ymax * 1)
        draw.line([(left, top), (left, bottom), (right, bottom),
                   (right, top), (left, top)], width=line_thickness, fill=color)
        draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
    return image


def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
    try:
        font = ImageFont.truetype('arial.ttf', 20)
    except IOError:
        font = ImageFont.load_default()

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    # note: font.getsize was removed in Pillow 10; use font.getbbox there
    display_str_heights = [font.getsize(ds)[1] for ds in box_to_display_str_map[box]]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = bottom + total_display_str_height
    # Reverse list and print from bottom to top.
    for display_str in box_to_display_str_map[box][::-1]:
        text_width, text_height = font.getsize(display_str)
        margin = np.ceil(0.05 * text_height)
        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
                        (left + text_width, text_bottom)],
                       fill=color)
        draw.text((left + margin, text_bottom - text_height - margin),
                  display_str,
                  fill='black',
                  font=font)
        text_bottom -= text_height - 2 * margin
```
Walkthrough
```python
def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)
```
- image: the original image img_o
- boxes: the first four coordinates of each predicted box, x1y1x2y2
- classes: the class of each predicted box (class indices start from 1)
- scores: cls_conf, the class confidence
- category_index: the class dictionary
- thresh: the cls_conf threshold
- line_thickness: the box outline width, 3 pixels
```python
filter_low_thresh(boxes, scores, classes, category_index, thresh,
                  box_to_display_str_map, box_to_color_map)
```
This filters out predicted boxes with low class confidence (cls_conf). The step only matters when NMS ran with multi_label=False or in single-class prediction, because with multi_label=True and multiple classes, NMS has already filtered on cls_conf. Note the early break inside filter_low_thresh: it relies on the scores being sorted, as illustrated below.
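A toy illustration (not the library code) of why the early break is safe only because NMS outputs boxes sorted by confidence:

```python
scores = [0.9, 0.6, 0.3, 0.05, 0.4]  # note: 0.4 sits after 0.05
thresh = 0.1

kept = []
for i, s in enumerate(scores):
    if s > thresh:
        kept.append(i)
    else:
        break  # everything after the first failure is skipped

print(kept)  # [0, 1, 2] -- index 4 (0.4) is lost because 0.05 broke the loop early
```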
```python
# Draw all boxes onto image.
if isinstance(image, np.ndarray):
    image = Image.fromarray(image)
```
Converts the ndarray into a PIL Image object.
```python
draw = ImageDraw.Draw(image)
```
Creates the ImageDraw object.
```python
im_width, im_height = image.size
for box, color in box_to_color_map.items():
    xmin, ymin, xmax, ymax = box
    (left, right, top, bottom) = (xmin * 1, xmax * 1, ymin * 1, ymax * 1)
    draw.line([(left, top), (left, bottom), (right, bottom),
               (right, top), (left, top)], width=line_thickness, fill=color)
```
Iterates over box_to_color_map and draws each box outline onto the draw object.
```python
draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
```
For the draw_text code, see: draw_text