from __future__ import division
import time
import torch 
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import cv2 
from util import *
import argparse
import os 
import os.path as osp
from darknet import Darknet
import pickle as pkl
import pandas as pd
import random

1 创建命令行声明


def arg_parse():
    """Parse the command-line arguments of the detect module.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``, carrying
        the attributes ``images``, ``det``, ``bs``, ``confidence``,
        ``nms_thresh``, ``cfgfile``, ``weightsfile`` and ``reso``.
    """
    # One (flag, keyword arguments) entry per option, registered in this order.
    option_table = (
        ("--images", dict(
            dest='images',
            help="Image / Directory containing images to perform detection upon",
            default="imgs", type=str)),
        ("--det", dict(
            dest='det',
            help="Image / Directory to store detections to",
            default="det", type=str)),
        ("--bs", dict(dest="bs", help="Batch size", default=1)),
        ("--confidence", dict(
            dest="confidence",
            help="Object Confidence to filter predictions",
            default=0.5)),
        ("--nms_thresh", dict(
            dest="nms_thresh", help="NMS Threshhold", default=0.4)),
        ("--cfg", dict(
            dest='cfgfile', help="Config file",
            default="cfg/yolov3.cfg", type=str)),
        ("--weights", dict(
            dest='weightsfile', help="weightsfile",
            default="yolov3.weights", type=str)),
        ("--reso", dict(
            dest='reso',
            help="Input resolution of the network. Increase to increase accuracy. Decrease to increase speed",
            default="416", type=str)),
    )

    cli = argparse.ArgumentParser(description='YOLO v3 Detection Module')
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()


# Unpack the options into the module-level names used by the rest of the script.
args = arg_parse()
images = args.images
# --bs, --confidence and --nms_thresh declare no `type=`, so values given on
# the command line arrive as strings and are converted here.
batch_size = int(args.bs)
confidence = float(args.confidence)
nms_thesh = float(args.nms_thresh)  # (sic) the detection code reads this spelling
start = 0
CUDA = torch.cuda.is_available()


2 加载网络


mkdir data
cd data
wget https://raw.githubusercontent.com/ayooshkathuria/YOLO_v3_tutorial_from_scratch/master/data/coco.names


# Number of object categories and their names, one name per line of the
# coco.names file.
num_classes = 80    #For COCO
classes = load_classes("data/coco.names")


def load_classes(namesfile):
    """Read class names from a text file holding one name per line.

    Args:
        namesfile: path of the names file.

    Returns:
        A list with one string per line, in file order. The empty entry
        produced by a trailing newline is dropped; a last line without a
        trailing newline is kept.
    """
    # The context manager closes the handle even if reading fails.
    with open(namesfile, "r") as fp:
        names = fp.read().split("\n")

    # Only discard the final element when it is the artefact of a trailing
    # newline; unconditionally slicing it off would lose a real class name
    # when the file does not end with "\n".
    if names and names[-1] == "":
        names.pop()
    return names


#Set up the neural network
print("Loading network.....")
model = Darknet(args.cfgfile)
# The --weights option was parsed but never used in the visible code.
# NOTE(review): assumes Darknet exposes load_weights(path) - confirm against
# darknet.py.
model.load_weights(args.weightsfile)
print("Network successfully loaded")

# The requested resolution overrides the height stored in the network info.
model.net_info["height"] = args.reso
inp_dim = int(model.net_info["height"])
assert inp_dim % 32 == 0, "Input resolution must be a multiple of 32"
assert inp_dim > 32, "Input resolution must be greater than 32"

#If there's a GPU availible, put the model on GPU
if CUDA:
    model.cuda()

#Set the model in evaluation mode
# NOTE(review): eval() is the torch.nn.Module method; this assumes Darknet
# derives from nn.Module, as the .cuda() call above suggests.
model.eval()

3 读取输入图片


read_dir = time.time()
#Detection phase
# Build the list of image paths: every entry of the directory `images`, or
# `images` itself when it names a single file.
try:
    imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)]
except NotADirectoryError:
    imlist = [osp.join(osp.realpath('.'), images)]
except FileNotFoundError:
    print ("No file or directory with the name {}".format(images))
    exit()

# exist_ok avoids the race between checking for the directory and creating it.
os.makedirs(args.det, exist_ok=True)

load_batch = time.time()
# cv2.imread returns None instead of raising when a file cannot be decoded
# (for example a non-image file in the directory). Such entries are dropped
# from both lists so that imlist and loaded_ims stay aligned index by index.
readable_paths = []
loaded_ims = []
for im_path in imlist:
    im = cv2.imread(im_path)
    if im is None:
        print("Skipping unreadable image {}".format(im_path))
        continue
    readable_paths.append(im_path)
    loaded_ims.append(im)
imlist = readable_paths

if not imlist:
    print("No readable images found in {}".format(images))
    exit()

load_batch又是一个检查点。OpenCV以numpy数组的形式加载图片,颜色通道的顺序是BGR;而PyTorch的图片输入格式是(Batches * Channels * Height * Width),颜色通道的顺序是RGB。因此,我们在util.py中编写prep_image函数,将numpy数组转换成PyTorch的输入格式。在编写这个函数之前,我们必须先编写letterbox_image函数来调整图片的尺寸:保持纵横比不变,并用颜色(128,128,128)填充剩余的部分。

def letterbox_image(img, inp_dim):
    """Resize an image to fit ``inp_dim`` while keeping its aspect ratio.

    The image is scaled by a single factor so that it fits inside the target
    size, then centred on a canvas filled with the value 128.

    Args:
        img: image array indexed as (height, width, channel). The canvas has
            three channels, so a three-channel image is expected.
        inp_dim: ``(width, height)`` of the output canvas.

    Returns:
        An array of shape ``(height, width, 3)`` with the same dtype as
        ``img``.
    """
    img_h, img_w = img.shape[0], img.shape[1]
    w, h = inp_dim

    # One common factor for both axes keeps the aspect ratio unchanged.
    scale = min(w / img_w, h / img_h)
    # max(1, ...) keeps a very elongated image from collapsing to a zero-sized
    # resize target, which cv2.resize rejects.
    new_w = max(1, int(img_w * scale))
    new_h = max(1, int(img_h * scale))
    resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    # Allocate the canvas with the image's dtype; without an explicit dtype
    # np.full would pick NumPy's default integer type for the fill value 128.
    canvas = np.full((h, w, 3), 128, dtype=img.dtype)

    top = (h - new_h) // 2
    left = (w - new_w) // 2
    canvas[top:top + new_h, left:left + new_w, :] = resized_image
    return canvas


def prep_image(img, inp_dim):
    """Turn an OpenCV image into an input tensor for the network.

    The image is resized to ``inp_dim`` x ``inp_dim``, its channel order is
    reversed (BGR to RGB for an image loaded by OpenCV), the channel axis is
    moved to the front, the values are divided by 255 and a leading batch
    axis of size one is added.

    Args:
        img: image array indexed as (height, width, channel).
        inp_dim: side length, in pixels, of the square network input.

    Returns:
        A float tensor of shape ``(1, channels, inp_dim, inp_dim)``.
    """
    resized = cv2.resize(img, (inp_dim, inp_dim))
    channels_reversed = resized[:, :, ::-1]
    # copy() materialises the reversed, transposed view as a contiguous array,
    # which torch.from_numpy needs (it cannot wrap negative strides).
    channels_first = channels_reversed.transpose((2, 0, 1)).copy()
    tensor = torch.from_numpy(channels_first).float()
    return tensor.div(255.0).unsqueeze(0)


# Network-ready tensors, one per loaded image
im_batches = [prep_image(im, inp_dim) for im in loaded_ims]

# (width, height) of every original image, tiled to (w, h, w, h) per row
im_dim_list = torch.FloatTensor(
    [(im.shape[1], im.shape[0]) for im in loaded_ims]
).repeat(1, 2)

if CUDA:
    im_dim_list = im_dim_list.cuda()

4 创建批量

# One extra, partially filled batch is needed when the number of images is
# not a multiple of the batch size.
leftover = 1 if len(im_dim_list) % batch_size else 0

if batch_size != 1:
    num_batches = len(imlist) // batch_size + leftover
    # Slicing clips at the end of the list, so the last batch may hold fewer
    # than batch_size images.
    im_batches = [
        torch.cat(im_batches[first:first + batch_size])
        for first in (n * batch_size for n in range(num_batches))
    ]

5 检测循环



# `write` flags whether `output` has been created yet; `output` accumulates
# the detections of all batches, one row per detection.
write = 0
start_det_loop = time.time()
for i, batch in enumerate(im_batches):
    # Paths of the images that make up this batch.
    batch_images = imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]

    start = time.time()
    if CUDA:
        batch = batch.cuda()

    # NOTE(review): `volatile=True` is how pre-0.4 PyTorch disabled autograd;
    # newer releases ignore it with a warning and recommend
    # `with torch.no_grad():` - switch once the minimum supported PyTorch
    # version is confirmed.
    prediction = model(Variable(batch, volatile = True), CUDA)
    prediction = write_results(prediction, confidence, num_classes, nms_conf = nms_thesh)

    # CUDA calls are asynchronous: wait for the GPU before reading the clock,
    # otherwise `end` can be taken before the work has actually finished.
    if CUDA:
        torch.cuda.synchronize()
    end = time.time()

    # write_results hands back an int instead of a tensor when the batch
    # produced no detections.
    has_detections = not isinstance(prediction, int)

    if has_detections:
        # Turn the index within the batch into the index within imlist.
        prediction[:,0] += i*batch_size
        if not write:
            output = prediction
            write = 1
        else:
            output = torch.cat((output, prediction))

    for im_num, image in enumerate(batch_images):
        im_id = i*batch_size + im_num
        if has_detections:
            objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]
        else:
            objs = []
        print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size))
        print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs)))
        print("----------------------------------------------------------")

torch.cuda.synchronize这行代码确保CUDA内核与CPU保持同步。否则,GPU作业一进入队列、远在它真正完成之前,CUDA内核就会把控制权返回给CPU(异步调用)。如果end = time.time()在GPU作业实际结束之前就被执行,得到的耗时就会有误导性。现在,所有图片的检测结果都已经保存在我们的输出张量中。让我们在图片上画边界框吧。

6 在图片上绘制边界框


# `output` is only bound once some batch yields a detection; evaluating the
# bare name raises NameError when no batch ever did.
try:
    output
except NameError:
    print ("No detections were made")
    exit()


# Pick, for every detection row, the dimensions of the image it belongs to
# (column 0 of `output` holds the image index).
im_dim_list = im_dim_list.index_select(0, output[:, 0].long())

# Per-row factor that fits the image inside the inp_dim x inp_dim input.
scaling_factor = torch.min(inp_dim / im_dim_list, 1)[0].view(-1, 1)

# Remove the horizontal and vertical padding offsets from the box corners.
# NOTE(review): this undoes letterbox-style padding - confirm that the
# prep_image actually in use pads the image rather than stretching it.
_pad_x = (inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
_pad_y = (inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
output[:, [1, 3]] -= _pad_x
output[:, [2, 4]] -= _pad_y

现在,我们的坐标对应的是图片在填充后的画布上所占区域的尺寸。但是,在letterbox_image函数中,我们用一个缩放因子(scaling factor)同时调整了图片的两个维度(记住,为了保持宽高比,两个维度使用的是同一个因子)。现在我们撤销这一缩放,得到边界框在原始图片上的坐标。

# Divide the four corner coordinates (columns 1-4) of every detection row by
# that row's scaling factor, in place.
output[:,1:5] /= scaling_factor


# Clip every box to its own image: columns 1 and 3 to [0, im_dim_list[i, 0]],
# columns 2 and 4 to [0, im_dim_list[i, 1]].
for i in range(output.size(0)):
    max_x, max_y = im_dim_list[i, 0], im_dim_list[i, 1]
    output[i, [1, 3]] = output[i, [1, 3]].clamp(0.0, max_x)
    output[i, [2, 4]] = output[i, [2, 4]].clamp(0.0, max_y)


# Timestamp read by the timing summary as the end of "Detection" and the
# start of "Output Processing"; it was referenced there but never assigned.
output_recast = time.time()
class_load = time.time()
# NOTE: unpickling can execute arbitrary code - only load a trusted "pallete"
# file. The context manager closes the handle once the palette is read.
with open("pallete", "rb") as palette_file:
    colors = pkl.load(palette_file)


draw = time.time()


def write(x, results, color=None):
    """Draw one detection (bounding box plus class label) onto its image.

    Args:
        x: one detection row. ``x[0]`` is the image index, ``x[1:3]`` and
            ``x[3:5]`` are the two box corners and ``x[-1]`` is the class
            index into ``classes``.
        results: either a sequence of images indexed by ``x[0]`` or a single
            image array of rank 3, which is then drawn on directly.
        color: colour passed to the OpenCV drawing calls. When omitted, a
            random entry of the module-level ``colors`` palette is used.

    Returns:
        The image that was drawn on (modified in place).
    """
    if color is None:
        # Callers in this file pass only (x, results); fall back to the palette.
        color = random.choice(colors)

    # A bare rank-3 array is the image itself (for example a video frame);
    # anything else is a collection indexed by the detection's image index.
    if isinstance(results, np.ndarray) and results.ndim == 3:
        img = results
    else:
        img = results[int(x[0])]

    # OpenCV expects plain Python ints for point coordinates, not tensors.
    c1 = tuple(int(v) for v in x[1:3])
    c2 = tuple(int(v) for v in x[3:5])
    cls = int(x[-1])
    label = "{0}".format(classes[cls])

    cv2.rectangle(img, c1, c2, color, 1)

    # Filled rectangle sized to the label text, then the label on top of it.
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
    c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
    cv2.rectangle(img, c1, c2, color, -1)
    cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1)
    return img


# Draw every detection onto its source image. write() takes the colour as
# its third argument, so a random palette entry is chosen per box.
for detection in output:
    write(detection, loaded_ims, random.choice(colors))

# One output path per input image: <det dir>/det_<input file name>
det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(args.det,x.split("/")[-1]))

# cv2.imwrite reports failure through its return value rather than raising.
for det_name, drawn in zip(det_names, loaded_ims):
    if not cv2.imwrite(det_name, drawn):
        print("Could not write {}".format(det_name))
end = time.time()

7 打印时间汇总


# Timing summary: each row is the difference between two of the time.time()
# checkpoints recorded earlier in the script.
print("{:25s}: {}".format("Task", "Time Taken (in seconds)"))
print("{:25s}: {:2.3f}".format("Reading addresses", load_batch - read_dir))
print("{:25s}: {:2.3f}".format("Loading batch", start_det_loop - load_batch))
print("{:25s}: {:2.3f}".format("Detection (" + str(len(imlist)) +  " images)", output_recast - start_det_loop))
print("{:25s}: {:2.3f}".format("Output Processing", class_load - output_recast))
print("{:25s}: {:2.3f}".format("Drawing Boxes", end - draw))
# Averaged over everything from loading the images to writing the results.
print("{:25s}: {:2.3f}".format("Average time_per_img", (end - load_batch)/len(imlist)))

8 检测目标检测器

python detect.py --images dog-cycle-car.png --det det


(下面的输出是在CPU上运行得到的;在GPU上会快得多,在一块Tesla K80 GPU上大约是0.1秒/图片)

Loading network.....
Network successfully loaded
dog-cycle-car.png    predicted in  2.456 seconds
Objects Detected:    bicycle truck dog
Task                     : Time Taken (in seconds)
Reading addresses        : 0.002
Loading batch            : 0.120
Detection (1 images)     : 2.457
Output Processing        : 0.002
Drawing Boxes            : 0.076
Average time_per_img     : 2.657


9 在视频/网络摄像头上运行检测器



videofile = "video.avi" #or path to the video file.
cap = cv2.VideoCapture(videofile)
#cap = cv2.VideoCapture(0)  for webcam
assert cap.isOpened(), 'Cannot capture source'

# Class names and palette do not change between frames: load them once, and
# close the palette file afterwards.
# NOTE: unpickling can execute arbitrary code - only load a trusted "pallete"
# file.
classes = load_classes('data/coco.names')
with open("pallete", "rb") as palette_file:
    colors = pkl.load(palette_file)

frames = 0
start = time.time()
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No more frames, or the source failed.
            break

        img = prep_image(frame, inp_dim)
        # (width, height) of the frame, tiled to (w, h, w, h).
        im_dim = frame.shape[1], frame.shape[0]
        im_dim = torch.FloatTensor(im_dim).repeat(1,2)

        if CUDA:
            im_dim = im_dim.cuda()
            img = img.cuda()

        # NOTE(review): `volatile=True` is how pre-0.4 PyTorch disabled
        # autograd; newer releases ignore it with a warning and recommend
        # `with torch.no_grad():` - switch once the minimum supported PyTorch
        # version is confirmed.
        output = model(Variable(img, volatile = True), CUDA)
        output = write_results(output, confidence, num_classes, nms_conf = nms_thesh)

        # write_results hands back an int when nothing was detected: show
        # the untouched frame and move on.
        if isinstance(output, int):
            frames += 1
            print("FPS of the video is {:5.4f}".format( frames / (time.time() - start)))
            cv2.imshow("frame", frame)
            key = cv2.waitKey(1)
            if key & 0xFF == ord('q'):
                break
            continue

        # Keep the boxes inside the network input, then rescale them from
        # network-input coordinates to frame coordinates.
        output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(inp_dim))
        im_dim = im_dim.repeat(output.size(0), 1)/inp_dim
        output[:,1:5] *= im_dim

        # write() indexes `results` with the detection's image index and
        # takes the colour as its third argument, so the frame is wrapped in
        # a one-element list and a palette colour is passed explicitly.
        for detection in output:
            write(detection, [frame], random.choice(colors))

        cv2.imshow("frame", frame)
        key = cv2.waitKey(1)
        if key & 0xFF == ord('q'):
            break
        frames += 1
        print(time.time() - start)
        print("FPS of the video is {:5.2f}".format( frames / (time.time() - start)))
finally:
    # Release the capture device and close the preview window on every exit
    # path, including errors.
    cap.release()
    cv2.destroyAllWindows()

10 总结


In this series of tutorials, we have implemented an object detector from scratch, and cheers for reaching this far. I still think being able to churn out efficient code is one of the most underrated skills a deep learning practitioner can have. However revolutionary your idea may be, it's of no use unless you can test it. For that, you need to have strong coding skills.

I've also learned that the best way to learn about any topic in deep learning is to implement code. It forces you to pay attention to the minute yet fundamental subtleties of a topic that you may miss out on when you're reading a paper. I hope this tutorial series has served as an exercise in honing your skills as a deep learning practitioner.
