OpenCV实战——使用YOLO进行目标检测

0. 前言

在本节中，我们将使用 YOLO 算法执行目标检测。目标检测是计算机视觉中的一项常见任务，借助深度学习技术，我们可以实现高准确度的检测。YOLO 在 COCO 数据集(数据集中包含 80 个类别和超过 300000 张图像)中可以达到 60.6mAP (20 fps) 或 33mAP (220 fps)。

1. YOLO 模型简介

YOLO 是深度学习网络目标检测的一类重要分枝，其将输入图像划分为 SxS 网格。对于每个网格，YOLO 检查 B 个边界框，然后深度学习模型提取每个网格的边界框、包含可能对象的置信度以及每个边界框中(训练数据集中)每个类别的置信度：

OpenCV实战——使用YOLO进行目标检测-LMLPHP
YOLO 使用 19x19 个网格，每个网格包含 5 个边界框，训练数据集中包含 80 个类别。网络的输出结果为 19x19x425，其中 425 来自边界框 (x,y,width,height)、边界框中是否包含对象的置信度、对象属于每个类别(共 80 个类别)的置信度：

5_bounding box*(x,y,w,h,object_confidence,classify_confidence[80])=5*(4 + 1 + 80)

YOLO 架构基于 DarkNet (包含 53 层网络)，YOLO 在 DarkNet 的基础上增加了 53 层网络，共 106 层网络。如果我们需要预测速度更快的架构，可以使用包含较少网络层 TinyYOLO 架构。

2. 基于 YOLO 实现目标检测

在本节中，我们使用与深度学习简介一节相同的函数和类来加载模型、预处理图像和预测结果，同时介绍非极大值抑制 (non-maximum suppression, NMS)，以及绘制带有标签的预测结果：

(1) 创建 object_detection_yolo.cpp 文件，导入所需的头文件，初始化所需的全局变量：

#include <fstream>
#include <sstream>
#include <iostream>

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace cv;
using namespace dnn;
using namespace std;

// Initialize the parameters
float confThreshold = 0.5; // Confidence threshold
float nmsThreshold = 0.4;  // Non-maximum suppression threshold
int inpWidth = 416;  // Width of network's input image
int inpHeight = 416; // Height of network's input image
vector<string> classes;

(2) 我们从 main 函数开始，首先读取存储模型可以预测的所有类别的文件：

int main(int argc, char** argv) {
    // 加载类别名
    string classesFile = "data/coco.names";
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classes.push_back(line);

(3) 使用模型定义和权重文件加载模型：

    // 提供模型的配置和权重文件
    String modelConfiguration = "data/yolov3.cfg";
    String modelWeights = "data/yolov3.weights";
    // 加载网络
    Net net = readNetFromDarknet(modelConfiguration, modelWeights);

(4) 加载图像并将其转换为 blob：

    Mat input, blob;
    input= imread(argv[1]);
    if (input.empty()) {
        cout << "No input image" << endl;
        return 0;
    }
    // 创建输入
    blobFromImage(input, blob, 1/255.0, Size(inpWidth, inpHeight), Scalar(0,0,0), true, false);

(5) 使用 setInput 和 forward 函数检测所有对象及其类别：

    // 设定网络输入
    net.setInput(blob);
    // 执行前向传播
    vector<Mat> outs;
    net.forward(outs, getOutputsNames(net));

(6) 对输出结果进行后处理，绘制检测到的目标及预测置信度：

    // 移除低置信度边界框
    postprocess(input, outs);

(7) 在 postprocess 函数中，存储所有预测置信度高于 confThreshold 的边界框框：

    vector<int> classIds;
    vector<float> confidences;
    vector<Rect> boxes;
    for (size_t i = 0; i < outs.size(); ++i) {
        // 扫描网络输出的所有边界框，仅保留具有高置信度分数的边界框
        // 将边界框的类标签指定为边界框得分最高的类别
        float* data = (float*)outs[i].data;
        for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols) {
            Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
            Point classIdPoint;
            double confidence;
            // 获取最大分数的值和位置
            minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
            if (confidence > confThreshold) {
                int centerX = (int)(data[0] * frame.cols);
                int centerY = (int)(data[1] * frame.rows);
                int width = (int)(data[2] * frame.cols);
                int height = (int)(data[3] * frame.rows);
                int left = centerX - width / 2;
                int top = centerY - height / 2;
                
                classIds.push_back(classIdPoint.x);
                confidences.push_back((float)confidence);
                boxes.push_back(Rect(left, top, width, height));
            }
        }
    }

(8) 使用 NMSBoxes 函数应用非极大值抑制，只得到具有高置信度的非重叠边界框并进行绘制：

    // 执行非极大值抑制
    // 消除具有较低置信度的冗余重叠边界框
    vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
    for (size_t i = 0; i < indices.size(); ++i) {
        int idx = indices[i];
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
                 box.x + box.width, box.y + box.height, frame);
    }

使用 YOLO 执行目标检测的结果如下所示：

OpenCV实战——使用YOLO进行目标检测-LMLPHP

3. 完整代码

完整代码 object_detection_yolo.cpp 如下所示：

#include <fstream>
#include <sstream>
#include <iostream>

#include <opencv2/core/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;
using namespace dnn;
using namespace std;

// 初始化参数
float confThreshold = 0.5;  // 置信度阈值
float nmsThreshold = 0.4;   // 非极大值抑制阈值
int inpWidth = 416;         // 网络输入图像宽度
int inpHeight = 416;        // 网络输入图像高度
vector<string> classes;

// 绘制预测边界框
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame) {
    // 绘制显示边界框矩形
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(255, 255, 255), 1);
    // 获取类别名的标签及其置信度
    string conf_label = format("%.2f", conf);
    string label="";
    if (!classes.empty()) {
        label = classes[classId] + ":" + conf_label;
    }
    // 在边界框顶部显示标签
    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height), Point(left + labelSize.width, top + baseLine), Scalar(255, 255, 255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0,0,0),1,LINE_AA);
}

// 使用非最大值抑制移除置信度低的边界框
void postprocess(Mat& frame, const vector<Mat>& outs) {
    vector<int> classIds;
    vector<float> confidences;
    vector<Rect> boxes;
    for (size_t i = 0; i < outs.size(); ++i) {
        // 扫描网络输出的所有边界框，仅保留具有高置信度分数的边界框
        // 将边界框的类标签指定为边界框得分最高的类别
        float* data = (float*)outs[i].data;
        for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols) {
            Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
            Point classIdPoint;
            double confidence;
            // 获取最大分数的值和位置
            minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
            if (confidence > confThreshold) {
                int centerX = (int)(data[0] * frame.cols);
                int centerY = (int)(data[1] * frame.rows);
                int width = (int)(data[2] * frame.cols);
                int height = (int)(data[3] * frame.rows);
                int left = centerX - width / 2;
                int top = centerY - height / 2;
                
                classIds.push_back(classIdPoint.x);
                confidences.push_back((float)confidence);
                boxes.push_back(Rect(left, top, width, height));
            }
        }
    }
    
    // 执行非极大值抑制
    // 消除具有较低置信度的冗余重叠边界框
    vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
    for (size_t i = 0; i < indices.size(); ++i) {
        int idx = indices[i];
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
                 box.x + box.width, box.y + box.height, frame);
    }
}

// 获取输出层的名称
vector<String> getOutputsNames(const Net& net) {
    static vector<String> names;
    if (names.empty()) {
        // 获取输出层的索引
        vector<int> outLayers = net.getUnconnectedOutLayers();
        // 获取网络中所有层的名称
        vector<String> layersNames = net.getLayerNames();
        // 获取names变量中输出层的名称
        names.resize(outLayers.size());
        for (size_t i = 0; i < outLayers.size(); ++i) {
            names[i] = layersNames[outLayers[i] - 1];
        }
    }
    return names;
}

int main(int argc, char** argv) {
    // 加载类别名
    string classesFile = "data/coco.names";
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classes.push_back(line);
    // 提供模型的配置和权重文件
    String modelConfiguration = "data/yolov3.cfg";
    String modelWeights = "data/yolov3.weights";
    // 加载网络
    Net net = readNetFromDarknet(modelConfiguration, modelWeights);
    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setPreferableTarget(DNN_TARGET_CPU);
    
    Mat input, blob;
    input= imread(argv[1]);
    if (input.empty()) {
        cout << "No input image" << endl;
        return 0;
    }
    // 创建输入
    blobFromImage(input, blob, 1/255.0, Size(inpWidth, inpHeight), Scalar(0,0,0), true, false);
    // 设定网络输入
    net.setInput(blob);
    // 执行前向传播
    vector<Mat> outs;
    net.forward(outs, getOutputsNames(net));
    // 移除低置信度边界框
    postprocess(input, outs);
    vector<double> layersTimes;
    double freq = getTickFrequency() / 1000;
    double t = net.getPerfProfile(layersTimes) / freq;
    string label = format("Inference time for compute the image : %.2f ms", t);
    cout << label << endl;
    
    imshow("YOLOv3", input);
    waitKey(0);
    return 0;
}

盼小辉丶