Modifying YOLOv5's detect.py to detect and save videos one at a time, with per-video parameters handled independently

Posted by Xiashawuyanzu


To be honest, I never really understood the logic of YOLOv5's detect.py. I read the detect logic of YOLOv3 and YOLOv4, which basically use OpenCV to operate on each video directly, and that felt clearer and easier to follow. The YOLOv5 author seems not to use OpenCV for this at all, or has wrapped the OpenCV video handling in another .py file and hidden it away, which is rather opaque. So I went with the dumbest approach: use os.listdir to read every video under the video directory and detect them one by one. I also rewrote the box-drawing function (because I need to save the content of one key frame). The detection command is python detect.py --exist-ok --nosave; since the command carries the --nosave option, I dug a little into the author's drawing logic and found it still comes down to OpenCV's rectangle method (the author buried it fairly deep). The full modified script:

import numpy as np
import argparse
import os
import sys
from pathlib import Path
import time
import shutil
from PIL import Image
import cv2
import torch
import torch.backends.cudnn as cudnn

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

from models.common import DetectMultiBackend
from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
                           increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
from utils.plots import Annotator, colors, save_one_box
from utils.torch_utils import select_device, time_sync


@torch.no_grad()
def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
        vidpath='/home/ccf_disk/animal/test/',  # file/dir/URL/glob, 0 for webcam
        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
        imgsz=(640, 640),  # inference size (height, width)
        conf_thres=0.6,  # confidence threshold
        iou_thres=0.45,  # NMS IOU threshold
        max_det=1000,  # maximum detections per image
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        view_img=False,  # show results
        save_txt=False,  # save results to *.txt
        save_conf=False,  # save confidences in --save-txt labels
        save_crop=False,  # save cropped prediction boxes
        nosave=True,  # do not save images/videos
        classes=None,  # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,  # class-agnostic NMS
        augment=False,  # augmented inference
        visualize=False,  # visualize features
        update=False,  # update all models
        project='/home/ccf_disk/animal/video_animal',  # save results to project/name
        name='test_1',  # save results to project/name
        exist_ok=True,  # existing project/name ok, do not increment
        line_thickness=3,  # bounding box thickness (pixels)
        hide_labels=False,  # hide labels
        hide_conf=False,  # hide confidences
        half=False,  # use FP16 half-precision inference
        dnn=False,  # use OpenCV DNN for ONNX inference
        ):
    vidpath = str(vidpath)
    videos = os.listdir(vidpath)
    number = 0
    for video_name in videos:
        time1_start = time.time()
        so = vidpath + video_name
        number = number + 1
        print("第%d个视频处理中" %number)
        source = str(so)
        save_c = 0
        keep = 0
        save_img = not nosave and not source.endswith('.txt')  # save inference images
        is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
        is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
        webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
        if is_url and is_file:
            source = check_file(source)  # download

        # Directories
        save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
        (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

        # Load model
        device = select_device(device)
        model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data)
        stride, names, pt, jit, onnx, engine = model.stride, model.names, model.pt, model.jit, model.onnx, model.engine
        imgsz = check_img_size(imgsz, s=stride)  # check image size

        # Half
        half &= (pt or jit or onnx or engine) and device.type != 'cpu'  # FP16 supported on limited backends with CUDA
        if pt or jit:
            model.model.half() if half else model.model.float()

        # Dataloader
        if webcam:
            view_img = check_imshow()
            cudnn.benchmark = True  # set True to speed up constant image size inference
            dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
            bs = len(dataset)  # batch_size
        else:
            dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
            bs = 1  # batch_size
        vid_path, vid_writer = [None] * bs, [None] * bs

        # Run inference
        model.warmup(imgsz=(1 if pt else bs, 3, *imgsz), half=half)  # warmup
        dt, seen = [0.0, 0.0, 0.0], 0
        for path, im, im0s, vid_cap, s in dataset:
            flag = 0
            c = 1
            time1 = 6  # sampling period: the detection counter is updated every 6 frames
            # t1 = time_sync()
            im = torch.from_numpy(im).to(device)
            im = im.half() if half else im.float()  # uint8 to fp16/32
            im /= 255  # 0 - 255 to 0.0 - 1.0
            if len(im.shape) == 3:
                im = im[None]  # expand for batch dim
            # t2 = time_sync()
            # dt[0] += t2 - t1

            # Inference

            visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
            pred = model(im, augment=augment, visualize=visualize)
            # t3 = time_sync()
            # dt[1] += t3 - t2

            # NMS
            pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

            # dt[2] += time_sync() - t3

            # Second-stage classifier (optional)
            # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)

            # Process predictions
            for i, det in enumerate(pred):  # per image
                seen += 1
                count = 0
                if webcam:  # batch_size >= 1
                    p, im0, frame = path[i], im0s[i].copy(), dataset.count
                    s += f'{i}: '
                else:
                    p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)

                p = Path(p)  # to Path
                save_path = str(save_dir / p.name)  # im.jpg
                txt_path = str(save_dir / 'labels' / p.stem) + (
                    '' if dataset.mode == 'image' else f'_{frame}')  # im.txt
                s += '%gx%g ' % im.shape[2:]  # print string
                gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
                imc = im0.copy() if save_crop else im0  # for save_crop
                annotator = Annotator(im0, line_width=line_thickness, example=str(names))
                if len(det):
                    # Rescale boxes from img_size to im0 size
                    det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()

                    # Print results
                    for c in det[:, -1].unique():
                        n = (det[:, -1] == c).sum()  # detections per class
                        s += f"n names[int(c)]'s' * (n > 1), "  # add to string

                    # Write results
                    for *xyxy, conf, cls in reversed(det):
                        count = 1
                        if save_txt:  # Write to file
                            xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                            line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                            with open(txt_path + '.txt', 'a') as f:
                                f.write(('%g ' * len(line)).rstrip() % line + '\n')

                        if save_img or save_crop or view_img:  # Add bbox to image
                            c = int(cls)  # integer class
                            label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                            annotator.box_label(xyxy, label, color=colors(c, True))
                            if save_crop:
                                save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)
                        # Draw the box with OpenCV directly as well: with --nosave the
                        # Annotator branch above is skipped, but the saved key frame
                        # should still carry its boxes and labels
                        box = xyxy
                        c = int(cls)  # integer class
                        p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
                        lw = max(round(sum(im0.shape) / 2 * 0.003), 2)  # line width scaled to the image size
                        cv2.rectangle(im0, p1, p2, color=(0, 0, 255), thickness=lw, lineType=cv2.LINE_AA)
                        label = f'{names[c]} {conf:.2f}'
                        tf = max(lw - 1, 1)  # font thickness
                        w, h = cv2.getTextSize(label, 0, fontScale=lw / 3, thickness=tf)[0]  # text width, height
                        outside = p1[1] - h - 3 >= 0  # label fits outside box
                        cv2.putText(im0, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), 0, lw / 3,
                                    (0, 0, 255), thickness=tf, lineType=cv2.LINE_AA)

                # Stream results
                im0 = annotator.result()
                if view_img:
                    cv2.imshow(str(p), im0)
                    cv2.waitKey(1)  # 1 millisecond
                # Every `time1` frames, update the consecutive-detection counter:
                # reset it on a miss, increment it on a hit
                if seen % time1 == 0:
                    if count == 0:
                        save_c = 0
                    else:
                        save_c = save_c + 1
                # Four consecutive sampled hits: treat the video as containing an animal,
                # save one annotated key frame plus a copy of the source video, then stop
                if save_c >= 4:
                    if keep == 0:
                        im0 = cv2.cvtColor(im0, cv2.COLOR_BGR2RGB)
                        frame = Image.fromarray(np.uint8(im0))
                        frame.save(str(save_path.split('.')[0]) + ".jpg")
                        keep = 1
                        shutil.copy(so, save_path)
                        print('have animal')
                        break
            else:
                continue  # the frame produced no confirmed hit: move on to the next frame
            break  # propagate the inner break: stop reading frames from this video


            # # Save results (image with detections)
            # if save_img:
            #     if dataset.mode == 'image':
            #         cv2.imwrite(save_path, im0)
            #     else:  # 'video' or 'stream'
            #         if vid_path[i] != save_path:  # new video
            #             vid_path[i] = save_path
            #             if isinstance(vid_writer[i], cv2.VideoWriter):
            #                 vid_writer[i].release()  # release previous video writer
            #             if vid_cap:  # video
            #                 fps = vid_cap.get(cv2.CAP_PROP_FPS)
            #                 w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            #                 h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            #             else:  # stream
            #                 fps, w, h = 30, im0.shape[1], im0.shape[0]
            #             save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
            #             vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
            #         vid_writer[i].write(im0)

            # Print time (inference-only)
            # LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')

        # Print results
        # t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
        # LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
        if save_txt or save_img:
            s = f"\\nlen(list(save_dir.glob('labels/*.txt'))) labels saved to save_dir / 'labels'" if save_txt else ''
            # LOGGER.info(f"Results saved to colorstr('bold', save_dir)s")
        if update:
            strip_optimizer(weights)  # update model (to fix SourceChangeWarning)

        time1_end = time.time()
        print('Video %d processed in %.2fs' % (number, time1_end - time1_start))
        # if bool == True:
        #     shutil.copy(so, save_path)
        # else:
        #     pass


def parse_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'weights/best.pt', help='model path(s)')
    parser.add_argument('--vidpath', type=str, default='/home/ccf_disk/animal/video/4-3/',
                        help='file/dir/URL/glob, 0 for webcam')
    parser.add_argument('--data', type=str, default=ROOT / 'data/myvoc.yaml', help='(optional) dataset.yaml path')
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
    parser.add_argument('--conf-thres', type=float, default=0.75, help='confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='show results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --classes 0, or --classes 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default='/home/ccf_disk/animal/video_animal_yolov5/', help='save results to project/name')
    parser.add_argument('--name', default='4-3', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
    opt = parser.parse_args()
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
    print_args(FILE.stem, opt)
    return opt


def main(opt):
    check_requirements(exclude=('tensorboard', 'thop'))
    run(**vars(opt))


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)

This is my first blog post, so this is just a quick write-up.

yolov5-master code walkthrough notes — the detect module


This post takes detect.py as its main thread and traces the code from the top, step by step, to understand how detect runs.

Contents

detect.py:

common.py (models):

    DetectMultiBackend:       (line 279)

datasets.py (utils):

       LoadImages:                      (line 178)

augmentations.py (utils):

       letterbox:                                  (line 91)

plots.py (utils):

       Annotator:                               (line 68)


detect.py:

FILE                 absolute path of the current file

ROOT                 path of the whole yolov5 project (if package imports cannot be found after the project has been downloaded, moved, or updated, check whether this path is correct)

parse_opt            defines the arguments and returns opt (which stores all parameter information)

main                 ① checks the dependencies listed in requirements

                     ② calls run

run

                        1. Determine what kind of source is passed in (see the snippet just below)

                                ① is_file: checks whether the input file suffix is among the supported formats (datasets.py)

                                ② webcam: false here
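
For reference, the corresponding checks in the stock detect.py (they reappear with comments in the annotated source further down):

is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)   # file with a supported suffix?
is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))  # network stream?
webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
if is_url and is_file:
    source = check_file(source)  # download a remote file before running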

                         2. Directories: create the folder where results will be saved

# Directories
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

                         3. Load model: load the network weights

# Load model
    device = select_device(device)
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
    stride, names, pt = model.stride, model.names, model.pt
    imgsz = check_img_size(imgsz, s=stride)  # check image size

                                ① device: select the device (GPU, CPU, etc.)

                                ② model: (weights, coco.yaml) resolves the backend framework (PyTorch, TorchScript, etc.)

                                            DetectMultiBackend (common.py)

                                ③ reads the model attributes

                                ④ imgsz: ensures the image size is a multiple of 32; if not, the nearest multiple of 32 is computed automatically (see the small example below)
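
A minimal illustration of that rounding (the sizes here are made up):

from utils.general import check_img_size
print(check_img_size([650, 640], s=32))  # -> [672, 640]: 650 is rounded up to the next multiple of 32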

                         4. Dataloader: load the images or frames to be predicted

# Dataloader
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
        bs = len(dataset)  # batch_size
    else:
        dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
        bs = 1  # batch_size
    vid_path, vid_writer = [None] * bs, [None] * bs

                                dataset = LoadImages (datasets.py) is initialized here

                         5. Run inference: feed the model, collect the predictions, and draw the detection boxes:

                                Initialization:

                                       warmup: runs one blank image through the model to warm up the GPU

                                       iterate over dataset (LoadImages):

                                       im: numpy image converted into a tensor format PyTorch supports

                                              /= 255: normalization

                                       batch dimension expanded

                                Inference: prediction

                                       visualize (default false): if true, saves the feature maps from the forward pass

                                       pred:                         detection boxes

                                                 augment: applies test-time augmentation to the inference, at the cost of speed

                                                 [1, 18900, 85]: 85 = 4 box coordinates + 1 confidence + 80 class probabilities (where 18900 comes from is worked out just below)

                                NMS:   non-maximum suppression

                                       pred: [1, 5, 6]: 6 = 4 coordinates + 1 confidence + 1 class
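
The first dimension of the raw prediction tensor is not a constant: it equals 3 anchors per cell times the total number of grid cells at strides 8/16/32, so it depends on the letterboxed input size. A quick sanity check, assuming a 480×640 input:

h, w = 480, 640
cells = sum((h // s) * (w // s) for s in (8, 16, 32))  # 4800 + 1200 + 300 = 6300
print(3 * cells)  # 18900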

                                Process:

                                       det: [5, 6] — 5 boxes, each with 4 coordinates, 1 confidence, 1 class

                                       seen: counter of processed images

                                       save_path: path where the image is saved

                                       txt_path: txt files are not saved by default

                                       s: print string

                                       gn: width and height of the original image, used when saving txt labels

                                       imc: whether the detection boxes are cropped and saved

                                       annotator (plots.py): draws on the original image

                                       if len(det): draw the boxes

                                              det[]: maps coordinates from the resized image back to the original

                                              iterate over all boxes: n counts the boxes per class -> appended to the print string s

                                       write results: choose how to save

                                              add bbox to image (the default path):

                                                     label: the hide_labels and hide_conf arguments control whether the

                                                            label and confidence are printed

                                                     annotator.box_label: draws the box

                                                     save_crop: default false; whether to save the cropped detection box

                                       stream:       (view_img) display the results

                                       save_img:    save the image

                         6. Print results: print the summary output

 # Print results
    t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
    if save_txt or save_img:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
    if update:
        strip_optimizer(weights)  # update model (to fix SourceChangeWarning)

                                       t:  average prediction time per image

                                              seen: number of images predicted; dt: time spent per image

                                              LOGGER.info: logging

Annotated detect.py source:

There are small modifications; read it against your own local project files.

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Run inference on images, videos, directories, streams, etc.

Usage - sources:
    $ python path/to/detect.py --weights yolov5s.pt --source 0              # webcam
                                                             img.jpg        # image
                                                             vid.mp4        # video
                                                             path/          # directory
                                                             path/*.jpg     # glob
                                                             'https://youtu.be/Zgi9g1ksQHc'  # YouTube
                                                             'rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP stream

Usage - formats:
    $ python path/to/detect.py --weights yolov5s.pt                 # PyTorch
                                         yolov5s.torchscript        # TorchScript
                                         yolov5s.onnx               # ONNX Runtime or OpenCV DNN with --dnn
                                         yolov5s.xml                # OpenVINO
                                         yolov5s.engine             # TensorRT
                                         yolov5s.mlmodel            # CoreML (MacOS-only)
                                         yolov5s_saved_model        # TensorFlow SavedModel
                                         yolov5s.pb                 # TensorFlow GraphDef
                                         yolov5s.tflite             # TensorFlow Lite
                                         yolov5s_edgetpu.tflite     # TensorFlow Edge TPU
"""

import argparse
import os
import sys
from pathlib import Path

import cv2
import torch
import torch.backends.cudnn as cudnn

FILE = Path(__file__).resolve()      # absolute path of the current file
# path of the whole yolov5 project
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

from models.common import DetectMultiBackend
from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
                           increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
from utils.plots import Annotator, colors, save_one_box
from utils.torch_utils import select_device, time_sync


@torch.no_grad()
def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
        source=ROOT / 'data/images',  # file/dir/URL/glob, 0 for webcam
        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
        imgsz=(640, 640),  # inference size (height, width)
        conf_thres=0.25,  # confidence threshold
        iou_thres=0.45,  # NMS IOU threshold
        max_det=1000,  # maximum detections per image
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        view_img=False,  # show results
        save_txt=False,  # save results to *.txt
        save_conf=False,  # save confidences in --save-txt labels
        save_crop=False,  # save cropped prediction boxes
        nosave=False,  # do not save images/videos
        classes=None,  # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,  # class-agnostic NMS
        augment=False,  # augmented inference
        visualize=False,  # visualize features
        update=False,  # update all models
        project=ROOT / 'runs/detect',  # save results to project/name
        name='exp',  # save results to project/name
        exist_ok=False,  # existing project/name ok, do not increment
        line_thickness=3,  # bounding box thickness (pixels)
        hide_labels=False,  # hide labels
        hide_conf=False,  # hide confidences
        half=False,  # use FP16 half-precision inference
        dnn=False,  # use OpenCV DNN for ONNX inference
        ):
    source = str(source)
    save_img = not nosave and not source.endswith('.txt')  # save inference images
    # is the input a file path, and does its suffix match a supported format?
    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))   # is it a network stream URL? (false here)
    # is the source 0, i.e. open the webcam? false by default
    webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
    if is_url and is_file:
        source = check_file(source)  # download

    # Directories
    # create the folder where results are saved
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Load model
    device = select_device(device)   # select the device: GPU, CPU, etc.
    # DetectMultiBackend resolves the backend framework (PyTorch, TorchScript, etc.)
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
    stride, names, pt = model.stride, model.names, model.pt         # read the model attributes
    # ensure the image size is a multiple of 32; if not, the nearest valid size is computed automatically
    imgsz = check_img_size(imgsz, s=stride)  # check image size

    # Dataloader
    # load the images to run prediction on
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
        bs = len(dataset)  # batch_size
    else:
        # initialization
        dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
        bs = 1  # batch_size
    vid_path, vid_writer = [None] * bs, [None] * bs

    # Run inference
    # feed the model, produce predictions, and draw the detection boxes
    # push one blank image through the model to warm up the GPU
    model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0
    for path, im, im0s, vid_cap, s in dataset:
        t1 = time_sync()
        # convert the numpy image to a torch tensor
        im = torch.from_numpy(im).to(device)
        im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
        # normalize the image
        im /= 255  # 0 - 255 to 0.0 - 1.0
        # expand the batch dimension
        if len(im.shape) == 3:
            im = im[None]  # expand for batch dim
        t2 = time_sync()
        dt[0] += t2 - t1

        # Inference
        # if true, save the feature maps produced during inference
        visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
        pred = model(im, augment=augment, visualize=visualize)      # detection boxes, e.g. [1, 18900, 85]
        # 85 = 4 box coordinates + 1 confidence + 80 class probabilities
        # augment applies test-time augmentation, at the cost of slower inference
        t3 = time_sync()
        dt[1] += t3 - t2

        # NMS
        # non-maximum suppression
        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)     # e.g. [1, 5, 6]
        # [1, 5, 6]: 6 = 4 coordinates + 1 confidence + 1 class
        dt[2] += time_sync() - t3

        # Second-stage classifier (optional)
        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)

        # Process predictions
        # det: [5, 6] — 5 boxes, each with 4 coordinates, 1 confidence, 1 class
        for i, det in enumerate(pred):  # per image
            # counter of processed images
            seen += 1
            if webcam:  # batch_size >= 1
                p, im0, frame = path[i], im0s[i].copy(), dataset.count
                s += f'{i}: '
            else:
                p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)

            p = Path(p)  # to Path
            # path where the image will be saved
            save_path = str(save_dir / p.name)  # im.jpg
            # txt files are not saved by default
            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
            s += '%gx%g ' % im.shape[2:]  # print string
            # width and height of the original image, used when saving txt labels
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            # whether the detection boxes are cropped and saved
            imc = im0.copy() if save_crop else im0  # for save_crop
            # draws on the original image
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
            # draw the boxes
            if len(det):
                # Rescale boxes from img_size to im0 size
                # map the coordinates from the resized image back onto the original
                det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                # iterate over the detected classes: n counts the boxes per class for the print string s
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

                # Write results
                # choose how results are saved (by default the second path, i.e. not txt)
                for *xyxy, conf, cls in reversed(det):
                    if save_txt:  # Write to file
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                        with open(txt_path + '.txt', 'a') as f:
                            f.write(('%g ' * len(line)).rstrip() % line + '\n')

                    if save_img or save_crop or view_img:  # Add bbox to image
                        c = int(cls)  # integer class
                        # whether to print the label and confidence
                        label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                        # draw the box
                        annotator.box_label(xyxy, label, color=colors(c, True))
                        # whether to save the cropped detection box (default false)
                        if save_crop:
                            save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)

            # Stream results
            im0 = annotator.result()
            if view_img:
                cv2.imshow(str(p), im0)
                cv2.waitKey(1)  # 1 millisecond

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'image':
                    cv2.imwrite(save_path, im0)
                else:  # 'video' or 'stream'
                    if vid_path[i] != save_path:  # new video
                        vid_path[i] = save_path
                        if isinstance(vid_writer[i], cv2.VideoWriter):
                            vid_writer[i].release()  # release previous video writer
                        if vid_cap:  # video
                            fps = vid_cap.get(cv2.CAP_PROP_FPS)
                            w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                            h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        else:  # stream
                            fps, w, h = 30, im0.shape[1], im0.shape[0]
                        save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
                        vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                    vid_writer[i].write(im0)

        # Print time (inference-only)
        LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')

    # Print results
    # average prediction time per image: seen = number of images, dt = per-stage times
    t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
    if save_txt or save_img:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
    if update:
        strip_optimizer(weights)  # update model (to fix SourceChangeWarning)

# define the arguments and return opt (which stores all parameter information)
def parse_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'runs/train//exp13/weights/best.pt', help='model path(s)')
    parser.add_argument('--source', type=str, default=ROOT / 'C:/Users/Pictures/Saved Pictures/read11.avi',  help='file/dir/URL/glob, 0 for webcam')
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco.yaml', help='(optional) dataset.yaml path')
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[960, 540], help='inference size h,w')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='show results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --classes 0, or --classes 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default=ROOT / 'runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
    opt = parser.parse_args()
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
    print_args(FILE.stem, opt)
    return opt


def main(opt):
    # check the dependencies listed in requirements
    check_requirements(exclude=('tensorboard', 'thop'))
    run(**vars(opt))


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)

common.py (models):

    DetectMultiBackend:       (line 279)

        w                             if weights is a list, its first element is taken as the path to load

        model_type             determines the model format (pt, jit, etc.) and runs the matching loading branch

        fp16                         half-precision inference

        if data                      loads the passed yaml file to obtain the class names
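
A minimal usage sketch (the weights path, data yaml, and empty device string are illustrative):

from models.common import DetectMultiBackend
from utils.torch_utils import select_device

device = select_device('')  # '' -> first CUDA device if available, otherwise CPU
model = DetectMultiBackend('yolov5s.pt', device=device, data='data/coco128.yaml')
print(model.stride, model.pt, model.names[0])  # stride, PyTorch-backend flag, first class name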

datasets.py (utils):

       LoadImages:                      (line 178)

         p:                            resolves the relative path into an absolute one,

                                        then checks whether it contains *, is a directory, or is a single file

         images/videos:      reads the file suffixes and checks them against the allowed extensions

         nf:                         total number of files

         count:                   counter over the files, used as the index

         img0:                    the original image as read from disk

         s:                           print string saying which image is being processed, used for later printing

         img:                      image resized by augmentations.py -> letterbox (width and height must be multiples of 32)

         vid_cap:                cv2.VideoCapture for videos, None for images

         convert:                HWC BGR -> CHW RGB, contiguous array
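
A small sketch of consuming LoadImages the way run() does (the directory path is hypothetical):

from utils.datasets import LoadImages

dataset = LoadImages('data/images', img_size=640, stride=32, auto=True)
for path, im, im0, vid_cap, s in dataset:
    # im: letterboxed CHW RGB array ready for the model; im0: the original BGR image
    print(s, im.shape, im0.shape)
    break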

augmentations.py (utils):

       letterbox:                                  (line 91)

         r:                       scale factor driven by the long side (long / 640)

         pad the image:

         if auto:              when auto is true (the default), the width and height are checked against multiples of 32 and the image is padded only up to the next multiple, not to the full square
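
A quick way to see the auto behaviour (the dummy frame size is arbitrary):

import numpy as np
from utils.augmentations import letterbox

frame = np.zeros((480, 640, 3), dtype=np.uint8)        # dummy 480x640 BGR frame
img, ratio, (dw, dh) = letterbox(frame, 640, stride=32, auto=True)
print(img.shape, ratio, (dw, dh))                       # (480, 640, 3) (1.0, 1.0) (0.0, 0.0): already stride-aligned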

plots.py (utils):

       Annotator:                               (line 68)

         initialization:

                     if-else:        draws with opencv by default

                     box_label:   draws the box and its label
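
And a minimal Annotator sketch on a dummy image (the box and label values are made up):

import numpy as np
from utils.plots import Annotator, colors

im0 = np.zeros((480, 640, 3), dtype=np.uint8)
annotator = Annotator(im0, line_width=3, example='person')
annotator.box_label([50, 60, 200, 220], 'person 0.87', color=colors(0, True))
im0 = annotator.result()  # annotated image, ready for cv2.imshow / cv2.imwrite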

These notes are intended purely as a code walkthrough, not as a critique of the project or as research on it. If anything here is misunderstood, corrections from readers are very welcome.
