前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >Yolov8 源码解析(四十)

Yolov8 源码解析(四十)

作者头像
ApacheCN_飞龙
发布2024-09-13 17:19:43
1660
发布2024-09-13 17:19:43
举报
文章被收录于专栏:信数据得永生

.\yolov8\ultralytics\utils\benchmarks.py

代码语言:python
复制
# 从 glob 模块中导入 glob 函数,用于文件路径的模糊匹配
import glob
# 导入 os 模块,提供了许多与操作系统交互的函数
import os
# 导入 platform 模块,用于获取系统平台信息
import platform
# 导入 re 模块,支持正则表达式操作
import re
# 导入 shutil 模块,提供了高级的文件操作功能
import shutil
# 导入 time 模块,提供时间相关的功能
import time
# 从 pathlib 模块中导入 Path 类,用于操作文件路径
from pathlib import Path

# 导入 numpy 库,用于数值计算
import numpy as np
# 导入 torch.cuda 模块,用于 CUDA 相关操作
import torch.cuda
# 导入 yaml 库,用于处理 YAML 格式的文件
import yaml

# 从 ultralytics 包中导入 YOLO 和 YOLOWorld 类
from ultralytics import YOLO, YOLOWorld
# 从 ultralytics.cfg 模块中导入 TASK2DATA 和 TASK2METRIC 变量
from ultralytics.cfg import TASK2DATA, TASK2METRIC
# 从 ultralytics.engine.exporter 模块中导入 export_formats 函数
from ultralytics.engine.exporter import export_formats
# 从 ultralytics.utils 模块中导入 ARM64, ASSETS, IS_JETSON, IS_RASPBERRYPI 等变量
from ultralytics.utils import ARM64, ASSETS, IS_JETSON, IS_RASPBERRYPI, LINUX, LOGGER, MACOS, TQDM, WEIGHTS_DIR
# 从 ultralytics.utils.checks 模块中导入 IS_PYTHON_3_12, check_requirements, check_yolo 等函数和变量
from ultralytics.utils.checks import IS_PYTHON_3_12, check_requirements, check_yolo
# 从 ultralytics.utils.downloads 模块中导入 safe_download 函数
from ultralytics.utils.downloads import safe_download
# 从 ultralytics.utils.files 模块中导入 file_size 函数
from ultralytics.utils.files import file_size
# 从 ultralytics.utils.torch_utils 模块中导入 select_device 函数
from ultralytics.utils.torch_utils import select_device


def benchmark(
    model=WEIGHTS_DIR / "yolov8n.pt", data=None, imgsz=160, half=False, int8=False, device="cpu", verbose=False
):
    """
    Benchmark a YOLO model across different formats for speed and accuracy.

    NOTE(review): as written in this listing the function body only executes ``pass`` and then defines a nested
    function that is never called, so calling this function returns None. The nested function appears to hold
    fragments of the real implementation — confirm against the upstream source before relying on it.

    Args:
        model (str | Path | optional): Path to the model file or directory. Default is
            Path(SETTINGS['weights_dir']) / 'yolov8n.pt'.
        data (str, optional): Dataset to evaluate on, inherited from TASK2DATA if not passed. Default is None.
        imgsz (int, optional): Image size for the benchmark. Default is 160.
        half (bool, optional): Use half-precision for the model if True. Default is False.
        int8 (bool, optional): Use int8-precision for the model if True. Default is False.
        device (str, optional): Device to run the benchmark on, either 'cpu' or 'cuda'. Default is 'cpu'.
        verbose (bool | float | optional): If True or a float, assert benchmarks pass with given metric.
            Default is False.
    """
    # Function body placeholder: the parameters are described in the docstring above.
    pass  # placeholder statement; nothing below this line is executed when the outer function is called
    # NOTE(review): the nested definition below shadows the outer name locally and is never invoked.
    def benchmark(model='yolov8n.pt', imgsz=640):
        """
        Benchmark function to evaluate model performance.

        NOTE(review): this nested function reads `data` and `verbose` from the enclosing scope, reads a name
        `key` that is not defined anywhere in the visible code, and assigns to `device` before reading it
        (which makes `device` a local name here). It is never called in this listing.

        Args:
            model (str or Path): Path to the model checkpoint.
            imgsz (int): Image size for inference.

        Returns:
            df (pandas.DataFrame): A pandas DataFrame with benchmark results for each format, including file size,
                metric, and inference time.

        Example:
            ```python
            from ultralytics.utils.benchmarks import benchmark

            benchmark(model='yolov8n.pt', imgsz=640)
            ```
        """
        import pandas as pd  # Imported lazily so pandas is only needed when benchmarking
        pd.options.display.max_columns = 10  # Widen the DataFrame printout: up to 10 columns
        pd.options.display.width = 120  # ... and up to 120 characters per line

        device = select_device(device, verbose=False)  # Resolve the requested device
        if isinstance(model, (str, Path)):
            model = YOLO(model)  # Load the model when a path/name was given instead of a model object

        is_end2end = getattr(model.model.model[-1], "end2end", False)  # Flag read from the last layer (unused below)

        y = []  # Result rows; nothing in the visible code appends to it
        t0 = time.time()  # Start time used for the elapsed-seconds figure in the summary

        check_yolo(device=device)  # presumably prints system information — helper not visible here

        # Build the results table; `key` names the metric column but is not defined in the visible code
        df = pd.DataFrame(y, columns=["Format", "Status❔", "Size (MB)", key, "Inference time (ms/im)", "FPS"])

        name = Path(model.ckpt_path).name  # File name of the model checkpoint
        # Summary text: model name, dataset, image size, elapsed time, and the table itself
        s = f"\nBenchmarks complete for {name} on {data} at imgsz={imgsz} ({time.time() - t0:.2f}s)\n{df}\n"
        LOGGER.info(s)  # Emit the summary through the package logger

        with open("benchmarks.log", "a", errors="ignore", encoding="utf-8") as f:
            f.write(s)  # Append the same summary to 'benchmarks.log' in the current directory

        if verbose and isinstance(verbose, float):
            metrics = df[key].array  # Values of the metric column
            floor = verbose  # A float `verbose` doubles as the minimum acceptable metric
            # Every non-NaN metric must exceed the floor
            assert all(x > floor for x in metrics if pd.notna(x)), f"Benchmark failure: metric(s) < floor {floor}"

        return df  # The results DataFrame
class RF100Benchmark:
    """Benchmark YOLO model performance across formats for speed and accuracy."""

    def __init__(self):
        """Function for initialization of RF100Benchmark."""
        # 初始化空列表,用于存储数据集名称
        self.ds_names = []
        # 初始化空列表,用于存储数据集配置文件路径
        self.ds_cfg_list = []
        # 初始化 RF 对象为 None
        self.rf = None
        # 定义验证指标列表
        self.val_metrics = ["class", "images", "targets", "precision", "recall", "map50", "map95"]

    def set_key(self, api_key):
        """
        Create the Roboflow client and store it on the instance.

        Args:
            api_key (str): Roboflow API key passed to the client constructor.
        """
        # Verify the roboflow package requirement before importing it.
        check_requirements("roboflow")
        import roboflow

        self.rf = roboflow.Roboflow(api_key=api_key)

    def parse_dataset(self, ds_link_txt="datasets_links.txt"):
        """
        Download the dataset link list and fetch every dataset it references.

        A fresh ``rf-100`` directory is created (any existing one is removed), the process working directory is
        changed into it, and ``datasets_links.txt`` is downloaded there. Each line of ``ds_link_txt`` is then
        split on ``/`` into workspace, project and version, and the dataset is downloaded through the Roboflow
        client unless a ``<project>-<version>`` directory already exists.

        A dataset is only recorded after its line parsed and its download did not raise, so ``self.ds_names``
        and ``self.ds_cfg_list`` always have matching indices. Malformed lines and failed downloads are logged
        and skipped.

        Args:
            ds_link_txt (str): Path to dataset_links file, resolved after changing into ``rf-100``.

        Returns:
            (tuple): ``(self.ds_names, self.ds_cfg_list)`` — project names and the matching ``data.yaml`` paths.
        """
        # Start from an empty rf-100 directory.
        if os.path.exists("rf-100"):
            shutil.rmtree("rf-100")
        os.mkdir("rf-100")
        # NOTE: the working directory change is permanent; every relative path below is inside rf-100.
        os.chdir("rf-100")
        os.mkdir("ultralytics-benchmarks")
        safe_download("https://github.com/ultralytics/assets/releases/download/v0.0.0/datasets_links.txt")

        with open(ds_link_txt, "r", encoding="utf-8") as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    continue  # blank lines carry no dataset

                # Expected shape after splitting on runs of '/': scheme, host, workspace, project, version.
                try:
                    _, _host, workspace, project, version = re.split("/+", line)
                except ValueError:
                    LOGGER.warning(f"Skipping malformed dataset link: {line!r}")
                    continue

                proj_version = f"{project}-{version}"
                try:
                    if not Path(proj_version).exists():
                        self.rf.workspace(workspace).project(project).version(version).download("yolov8")
                    else:
                        print("Dataset already downloaded.")
                except Exception as e:  # best-effort: one failing dataset must not abort the whole run
                    LOGGER.warning(f"Skipping dataset {proj_version}: download failed ({e})")
                    continue

                # Record name and config together so the two lists stay index-aligned.
                self.ds_names.append(project)
                self.ds_cfg_list.append(Path.cwd() / proj_version / "data.yaml")

        return self.ds_names, self.ds_cfg_list

    @staticmethod
    def fix_yaml(path):
        """
        Rewrite the ``train`` and ``val`` entries of a dataset YAML file in place.

        Args:
            path (str): YAML file path.
        """
        with open(path, "r") as src:
            config = yaml.safe_load(src)

        # Point both splits at fixed relative image folders.
        for split, rel_dir in (("train", "train/images"), ("val", "valid/images")):
            config[split] = rel_dir

        with open(path, "w") as dst:
            yaml.safe_dump(config, dst)
    def evaluate(self, yaml_path, val_log_file, eval_log_file, list_ind):
        """
        Extract the mAP50 value from a validation log and append it to the evaluation log.

        Each log line is split on spaces. A line is kept as a result row when one of its tokens is a class name
        from the dataset YAML, or when it contains the token ``all`` and neither ``(AP)`` nor ``(AR)``. The
        first seven tokens of a kept line are stored as class, images, targets, precision, recall, map50 and
        map95 (values stay strings, exactly as they appear in the log).

        With more than one row, the ``map50`` of the ``all`` row is reported; with exactly one row, that row's
        ``map50`` is reported; with no rows, ``0.0`` is reported and a warning is logged instead of raising.

        Args:
            yaml_path (str): YAML file path; its ``names`` entry supplies the class names.
            val_log_file (str): val_log_file path.
            eval_log_file (str): eval_log_file path; the result line is appended to it.
            list_ind (int): Index for current dataset (into ``self.ds_names``).
        """
        # Lines containing any of these symbols are log decoration, not result rows.
        skip_symbols = ["🚀", "⚠️", "💡", "❌"]

        with open(yaml_path) as stream:
            # assumes `names` is a list of strings — TODO confirm; a dict would be matched by key
            class_names = yaml.safe_load(stream)["names"]

        eval_lines = []
        with open(val_log_file, "r", encoding="utf-8") as f:
            for line in f:
                if any(symbol in line for symbol in skip_symbols):
                    continue

                # Split on single spaces, drop the empty pieces, then trim trailing newlines.
                entries = [e.strip("\n") for e in line.split(" ") if e != ""]

                is_summary = "all" in entries and "(AP)" not in entries and "(AR)" not in entries
                if not (is_summary or any(e in class_names for e in entries)):
                    continue
                if len(entries) < 7:
                    continue  # too short to hold all seven fields; skip instead of raising IndexError

                # One row per log line, even when several tokens match a class name.
                eval_lines.append(
                    {
                        "class": entries[0],
                        "images": entries[1],
                        "targets": entries[2],
                        "precision": entries[3],
                        "recall": entries[4],
                        "map50": entries[5],
                        "map95": entries[6],
                    }
                )

        map_val = 0.0
        if len(eval_lines) > 1:
            print("There's more dicts")
            for lst in eval_lines:
                if lst["class"] == "all":
                    map_val = lst["map50"]
        elif eval_lines:
            print("There's only one dict res")
            map_val = eval_lines[0]["map50"]
        else:
            # Previously this case crashed with IndexError; report the 0.0 default instead.
            LOGGER.warning(f"No evaluation rows found in {val_log_file}; writing map50={map_val}")

        with open(eval_log_file, "a") as f:
            f.write(f"{self.ds_names[list_ind]}: {map_val}\n")
    """
    ProfileModels class for profiling different models on ONNX and TensorRT.

    This class profiles the performance of different models, returning results such as model speed and FLOPs.

    Attributes:
        paths (list): Paths of the models to profile.
        num_timed_runs (int): Number of timed runs for the profiling. Default is 100.
        num_warmup_runs (int): Number of warmup runs before profiling. Default is 10.
        min_time (float): Minimum number of seconds to profile for. Default is 60.
        imgsz (int): Image size used in the models. Default is 640.
        half (bool): Flag indicating whether to use half-precision floating point for profiling. Default is True.
        trt (bool): Flag indicating whether to use TensorRT for profiling. Default is True.
        device (torch.device): Device used for profiling. Automatically determined if None.

    Methods:
        profile(): Profiles the models and prints the result.

    Example:
        ```py
        from ultralytics.utils.benchmarks import ProfileModels

        ProfileModels(['yolov8n.yaml', 'yolov8s.yaml'], imgsz=640).profile()
        ```
    """

    def __init__(
        self,
        paths: list,
        num_timed_runs=100,
        num_warmup_runs=10,
        min_time=60,
        imgsz=640,
        half=True,
        trt=True,
        device=None,
    ):
        """
        Initialize the ProfileModels class for profiling models.

        Args:
            paths (list): List of paths of the models to be profiled.
            num_timed_runs (int, optional): Number of timed runs for the profiling. Default is 100.
            num_warmup_runs (int, optional): Number of warmup runs before the actual profiling starts. Default is 10.
            min_time (float, optional): Minimum time in seconds for profiling a model. Default is 60.
            imgsz (int, optional): Size of the image used during profiling. Default is 640.
            half (bool, optional): Flag to indicate whether to use half-precision floating point for profiling. Default is True.
            trt (bool, optional): Flag to indicate whether to profile using TensorRT. Default is True.
            device (torch.device, optional): Device used for profiling. If None, it is determined automatically.
        """
        # 初始化各个属性,用于存储传入的参数和设置默认值
        self.paths = paths
        self.num_timed_runs = num_timed_runs
        self.num_warmup_runs = num_warmup_runs
        self.min_time = min_time
        self.imgsz = imgsz
        self.half = half
        self.trt = trt  # 是否运行 TensorRT 的性能分析
        # 如果 device 为 None,则自动确定使用的设备
        self.device = device or torch.device(0 if torch.cuda.is_available() else "cpu")
    def profile(self):
        """
        Logs the benchmarking results of a model, checks metrics against floor and returns the results.
        """
        # 获取所有相关文件路径列表
        files = self.get_files()

        if not files:
            # 若没有找到匹配的 *.pt 或 *.onnx 文件,则打印消息并返回
            print("No matching *.pt or *.onnx files found.")
            return

        table_rows = []
        output = []
        for file in files:
            # 生成引擎文件名(后缀为 .engine)
            engine_file = file.with_suffix(".engine")
            if file.suffix in {".pt", ".yaml", ".yml"}:
                # 如果文件后缀是 .pt, .yaml 或 .yml,创建 YOLO 模型对象
                model = YOLO(str(file))
                model.fuse()  # 执行模型融合操作,以获取正确的参数和GFLOPs(在 model.info() 中)
                model_info = model.info()
                if self.trt and self.device.type != "cpu" and not engine_file.is_file():
                    # 如果启用 TensorRT(self.trt),且设备类型不是 CPU,并且引擎文件不存在,则导出为引擎文件
                    engine_file = model.export(
                        format="engine", half=self.half, imgsz=self.imgsz, device=self.device, verbose=False
                    )
                # 导出 ONNX 文件
                onnx_file = model.export(
                    format="onnx", half=self.half, imgsz=self.imgsz, simplify=True, device=self.device, verbose=False
                )
            elif file.suffix == ".onnx":
                # 如果文件后缀是 .onnx,获取 ONNX 模型信息
                model_info = self.get_onnx_model_info(file)
                onnx_file = file
            else:
                continue

            # 对 TensorRT 模型进行性能分析
            t_engine = self.profile_tensorrt_model(str(engine_file))
            # 对 ONNX 模型进行性能分析
            t_onnx = self.profile_onnx_model(str(onnx_file))
            # 生成表格行数据并添加到列表
            table_rows.append(self.generate_table_row(file.stem, t_onnx, t_engine, model_info))
            # 生成结果字典并添加到输出列表
            output.append(self.generate_results_dict(file.stem, t_onnx, t_engine, model_info))

        # 打印表格
        self.print_table(table_rows)
        # 返回结果输出列表
        return output

    def get_files(self):
        """
        Returns a list of paths for all relevant model files given by the user.
        """
        # 初始化文件列表
        files = []
        for path in self.paths:
            path = Path(path)
            if path.is_dir():
                # 如果路径是目录,则获取目录下所有匹配的文件路径
                extensions = ["*.pt", "*.onnx", "*.yaml"]
                files.extend([file for ext in extensions for file in glob.glob(str(path / ext))])
            elif path.suffix in {".pt", ".yaml", ".yml"}:  # add non-existing
                # 如果路径是文件且后缀符合条件,直接添加到文件列表中
                files.append(str(path))
            else:
                # 否则,获取路径下所有文件路径并添加到文件列表中
                files.extend(glob.glob(str(path)))

        # 打印正在分析的文件列表
        print(f"Profiling: {sorted(files)}")
        # 返回路径对象列表
        return [Path(file) for file in sorted(files)]

    def get_onnx_model_info(self, onnx_file: str):
        """
        Retrieves the information including number of layers, parameters, gradients and FLOPs for an ONNX model
        file.
        """
        # 暂时返回零值表示信息获取未实现
        return 0.0, 0.0, 0.0, 0.0  # return (num_layers, num_params, num_gradients, num_flops)
    def iterative_sigma_clipping(data, sigma=2, max_iters=3):
        """Applies an iterative sigma clipping algorithm to the given data."""
        # 将数据转换为 NumPy 数组
        data = np.array(data)
        # 执行最大迭代次数的循环
        for _ in range(max_iters):
            # 计算数据的平均值和标准差
            mean, std = np.mean(data), np.std(data)
            # 根据均值和标准差进行 sigma 剪切,并获取剪切后的数据
            clipped_data = data[(data > mean - sigma * std) & (data < mean + sigma * std)]
            # 如果剪切后的数据和原数据长度相同,则退出循环
            if len(clipped_data) == len(data):
                break
            # 更新数据为剪切后的数据,继续下一次迭代
            data = clipped_data
        # 返回最终剪切后的数据
        return data

    def profile_tensorrt_model(self, engine_file: str, eps: float = 1e-3):
        """Profiles the TensorRT model, measuring average run time and standard deviation among runs."""
        # 如果 TensorRT 未初始化或者引擎文件不存在,则返回默认值
        if not self.trt or not Path(engine_file).is_file():
            return 0.0, 0.0

        # 初始化模型和输入数据
        model = YOLO(engine_file)
        input_data = np.random.rand(self.imgsz, self.imgsz, 3).astype(np.float32)  # 必须是 FP32

        # 预热运行
        elapsed = 0.0
        for _ in range(3):
            start_time = time.time()
            for _ in range(self.num_warmup_runs):
                model(input_data, imgsz=self.imgsz, verbose=False)
            elapsed = time.time() - start_time

        # 计算运行次数,取最大值作为 min_time 或 num_timed_runs 的倍数
        num_runs = max(round(self.min_time / (elapsed + eps) * self.num_warmup_runs), self.num_timed_runs * 50)

        # 计时运行
        run_times = []
        for _ in TQDM(range(num_runs), desc=engine_file):
            results = model(input_data, imgsz=self.imgsz, verbose=False)
            # 提取推理速度并转换为毫秒
            run_times.append(results[0].speed["inference"])

        # 对运行时间进行 sigma 剪切
        run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=3)
        # 返回运行时间的平均值和标准差
        return np.mean(run_times), np.std(run_times)
    def profile_onnx_model(self, onnx_file: str, eps: float = 1e-3):
        """
        Time repeated CPU inference of an ONNX model with ONNX Runtime.

        Args:
            onnx_file (str): Path to the ONNX model.
            eps (float): Small constant added to the warmup time to avoid division by zero.

        Returns:
            (tuple): (mean, std) of the sigma-clipped run times in milliseconds.

        Raises:
            ValueError: If the model's input type is not one of the handled float/int types.
        """
        check_requirements("onnxruntime")
        import onnxruntime as ort

        # Session with all graph optimizations enabled and 8 intra-op threads, CPU provider only.
        options = ort.SessionOptions()
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        options.intra_op_num_threads = 8
        sess = ort.InferenceSession(onnx_file, options, providers=["CPUExecutionProvider"])

        input_tensor = sess.get_inputs()[0]
        input_type = input_tensor.type
        # Any dimension that is not a non-negative int marks the input shape as dynamic.
        dynamic = not all(isinstance(dim, int) and dim >= 0 for dim in input_tensor.shape)
        if dynamic:
            input_shape = (1, 3, self.imgsz, self.imgsz)
        else:
            input_shape = input_tensor.shape

        # Ordered lookup: "float16" must be tested before "float", which is a substring of it.
        dtype_by_marker = (
            ("float16", np.float16),
            ("float", np.float32),
            ("double", np.float64),
            ("int64", np.int64),
            ("int32", np.int32),
        )
        input_dtype = next((dtype for marker, dtype in dtype_by_marker if marker in input_type), None)
        if input_dtype is None:
            raise ValueError(f"Unsupported ONNX datatype {input_type}")

        input_data = np.random.rand(*input_shape).astype(input_dtype)
        feed = {input_tensor.name: input_data}
        fetch = [sess.get_outputs()[0].name]

        # Three warmup rounds; only the duration of the last round is kept.
        elapsed = 0.0
        for _ in range(3):
            round_start = time.time()
            for _ in range(self.num_warmup_runs):
                sess.run(fetch, feed)
            elapsed = time.time() - round_start

        # Enough runs to fill min_time at the warmup rate, but never fewer than num_timed_runs.
        runs_for_min_time = round(self.min_time / (elapsed + eps) * self.num_warmup_runs)
        num_runs = max(runs_for_min_time, self.num_timed_runs)

        run_times = []
        for _ in TQDM(range(num_runs), desc=onnx_file):
            tic = time.time()
            sess.run(fetch, feed)
            run_times.append((time.time() - tic) * 1000)  # seconds -> milliseconds

        run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=5)
        return np.mean(run_times), np.std(run_times)
    # 生成包含模型性能和指标详情的表格行的格式化字符串
    def generate_table_row(self, model_name, t_onnx, t_engine, model_info):
        """Generates a formatted string for a table row that includes model performance and metric details."""
        layers, params, gradients, flops = model_info
        return (
            f"| {model_name:18s} | {self.imgsz} | - | {t_onnx[0]:.2f} ± {t_onnx[1]:.2f} ms | {t_engine[0]:.2f} ± "
            f"{t_engine[1]:.2f} ms | {params / 1e6:.1f} | {flops:.1f} |"
        )

    @staticmethod
    # 生成包含模型名称、参数、GFLOPS和速度指标的字典
    def generate_results_dict(model_name, t_onnx, t_engine, model_info):
        """Generates a dictionary of model details including name, parameters, GFLOPS and speed metrics."""
        layers, params, gradients, flops = model_info
        return {
            "model/name": model_name,
            "model/parameters": params,
            "model/GFLOPs": round(flops, 3),
            "model/speed_ONNX(ms)": round(t_onnx[0], 3),
            "model/speed_TensorRT(ms)": round(t_engine[0], 3),
        }

    @staticmethod
    # 格式化并打印包含不同模型统计和性能数据的比较表格
    def print_table(table_rows):
        """Formats and prints a comparison table for different models with given statistics and performance data."""
        gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "GPU"
        header = (
            f"| Model | size<br><sup>(pixels) | mAP<sup>val<br>50-95 | Speed<br><sup>CPU ONNX<br>(ms) | "
            f"Speed<br><sup>{gpu} TensorRT<br>(ms) | params<br><sup>(M) | FLOPs<br><sup>(B) |"
        )
        separator = (
            "|-------------|---------------------|--------------------|------------------------------|"
            "-----------------------------------|------------------|-----------------|"
        )

        # 打印表格的标题和分隔线
        print(f"\n\n{header}")
        print(separator)
        # 打印每行表格内容
        for row in table_rows:
            print(row)

.\yolov8\ultralytics\utils\callbacks\base.py

代码语言:python
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""Base callbacks."""

from collections import defaultdict
from copy import deepcopy

# Trainer callbacks ----------------------------------------------------------------------------------------------------

# 在训练器开始执行预训练流程前调用
def on_pretrain_routine_start(trainer):
    pass

# 在预训练流程结束后调用
def on_pretrain_routine_end(trainer):
    pass

# 在训练开始时调用
def on_train_start(trainer):
    pass

# 在每个训练 epoch 开始时调用
def on_train_epoch_start(trainer):
    pass

# 在每个训练 batch 开始时调用
def on_train_batch_start(trainer):
    pass

# 当优化器执行一步优化时调用
def optimizer_step(trainer):
    pass

# 在每个训练 batch 结束时调用
def on_train_batch_end(trainer):
    pass

# 在每个训练 epoch 结束时调用
def on_train_epoch_end(trainer):
    pass

# 在每个 fit epoch 结束时调用(包括训练和验证)
def on_fit_epoch_end(trainer):
    pass

# 当模型保存时调用
def on_model_save(trainer):
    pass

# 在训练结束时调用
def on_train_end(trainer):
    pass

# 当模型参数更新时调用
def on_params_update(trainer):
    pass

# 在训练过程拆除时调用
def teardown(trainer):
    pass

# Validator callbacks --------------------------------------------------------------------------------------------------

def on_val_start(validator):
    """Called when the validation starts."""


def on_val_batch_start(validator):
    """Called at the start of each validation batch."""


def on_val_batch_end(validator):
    """Called at the end of each validation batch."""


def on_val_end(validator):
    """Called when the validation ends."""

# Predictor callbacks --------------------------------------------------------------------------------------------------

def on_predict_start(predictor):
    """Called when the prediction starts."""


def on_predict_batch_start(predictor):
    """Called at the start of each prediction batch."""


def on_predict_batch_end(predictor):
    """Called at the end of each prediction batch."""


def on_predict_postprocess_end(predictor):
    """Called after the post-processing of the prediction ends."""


def on_predict_end(predictor):
    """Called when the prediction ends."""

# Exporter callbacks ---------------------------------------------------------------------------------------------------

def on_export_start(exporter):
    """Called when the model export starts."""


def on_export_end(exporter):
    """Called when the model export ends."""
# Registry of the built-in no-op callbacks: each event name maps to a one-element list holding its function.
default_callbacks = {
    event: [callback]
    for event, callback in (
        # Events run by the trainer
        ("on_pretrain_routine_start", on_pretrain_routine_start),
        ("on_pretrain_routine_end", on_pretrain_routine_end),
        ("on_train_start", on_train_start),
        ("on_train_epoch_start", on_train_epoch_start),
        ("on_train_batch_start", on_train_batch_start),
        ("optimizer_step", optimizer_step),
        ("on_before_zero_grad", on_before_zero_grad),
        ("on_train_batch_end", on_train_batch_end),
        ("on_train_epoch_end", on_train_epoch_end),
        ("on_fit_epoch_end", on_fit_epoch_end),  # fit = train + val
        ("on_model_save", on_model_save),
        ("on_train_end", on_train_end),
        ("on_params_update", on_params_update),
        ("teardown", teardown),
        # Events run by the validator
        ("on_val_start", on_val_start),
        ("on_val_batch_start", on_val_batch_start),
        ("on_val_batch_end", on_val_batch_end),
        ("on_val_end", on_val_end),
        # Events run by the predictor
        ("on_predict_start", on_predict_start),
        ("on_predict_batch_start", on_predict_batch_start),
        ("on_predict_postprocess_end", on_predict_postprocess_end),
        ("on_predict_batch_end", on_predict_batch_end),
        ("on_predict_end", on_predict_end),
        # Events run by the exporter
        ("on_export_start", on_export_start),
        ("on_export_end", on_export_end),
    )
}


def get_default_callbacks():
    """
    Return an independent copy of `default_callbacks` whose missing keys default to an empty list.

    Returns:
        (defaultdict): A defaultdict(list) pre-filled with a deep copy of `default_callbacks`.
    """
    # Deep copy so callers can append to the lists without touching the module-level registry.
    fresh_registry = deepcopy(default_callbacks)
    return defaultdict(list, fresh_registry)


def add_integration_callbacks(instance):
    """
    Append the integration callbacks to an instance's callback lists.

    The HUB callbacks are always added. When the instance's class name contains "Trainer", the ClearML, Comet,
    DVC, MLflow, Neptune, Ray Tune, TensorBoard and W&B callbacks are added as well. A callback that is already
    present in the target list is not added again.

    Args:
        instance (Trainer, Predictor, Validator, Exporter): An object with a 'callbacks' attribute that is a
            dictionary of callback lists.
    """
    from .hub import callbacks as hub_cb

    integrations = [hub_cb]

    is_trainer = "Trainer" in instance.__class__.__name__
    if is_trainer:
        from .clearml import callbacks as clear_cb
        from .comet import callbacks as comet_cb
        from .dvc import callbacks as dvc_cb
        from .mlflow import callbacks as mlflow_cb
        from .neptune import callbacks as neptune_cb
        from .raytune import callbacks as tune_cb
        from .tensorboard import callbacks as tb_cb
        from .wb import callbacks as wb_cb

        integrations += [clear_cb, comet_cb, dvc_cb, mlflow_cb, neptune_cb, tune_cb, tb_cb, wb_cb]

    # Register each integration's handlers, skipping any that are already registered.
    for integration in integrations:
        for event, handler in integration.items():
            if handler in instance.callbacks[event]:
                continue
            instance.callbacks[event].append(handler)

.\yolov8\ultralytics\utils\callbacks\clearml.py

代码语言:python
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license

# Logger, settings mapping, and the flag that tells whether tests are running
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING

# Import ClearML only when every precondition below holds; otherwise `clearml` is set to None.
try:
    # Do not log while tests are running
    assert not TESTS_RUNNING
    # The ClearML integration must be explicitly enabled in the settings
    assert SETTINGS["clearml"] is True
    import clearml
    from clearml import Task

    # Reject an imported object without version information (i.e. not the real package)
    assert hasattr(clearml, "__version__")

except (ImportError, AssertionError):
    # Marks the integration as unavailable; presumably checked where the callbacks are registered
    # (not visible in this part of the file). Note that `Task` stays undefined in this case.
    clearml = None


def _log_debug_samples(files, title="Debug Samples") -> None:
    """
    Log files (images) as debug samples in the ClearML task.

    Does nothing when there is no current ClearML task. Files that do not exist are skipped. When a file name
    contains ``_batch<N>``, ``N`` is used as the iteration and that part is removed from the series name;
    otherwise the iteration is 0 and the full file name is used as the series.

    Args:
        files (list): A list of file paths in PosixPath format.
        title (str): A title that groups together images with the same values.
    """
    import re

    if task := Task.current_task():
        for f in files:
            if f.exists():
                it = re.search(r"_batch(\d+)", f.name)
                iteration = int(it.groups()[0]) if it else 0
                # Guard the no-match case: calling it.group() on None raised AttributeError before.
                series = f.name.replace(it.group(), "") if it else f.name
                task.get_logger().report_image(
                    title=title, series=series, local_path=str(f), iteration=iteration
                )


def _log_plot(title, plot_path) -> None:
    """
    Log an image as a plot in the plot section of ClearML.

    The image is drawn on a borderless, tick-free axes that fills the whole figure, and the figure is reported
    to the current task as a non-interactive Matplotlib figure.

    Args:
        title (str): The title of the plot.
        plot_path (str): The path to the saved image file.
    """
    import matplotlib.image as mpimg
    import matplotlib.pyplot as plt

    image = mpimg.imread(plot_path)

    figure = plt.figure()
    # Axes spanning the full figure, with no frame and no ticks
    axes = figure.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[])
    axes.imshow(image)

    clearml_logger = Task.current_task().get_logger()
    clearml_logger.report_matplotlib_figure(title=title, series="", figure=figure, report_interactive=False)


def on_pretrain_routine_start(trainer):
    """
    Attach the run to a ClearML task at the start of the pretraining routine.

    An existing current task is reused after ClearML's automatic PyTorch and Matplotlib bindings are detached;
    otherwise a new task is initialised with those bindings disabled. The trainer arguments are then connected
    to the task under the name "General". Any exception is logged as a warning instead of being raised.
    """
    try:
        task = Task.current_task()
        if not task:
            # No task yet: create one, with automatic framework logging switched off
            task = Task.init(
                project_name=trainer.args.project or "YOLOv8",
                task_name=trainer.args.name,
                tags=["YOLOv8"],
                output_uri=True,
                reuse_last_task_id=False,
                auto_connect_frameworks={"pytorch": False, "matplotlib": False},
            )
            LOGGER.warning(
                "ClearML Initialized a new task. If you want to run remotely, "
                "please add clearml-init and connect your arguments before initializing YOLO."
            )
        else:
            # WARNING: make sure the automatic pytorch and matplotlib bindings are disabled!
            # Plots and model files are logged manually by this integration.
            from clearml.binding.frameworks.pytorch_bind import PatchPyTorchModelIO
            from clearml.binding.matplotlib_bind import PatchedMatplotlib

            PatchPyTorchModelIO.update_current_task(None)
            PatchedMatplotlib.update_current_task(None)

        task.connect(vars(trainer.args), name="General")
    except Exception as e:
        LOGGER.warning(f"WARNING ⚠️ ClearML installed but not initialized correctly, not logging this run. {e}")
def on_train_epoch_end(trainer):
    """
    Report training losses and learning rates to ClearML after a training epoch.

    When `trainer.epoch == 1` the `train_batch*.jpg` images in the save directory are also logged as
    "Mosaic" debug samples. Nothing happens when there is no current ClearML task.
    """
    task = Task.current_task()
    if not task:
        return
    if trainer.epoch == 1:
        mosaic_images = sorted(trainer.save_dir.glob("train_batch*.jpg"))
        _log_debug_samples(mosaic_images, "Mosaic")
    # Training loss items, one scalar series per item
    for loss_name, loss_value in trainer.label_loss_items(trainer.tloss, prefix="train").items():
        task.get_logger().report_scalar("train", loss_name, loss_value, iteration=trainer.epoch)
    # Current learning rates
    for lr_name, lr_value in trainer.lr.items():
        task.get_logger().report_scalar("lr", lr_name, lr_value, iteration=trainer.epoch)


def on_fit_epoch_end(trainer):
    """
    Report epoch time and validation metrics to ClearML at the end of a fit epoch.

    On epoch 0 the values returned by `model_info_for_loggers(trainer)` are additionally reported as
    single values. Nothing happens when there is no current ClearML task.
    """
    task = Task.current_task()
    if not task:
        return
    task.get_logger().report_scalar(
        title="Epoch Time", series="Epoch Time", value=trainer.epoch_time, iteration=trainer.epoch
    )
    for metric_name, metric_value in trainer.metrics.items():
        task.get_logger().report_scalar("val", metric_name, metric_value, iteration=trainer.epoch)
    if trainer.epoch != 0:
        return
    # Model information is only reported once, on the first epoch
    from ultralytics.utils.torch_utils import model_info_for_loggers

    for info_name, info_value in model_info_for_loggers(trainer).items():
        task.get_logger().report_single_value(info_name, info_value)


def on_val_end(validator):
    """Log the `val*.jpg` images from the validator's save directory as "Validation" debug samples."""
    if not Task.current_task():
        return
    validation_images = sorted(validator.save_dir.glob("val*.jpg"))
    _log_debug_samples(validation_images, "Validation")


def on_train_end(trainer):
    """
    Log final plots, final metrics and the best model to ClearML when training completes.

    Only plot files that exist in `trainer.save_dir` are logged. Nothing happens when there is no
    current ClearML task.
    """
    task = Task.current_task()
    if not task:
        return
    # Result plots that may have been written to the save directory
    plot_names = (
        "results.png",
        "confusion_matrix.png",
        "confusion_matrix_normalized.png",
        "F1_curve.png",
        "PR_curve.png",
        "P_curve.png",
        "R_curve.png",
    )
    existing_plots = [path for path in (trainer.save_dir / name for name in plot_names) if path.exists()]
    for plot_file in existing_plots:
        _log_plot(title=plot_file.stem, plot_path=plot_file)
    # Final metrics
    for metric_name, metric_value in trainer.validator.metrics.results_dict.items():
        task.get_logger().report_single_value(metric_name, metric_value)
    # Final model
    task.update_output_model(model_path=str(trainer.best), model_name=trainer.args.name, auto_delete_file=False)


# Expose the ClearML callbacks only when `clearml` is truthy; otherwise register nothing
if clearml:
    callbacks = {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_val_end": on_val_end,
        "on_train_end": on_train_end,
    }
else:
    callbacks = {}

.\yolov8\ultralytics\utils\callbacks\comet.py

代码语言:python
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license

# 导入必要的模块和变量
from ultralytics.utils import LOGGER, RANK, SETTINGS, TESTS_RUNNING, ops

try:
    # Do not log while pytest tests are running
    assert not TESTS_RUNNING  
    # Verify the Comet integration is enabled in the settings
    assert SETTINGS["comet"] is True  

    # Import comet_ml and check that it exposes a version attribute
    import comet_ml
    assert hasattr(comet_ml, "__version__")  

    import os
    from pathlib import Path

    # Tasks for which image-prediction logging is supported
    COMET_SUPPORTED_TASKS = ["detect"]

    # Names of the plots created by YOLOv8 that are logged to Comet
    EVALUATION_PLOT_NAMES = "F1_curve", "P_curve", "R_curve", "PR_curve", "confusion_matrix"
    LABEL_PLOT_NAMES = "labels", "labels_correlogram"

    # Running count of images logged with predictions
    _comet_image_prediction_count = 0

except (ImportError, AssertionError):
    # Import or one of the checks failed: mark Comet as unavailable
    comet_ml = None


def _get_comet_mode():
    """Return the value of the COMET_MODE environment variable, or 'online' when it is not set."""
    mode = os.environ.get("COMET_MODE", "online")
    return mode


def _get_comet_model_name():
    """Return the value of the COMET_MODEL_NAME environment variable, or 'YOLOv8' when it is not set."""
    model_name = os.environ.get("COMET_MODEL_NAME", "YOLOv8")
    return model_name


def _get_eval_batch_logging_interval():
    """Return COMET_EVAL_BATCH_LOGGING_INTERVAL from the environment as an int, defaulting to 1."""
    raw_interval = os.environ.get("COMET_EVAL_BATCH_LOGGING_INTERVAL", 1)
    return int(raw_interval)


def _get_max_image_predictions_to_log():
    """Return COMET_MAX_IMAGE_PREDICTIONS from the environment as an int, defaulting to 100."""
    raw_limit = os.environ.get("COMET_MAX_IMAGE_PREDICTIONS", 100)
    return int(raw_limit)


def _scale_confidence_score(score):
    """Multiply `score` by the factor in COMET_MAX_CONFIDENCE_SCORE (read as a float, default 100.0)."""
    return score * float(os.environ.get("COMET_MAX_CONFIDENCE_SCORE", 100.0))


def _should_log_confusion_matrix():
    """Return True when COMET_EVAL_LOG_CONFUSION_MATRIX equals 'true' (case-insensitive); default is 'false'."""
    flag = os.environ.get("COMET_EVAL_LOG_CONFUSION_MATRIX", "false")
    return flag.lower() == "true"


def _should_log_image_predictions():
    """Return True when COMET_EVAL_LOG_IMAGE_PREDICTIONS equals 'true' (case-insensitive); default is 'true'."""
    flag = os.environ.get("COMET_EVAL_LOG_IMAGE_PREDICTIONS", "true")
    return flag.lower() == "true"


def _get_experiment_type(mode, project_name):
    """Create a Comet experiment for `project_name`: an OfflineExperiment when `mode` is 'offline', else an Experiment."""
    experiment_cls = comet_ml.OfflineExperiment if mode == "offline" else comet_ml.Experiment
    return experiment_cls(project_name=project_name)


def _create_experiment(args):
    """
    Create a Comet experiment and log the run configuration to it.

    The experiment is only created when RANK is -1 or 0, so that distributed training creates it in a single
    process. Any exception raised during creation or logging is caught and reported as a warning.

    Args:
        args: Training arguments; `args.project` is the fallback project name and `vars(args)` is logged
            as the experiment parameters.
    """
    if RANK not in (-1, 0):
        return
    try:
        mode = _get_comet_mode()
        # COMET_PROJECT_NAME takes precedence over the project given in the arguments
        project = os.getenv("COMET_PROJECT_NAME", args.project)
        experiment = _get_experiment_type(mode, project)
        experiment.log_parameters(vars(args))
        # Record the Comet-specific logging configuration alongside the parameters
        comet_settings = {
            "eval_batch_logging_interval": _get_eval_batch_logging_interval(),
            "log_confusion_matrix_on_eval": _should_log_confusion_matrix(),
            "log_image_predictions": _should_log_image_predictions(),
            "max_image_predictions": _get_max_image_predictions_to_log(),
        }
        experiment.log_others(comet_settings)
        experiment.log_other("Created from", "yolov8")

    except Exception as e:
        LOGGER.warning(f"WARNING ⚠️ Comet installed but not initialized correctly, not logging this run. {e}")
# 返回训练器的元数据,包括当前轮次和资产保存状态
def _fetch_trainer_metadata(trainer):
    """
    Collect the epoch/step bookkeeping used by the Comet callbacks.

    Args:
        trainer: Trainer exposing `epoch`, `epochs`, `batch_size`, `train_loader.dataset`, and
            `args.save` / `args.save_period`.

    Returns:
        (dict): `curr_epoch` (`trainer.epoch + 1`), `curr_step` (`curr_epoch` times the number of full batches
            per epoch), `save_assets` (truthy when assets should be logged for this epoch) and `final_epoch`
            (whether `curr_epoch` equals `trainer.epochs`).
    """
    # trainer.epoch counts from 0, so add 1 for a 1-based epoch number
    curr_epoch = trainer.epoch + 1

    train_num_steps_per_epoch = len(trainer.train_loader.dataset) // trainer.batch_size
    curr_step = curr_epoch * train_num_steps_per_epoch
    final_epoch = curr_epoch == trainer.epochs

    save = trainer.args.save
    save_period = trainer.args.save_period
    # Test `save_period > 0` before taking the modulo so that a period of 0 simply disables periodic
    # asset logging instead of raising ZeroDivisionError.
    save_assets = save and save_period > 0 and curr_epoch % save_period == 0 and not final_epoch

    return dict(curr_epoch=curr_epoch, curr_step=curr_step, save_assets=save_assets, final_epoch=final_epoch)


# 将边界框缩放到原始图像形状的比例
def _scale_bounding_box_to_original_image_shape(box, resized_image_shape, original_image_shape, ratio_pad):
    """
    Rescale a bounding box label from the resized image back to the original image shape.

    YOLOv8 resizes images during training and the label values are normalized against the resized shape.
    The box is converted with `ops.xywhn2xyxy`, rescaled with `ops.scale_boxes`, converted back with
    `ops.xyxy2xywh`, and finally its first two values are shifted by half of the last two so that they
    refer to the top-left corner rather than the center.

    Returns:
        (list): The rescaled box as a plain list.
    """
    resized_h, resized_w = resized_image_shape

    # Normalized xywh -> xyxy in the resized image
    xyxy_resized = ops.xywhn2xyxy(box, h=resized_h, w=resized_w)
    # Resized image scale -> original image scale
    xyxy_original = ops.scale_boxes(resized_image_shape, xyxy_resized, original_image_shape, ratio_pad)
    # Back to xywh for Comet logging
    xywh = ops.xyxy2xywh(xyxy_original)
    # Move the xy center to the top-left corner
    xywh[:2] -= xywh[2:] / 2

    return xywh.tolist()


# 为检测格式化真实标注注释
def _format_ground_truth_annotations_for_detection(img_idx, image_path, batch, class_name_map=None):
    """
    Build the ground-truth annotation payload for one image of a detection batch.

    Args:
        img_idx: Index of the image inside the batch.
        image_path: Path of the image, used only in the debug message.
        batch: Batch dictionary providing "batch_idx", "bboxes", "cls", "ori_shape", "resized_shape" and "ratio_pad".
        class_name_map: Optional mapping from class label to class name.

    Returns:
        (dict | None): `{"name": "ground_truth", "data": [...]}`, or None when the image has no box labels.
    """
    # Select the rows of the batch that belong to this image
    mask = batch["batch_idx"] == img_idx
    image_boxes = batch["bboxes"][mask]
    if not len(image_boxes):
        LOGGER.debug(f"COMET WARNING: Image: {image_path} has no bounding boxes labels")
        return None

    labels = batch["cls"][mask].squeeze(1).tolist()
    if class_name_map:
        labels = [str(class_name_map[label]) for label in labels]

    original_shape = batch["ori_shape"][img_idx]
    resized_shape = batch["resized_shape"][img_idx]
    pad_info = batch["ratio_pad"][img_idx]

    # One entry per box; ground truth is logged with a score of 1.0 passed through the confidence scaling
    entries = [
        {
            "boxes": [_scale_bounding_box_to_original_image_shape(raw_box, resized_shape, original_shape, pad_info)],
            "label": f"gt_{label}",
            "score": _scale_confidence_score(1.0),
        }
        for raw_box, label in zip(image_boxes, labels)
    ]

    return {"name": "ground_truth", "data": entries}


# 为检测格式化YOLO预测注释
def _format_prediction_annotations_for_detection(image_path, metadata, class_label_map=None):
    """
    Build the prediction annotation payload for one image.

    Args:
        image_path: Path object of the image; its stem (converted to int when numeric) is the lookup key.
        metadata: Mapping from image id to a list of prediction dicts with "bbox", "score" and "category_id".
        class_label_map: Optional mapping from category id to label.

    Returns:
        (dict | None): `{"name": "prediction", "data": [...]}`, or None when the image has no predictions.
    """
    stem = image_path.stem
    image_id = int(stem) if stem.isnumeric() else stem

    image_predictions = metadata.get(image_id)
    if not image_predictions:
        LOGGER.debug(f"COMET WARNING: Image: {image_path} has no bounding boxes predictions")
        return None

    entries = []
    for item in image_predictions:
        bbox = item["bbox"]
        scaled_score = _scale_confidence_score(item["score"])
        label = item["category_id"]
        # Translate the category id into its label when a mapping is available
        if class_label_map:
            label = str(class_label_map[label])
        entries.append({"boxes": [bbox], "label": label, "score": scaled_score})

    return {"name": "prediction", "data": entries}
# 将图像索引、图像路径、批次、预测元数据映射和类标签映射格式化为检测任务的地面真实注释
def _fetch_annotations(img_idx, image_path, batch, prediction_metadata_map, class_label_map):
    """
    Gather the ground-truth and prediction annotations for one image.

    Returns:
        (list | None): A single-element list wrapping the list of available annotation payloads, or None when
            neither ground truth nor predictions are available.
    """
    ground_truth = _format_ground_truth_annotations_for_detection(img_idx, image_path, batch, class_label_map)
    predictions = _format_prediction_annotations_for_detection(image_path, prediction_metadata_map, class_label_map)

    # Keep only the payloads that were actually produced
    available = [payload for payload in (ground_truth, predictions) if payload is not None]
    if not available:
        return None
    return [available]


# 创建基于图像 ID 分组的模型预测元数据映射
def _create_prediction_metadata_map(model_predictions):
    """Group prediction dicts by their "image_id" value, preserving the order in which they appear."""
    grouped = {}
    for item in model_predictions:
        grouped.setdefault(item["image_id"], []).append(item)
    return grouped


# 将混淆矩阵记录到 Comet 实验中
def _log_confusion_matrix(experiment, trainer, curr_step, curr_epoch):
    """Log the validator's confusion matrix to the Comet experiment, labelled with the class names plus "background"."""
    matrix = trainer.validator.confusion_matrix.matrix
    labels = [*trainer.data["names"].values(), "background"]
    experiment.log_confusion_matrix(
        matrix=matrix,
        labels=labels,
        max_categories=len(labels),
        epoch=curr_epoch,
        step=curr_step,
    )


# 记录图像到 Comet 实验中,可以选择包含注释
def _log_images(experiment, image_paths, curr_step, annotations=None):
    """
    Log images to the Comet experiment, named after each path's stem.

    When `annotations` is truthy, images and annotations are paired in order and each image is logged with
    its annotation; otherwise every image is logged without annotations.
    """
    if not annotations:
        for path in image_paths:
            experiment.log_image(path, name=path.stem, step=curr_step)
        return

    for path, image_annotations in zip(image_paths, annotations):
        experiment.log_image(path, name=path.stem, step=curr_step, annotations=image_annotations)


# 在训练期间记录单个图像的预测框到 Comet 实验中
def _log_image_predictions(experiment, validator, curr_step):
    """
    Log validation images with their ground-truth and predicted boxes to the Comet experiment.

    Nothing is logged when the validator's task is not in COMET_SUPPORTED_TASKS or when `validator.jdict`
    is empty. Only every N-th batch is processed, where N is the evaluation batch logging interval, and
    logging stops once the module-level image counter reaches the configured maximum.
    """
    global _comet_image_prediction_count

    if validator.args.task not in COMET_SUPPORTED_TASKS:
        return

    jdict = validator.jdict
    if not jdict:
        return

    predictions_by_image = _create_prediction_metadata_map(jdict)
    loader = validator.dataloader
    label_map = validator.names

    interval = _get_eval_batch_logging_interval()
    image_budget = _get_max_image_predictions_to_log()

    for batch_number, batch in enumerate(loader, start=1):
        # Skip batches that do not fall on the logging interval
        if batch_number % interval:
            continue

        for img_idx, raw_path in enumerate(batch["im_file"]):
            # Stop as soon as the image budget is used up
            if _comet_image_prediction_count >= image_budget:
                return

            image_path = Path(raw_path)
            image_annotations = _fetch_annotations(img_idx, image_path, batch, predictions_by_image, label_map)
            _log_images(experiment, [image_path], curr_step, annotations=image_annotations)
            _comet_image_prediction_count += 1
# 在实验和训练器上记录评估图和标签图的函数
def _log_plots(experiment, trainer):
    """Log the evaluation plots (.png) and then the label plots (.jpg) from `trainer.save_dir` to Comet."""
    for plot_names, extension in ((EVALUATION_PLOT_NAMES, "png"), (LABEL_PLOT_NAMES, "jpg")):
        plot_files = [trainer.save_dir / f"{plot_name}.{extension}" for plot_name in plot_names]
        _log_images(experiment, plot_files, None)


# 记录最佳训练模型到Comet.ml的函数
def _log_model(experiment, trainer):
    """Log the file at `trainer.best` to Comet as "best.pt" under the configured model name, overwriting any existing one."""
    experiment.log_model(
        _get_comet_model_name(),
        file_or_folder=str(trainer.best),
        file_name="best.pt",
        overwrite=True,
    )


# 在YOLO预训练过程开始时创建或恢复CometML实验的函数
def on_pretrain_routine_start(trainer):
    """Create a Comet experiment at the start of the pretraining routine unless a live global one already exists."""
    experiment = comet_ml.get_global_experiment()
    # A missing `alive` attribute is treated as "not alive"
    is_alive = getattr(experiment, "alive", False)
    if experiment and is_alive:
        return
    _create_experiment(trainer.args)


# 在每个训练周期结束时记录指标和批次图像的函数
def on_train_epoch_end(trainer):
    """
    Log training loss metrics to Comet at the end of a training epoch.

    On the first epoch (`curr_epoch == 1`) the `train_batch*.jpg` images in the save directory are logged
    as well. Nothing happens when there is no global Comet experiment.
    """
    experiment = comet_ml.get_global_experiment()
    if not experiment:
        return

    metadata = _fetch_trainer_metadata(trainer)
    epoch_number = metadata["curr_epoch"]
    step_number = metadata["curr_step"]

    train_losses = trainer.label_loss_items(trainer.tloss, prefix="train")
    experiment.log_metrics(train_losses, step=step_number, epoch=epoch_number)

    if epoch_number == 1:
        _log_images(experiment, trainer.save_dir.glob("train_batch*.jpg"), step_number)


# 在每个训练周期完成时记录模型资产的函数
def on_fit_epoch_end(trainer):
    """
    Log metrics, learning rates and, when due, model assets to Comet at the end of a fit epoch.

    Model information is logged on the first epoch. The model, confusion matrix and image predictions are only
    logged when the trainer metadata reports `save_assets`; the latter two are additionally controlled by
    their environment-variable switches. Nothing happens when there is no global Comet experiment.
    """
    experiment = comet_ml.get_global_experiment()
    if not experiment:
        return

    metadata = _fetch_trainer_metadata(trainer)
    epoch_number = metadata["curr_epoch"]
    step_number = metadata["curr_step"]
    should_save_assets = metadata["save_assets"]

    experiment.log_metrics(trainer.metrics, step=step_number, epoch=epoch_number)
    experiment.log_metrics(trainer.lr, step=step_number, epoch=epoch_number)

    if epoch_number == 1:
        from ultralytics.utils.torch_utils import model_info_for_loggers

        experiment.log_metrics(model_info_for_loggers(trainer), step=step_number, epoch=epoch_number)

    if should_save_assets:
        _log_model(experiment, trainer)

        if _should_log_confusion_matrix():
            _log_confusion_matrix(experiment, trainer, step_number, epoch_number)

        if _should_log_image_predictions():
            _log_image_predictions(experiment, trainer.validator, step_number)


# 在训练结束时执行的操作的函数
def on_train_end(trainer):
    """
    Log the final model, plots, confusion matrix and image predictions to Comet, then end the experiment.

    Plots are only logged when `trainer.args.plots` is truthy. After the experiment is ended, the module-level
    image-prediction counter is reset to 0. Nothing happens when there is no global Comet experiment.
    """
    global _comet_image_prediction_count

    experiment = comet_ml.get_global_experiment()
    if not experiment:
        return

    metadata = _fetch_trainer_metadata(trainer)
    epoch_number = metadata["curr_epoch"]
    step_number = metadata["curr_step"]
    want_plots = trainer.args.plots

    _log_model(experiment, trainer)
    if want_plots:
        _log_plots(experiment, trainer)

    _log_confusion_matrix(experiment, trainer, step_number, epoch_number)
    _log_image_predictions(experiment, trainer.validator, step_number)
    experiment.end()

    # Start the next run with a fresh image-prediction count
    _comet_image_prediction_count = 0
# 如果 comet_ml 变量为真,则定义一个包含多个回调函数的字典,否则定义一个空字典
# Expose the Comet callbacks only when `comet_ml` is truthy; otherwise register nothing
if comet_ml:
    callbacks = {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }
else:
    callbacks = {}

.\yolov8\ultralytics\utils\callbacks\dvc.py

代码语言:python
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license

# 导入必要的模块和变量
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING, checks

try:
    # Do not log while pytest tests are running
    assert not TESTS_RUNNING
    # Verify the DVC integration is enabled in the settings
    assert SETTINGS["dvc"] is True
    # Import dvclive
    import dvclive

    # Check that the installed dvclive version satisfies the "2.11.0" requirement
    assert checks.check_version("dvclive", "2.11.0", verbose=True)

    import os
    import re
    from pathlib import Path

    # DVCLive logger instance (created in on_pretrain_routine_start)
    live = None
    # Plots already logged, mapped to the timestamp they were logged with
    _processed_plots = {}

    # `on_fit_epoch_end` is also called on final validation (probably to be fixed); for now this flag is how
    # the final evaluation of the best model is told apart from the last epoch's validation
    _training_epoch = False

except (ImportError, AssertionError, TypeError):
    # Import or one of the checks failed: mark DVCLive as unavailable
    dvclive = None


def _log_images(path, prefix=""):
    """
    Log the image at `path` with DVCLive under an optional `prefix`.

    File names containing `_batch<N>` are regrouped as `<stem with "_batch">/<N><suffix>` so that images of the
    same kind can be browsed by batch with a slider in the UI. Nothing happens when the DVCLive logger
    is not active.
    """
    if not live:
        return

    name = path.name
    batch_match = re.search(r"_batch(\d+)", name)
    if batch_match:
        batch_number = batch_match[1]
        grouped_stem = re.sub(r"_batch(\d+)", "_batch", path.stem)
        name = (Path(grouped_stem) / batch_number).with_suffix(path.suffix)

    live.log_image(os.path.join(prefix, name), path)


def _log_plots(plots, prefix=""):
    """Log each plot in `plots` whose "timestamp" differs from the one recorded the last time it was logged."""
    for plot_path, plot_params in plots.items():
        stamp = plot_params["timestamp"]
        if _processed_plots.get(plot_path) == stamp:
            # Already logged with this timestamp
            continue
        _log_images(plot_path, prefix)
        _processed_plots[plot_path] = stamp


def _log_confusion_matrix(validator):
    """
    Log the validator's confusion matrix with DVCLive as a normalized sklearn "confusion_matrix" plot.

    The count matrix is expanded into two parallel lists of target and predicted class names, with each pair
    repeated as many times as its cell count. For the "detect" task a "background" class is appended to
    the names.
    """
    targets, preds = [], []
    matrix = validator.confusion_matrix.matrix
    names = list(validator.names.values())
    if validator.confusion_matrix.task == "detect":
        names.append("background")

    # Iterate the transposed matrix: the outer index selects the target name, the inner one the predicted name
    for target_idx, counts in enumerate(matrix.T.astype(int)):
        for pred_idx, count in enumerate(counts):
            targets.extend([names[target_idx]] * count)
            preds.extend([names[pred_idx]] * count)

    live.log_sklearn_plot("confusion_matrix", targets, preds, name="cf.json", normalized=True)


def on_pretrain_routine_start(trainer):
    """Create the module-level DVCLive logger at the start of the pretraining routine; failures are logged as warnings."""
    global live

    try:
        live = dvclive.Live(save_dvc_exp=True, cache_images=True)
        LOGGER.info("DVCLive is detected and auto logging is enabled (run 'yolo settings dvc=False' to disable).")
    except Exception as e:
        LOGGER.warning(f"WARNING ⚠️ DVCLive installed but not initialized correctly, not logging this run. {e}")


def on_pretrain_routine_end(trainer):
    """Log the trainer's plots under the "train" prefix once the pretraining routine has finished."""
    training_plots = trainer.plots
    _log_plots(training_plots, "train")


def on_train_start(trainer):
    """Log the training arguments with DVCLive when the logger is active."""
    if not live:
        return
    live.log_params(trainer.args)


def on_train_epoch_start(trainer):
    """Set the module-level `_training_epoch` flag to True at the start of each training epoch."""
    global _training_epoch

    # Marks that a training epoch is in progress; on_fit_epoch_end only logs while this flag is set
    _training_epoch = True
def on_fit_epoch_end(trainer):
    """
    Log metrics, model info and plots with DVCLive at the end of a fit epoch, then advance to the next step.

    Only runs when the DVCLive logger is active and the `_training_epoch` flag is set; the flag is cleared
    afterwards.
    """
    global _training_epoch

    if not (live and _training_epoch):
        return

    # Merge training losses, validation metrics and learning rates (later sources win on key clashes)
    epoch_metrics = dict(trainer.label_loss_items(trainer.tloss, prefix="train"))
    epoch_metrics.update(trainer.metrics)
    epoch_metrics.update(trainer.lr)
    for name, value in epoch_metrics.items():
        live.log_metric(name, value)

    # Model information is logged once, on the first epoch, without plotting
    if trainer.epoch == 0:
        from ultralytics.utils.torch_utils import model_info_for_loggers

        for name, value in model_info_for_loggers(trainer).items():
            live.log_metric(name, value, plot=False)

    _log_plots(trainer.plots, "train")
    _log_plots(trainer.validator.plots, "val")

    live.next_step()
    _training_epoch = False


def on_train_end(trainer):
    """
    Log final metrics, plots, the confusion matrix and the best model with DVCLive, then end the session.

    Nothing happens when the DVCLive logger is not active.
    """
    if not live:
        return

    # Merge training losses, validation metrics and learning rates (later sources win on key clashes)
    final_metrics = dict(trainer.label_loss_items(trainer.tloss, prefix="train"))
    final_metrics.update(trainer.metrics)
    final_metrics.update(trainer.lr)
    for name, value in final_metrics.items():
        live.log_metric(name, value, plot=False)

    # Both the trainer's and the validator's plots are logged under the "val" prefix here
    _log_plots(trainer.plots, "val")
    _log_plots(trainer.validator.plots, "val")

    _log_confusion_matrix(trainer.validator)

    # Log the best model as an artifact when the file exists
    if trainer.best.exists():
        live.log_artifact(trainer.best, copy=True, type="model")

    live.end()


# Expose the DVCLive callbacks only when `dvclive` is truthy; otherwise register nothing
if dvclive:
    callbacks = {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_train_start": on_train_start,
        "on_train_epoch_start": on_train_epoch_start,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }
else:
    callbacks = {}

.\yolov8\ultralytics\utils\callbacks\hub.py

代码语言:python
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license

import json
from time import time

from ultralytics.hub import HUB_WEB_ROOT, PREFIX, HUBTrainingSession, events
from ultralytics.utils import LOGGER, RANK, SETTINGS


def on_pretrain_routine_start(trainer):
    """
    Create an Ultralytics HUB training session for the trainer when none exists yet.

    A session is only created when RANK is -1 or 0, the "hub" setting is exactly True, an "api_key" setting
    is present, and `trainer.hub_session` is None.
    """
    if RANK not in {-1, 0}:
        return
    if SETTINGS["hub"] is not True:
        return
    if not SETTINGS["api_key"]:
        return
    if trainer.hub_session is None:
        trainer.hub_session = HUBTrainingSession.create_session(trainer.args.model, trainer.args)


def on_pretrain_routine_end(trainer):
    """Start the "metrics" and "ckpt" timers on the trainer's HUB session; they drive upload rate limiting."""
    session = getattr(trainer, "hub_session", None)
    if not session:
        return
    # Both timers start now and are compared against session.rate_limits by the upload callbacks
    session.timers = dict(metrics=time(), ckpt=time())


def on_fit_epoch_end(trainer):
    """
    Queue this epoch's metrics for upload to Ultralytics HUB and upload them when the rate limit allows.

    The training losses and validation metrics (plus model information on epoch 0) are serialized to JSON and
    stored in the session's metrics queue under the epoch number. Previously failed uploads are merged back
    into the queue. When more time than `session.rate_limits["metrics"]` has passed since the last upload, the
    queue is uploaded, the timer restarted and the queue emptied. Nothing happens without a HUB session.
    """
    session = getattr(trainer, "hub_session", None)
    if not session:
        return

    # Metrics collected after validation for this epoch
    payload = dict(trainer.label_loss_items(trainer.tloss, prefix="train"))
    payload.update(trainer.metrics)
    if trainer.epoch == 0:
        from ultralytics.utils.torch_utils import model_info_for_loggers

        # Model information accompanies the first epoch's metrics
        payload.update(model_info_for_loggers(trainer))

    session.metrics_queue[trainer.epoch] = json.dumps(payload)

    # Retry metrics whose earlier upload failed
    failed = session.metrics_upload_failed_queue
    if failed:
        session.metrics_queue.update(failed)

    if time() - session.timers["metrics"] > session.rate_limits["metrics"]:
        session.upload_metrics()
        session.timers["metrics"] = time()  # restart the timer
        session.metrics_queue = {}  # start a fresh queue


def on_model_save(trainer):
    """
    Upload the latest checkpoint to Ultralytics HUB, subject to the checkpoint rate limit.

    The upload only happens when more time than `session.rate_limits["ckpt"]` has passed since the "ckpt" timer
    was last reset. Nothing happens without a HUB session.
    """
    session = getattr(trainer, "hub_session", None)
    if not session:
        return

    # The checkpoint counts as best when its fitness equals the best fitness seen so far
    is_best = trainer.best_fitness == trainer.fitness
    seconds_since_upload = time() - session.timers["ckpt"]
    if seconds_since_upload > session.rate_limits["ckpt"]:
        LOGGER.info(f"{PREFIX}Uploading checkpoint {HUB_WEB_ROOT}/models/{session.model.id}")
        session.upload_model(trainer.epoch, trainer.last, is_best)
        session.timers["ckpt"] = time()  # restart the timer


def on_train_end(trainer):
    """
    Upload the final model to Ultralytics HUB when training ends and mark the session as no longer alive.

    The best weights are uploaded together with the "metrics/mAP50-95(B)" metric (0 when missing) and flagged
    as final. Nothing happens without a HUB session.
    """
    session = getattr(trainer, "hub_session", None)
    if not session:
        return

    LOGGER.info(f"{PREFIX}Syncing final model...")
    final_map = trainer.metrics.get("metrics/mAP50-95(B)", 0)
    session.upload_model(trainer.epoch, trainer.best, map=final_map, final=True)
    # Stop heartbeats
    session.alive = False
    LOGGER.info(f"{PREFIX}Done ✅\n{PREFIX}View model at {session.model_url} 🚀")
# 定义在训练开始时运行的回调函数,调用 events 函数并传递 trainer 的参数
def on_train_start(trainer):
    """Call `events` with the trainer's arguments when training starts."""
    train_args = trainer.args
    events(train_args)


# 定义在验证开始时运行的回调函数,调用 events 函数并传递 validator 的参数
def on_val_start(validator):
    """Call `events` with the validator's arguments when validation starts."""
    val_args = validator.args
    events(val_args)


# 定义在预测开始时运行的回调函数,调用 events 函数并传递 predictor 的参数
def on_predict_start(predictor):
    """Call `events` with the predictor's arguments when prediction starts."""
    predict_args = predictor.args
    events(predict_args)


# 定义在导出开始时运行的回调函数,调用 events 函数并传递 exporter 的参数
def on_export_start(exporter):
    """Call `events` with the exporter's arguments when export starts."""
    export_args = exporter.args
    events(export_args)


# 根据 SETTINGS["hub"] 的值决定是否启用回调函数,如果启用则初始化一个包含不同回调函数的字典,否则为空字典
# Expose the HUB callbacks only when the "hub" setting is exactly True; otherwise register nothing
if SETTINGS["hub"] is True:  # verify enabled
    callbacks = {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_model_save": on_model_save,
        "on_train_end": on_train_end,
        "on_train_start": on_train_start,
        "on_val_start": on_val_start,
        "on_predict_start": on_predict_start,
        "on_export_start": on_export_start,
    }
else:
    callbacks = {}

.\yolov8\ultralytics\utils\callbacks\mlflow.py

代码语言:python
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
MLflow Logging for Ultralytics YOLO.

This module enables MLflow logging for Ultralytics YOLO. It logs metrics, parameters, and model artifacts.
For setting up, a tracking URI should be specified. The logging can be customized using environment variables.

Commands:
    1. To set a project name:
        `export MLFLOW_EXPERIMENT_NAME=<your_experiment_name>` or use the project=<project> argument

    2. To set a run name:
        `export MLFLOW_RUN=<your_run_name>` or use the name=<name> argument

    3. To start a local MLflow server:
        mlflow server --backend-store-uri runs/mlflow
       It will by default start a local server at http://127.0.0.1:5000.
       To specify a different URI, set the MLFLOW_TRACKING_URI environment variable.

    4. To kill all running MLflow server instances:
        ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9
"""

from ultralytics.utils import LOGGER, RUNS_DIR, SETTINGS, TESTS_RUNNING, colorstr

try:
    import os

    # Skip logging under pytest, except for tests whose id contains "test_mlflow"
    assert not TESTS_RUNNING or "test_mlflow" in os.environ.get("PYTEST_CURRENT_TEST", "")  # do not log pytest
    assert SETTINGS["mlflow"] is True  # verify integration is enabled
    import mlflow

    assert hasattr(mlflow, "__version__")  # verify package is not directory
    from pathlib import Path

    # Prefix prepended to every log message of this integration
    PREFIX = colorstr("MLflow: ")

except (ImportError, AssertionError):
    # Import or one of the checks failed: mark MLflow as unavailable
    mlflow = None


def sanitize_dict(x):
    """Return a copy of `x` with parentheses stripped from every key and every value converted to float."""
    cleaned = {}
    for key, value in x.items():
        clean_key = key.replace("(", "").replace(")", "")
        cleaned[clean_key] = float(value)
    return cleaned


def on_pretrain_routine_end(trainer):
    """
    Set up MLflow tracking and log the training parameters at the end of the pretraining routine.

    The tracking URI and experiment are configured, autologging is enabled, and then the active MLflow run is
    reused or a new one is started. The trainer arguments are logged as parameters. Failures while starting the
    run or logging the parameters are reported as a warning instead of being raised.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The training object with arguments and parameters to log.

    Global:
        mlflow: The imported mlflow module to use for logging.

    Environment Variables:
        MLFLOW_TRACKING_URI: The URI for MLflow tracking. If not set, defaults to RUNS_DIR / 'mlflow'.
        MLFLOW_EXPERIMENT_NAME: The name of the MLflow experiment. If not set, defaults to trainer.args.project,
            and to '/Shared/YOLOv8' when that is empty too.
        MLFLOW_RUN: The name of the MLflow run. If not set, defaults to trainer.args.name.
    """
    global mlflow

    tracking_uri = os.getenv("MLFLOW_TRACKING_URI") or str(RUNS_DIR / "mlflow")
    LOGGER.debug(f"{PREFIX} tracking uri: {tracking_uri}")
    mlflow.set_tracking_uri(tracking_uri)

    # Environment variables take precedence over the trainer arguments
    experiment = os.getenv("MLFLOW_EXPERIMENT_NAME") or trainer.args.project or "/Shared/YOLOv8"
    name_for_run = os.getenv("MLFLOW_RUN") or trainer.args.name
    mlflow.set_experiment(experiment)

    mlflow.autolog()

    try:
        # Reuse the run that is already active, otherwise start a new one
        run = mlflow.active_run()
        if not run:
            run = mlflow.start_run(run_name=name_for_run)
        LOGGER.info(f"{PREFIX}logging run_id({run.info.run_id}) to {tracking_uri}")

        # A local directory URI can be browsed by starting a local MLflow server on it
        if Path(tracking_uri).is_dir():
            LOGGER.info(
                f"{PREFIX}view at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri {tracking_uri}'"
            )
        LOGGER.info(f"{PREFIX}disable with 'yolo settings mlflow=False'")

        mlflow.log_params(dict(trainer.args))

    except Exception as e:
        LOGGER.warning(f"{PREFIX}WARNING ⚠️ Failed to initialize: {e}\n{PREFIX}WARNING ⚠️ Not tracking this run")
# 在每个训练周期结束时将训练指标记录到 MLflow 中
def on_train_epoch_end(trainer):
    """
    Send the learning rates and training loss items of the finished train epoch to MLflow.

    Does nothing when the module-level `mlflow` handle is falsy.

    Args:
        trainer: Training object providing `lr`, `tloss`, `epoch` and `label_loss_items()`.
    """
    if not mlflow:
        return
    # Merge both sanitized mappings; the loss entries are merged last, so they win on a key clash.
    epoch_metrics = dict(sanitize_dict(trainer.lr))
    epoch_metrics.update(sanitize_dict(trainer.label_loss_items(trainer.tloss, prefix="train")))
    mlflow.log_metrics(metrics=epoch_metrics, step=trainer.epoch)


# 在每个拟合(fit)周期结束时将训练指标记录到 MLflow 中
def on_fit_epoch_end(trainer):
    """
    Send the metrics of the finished fit epoch to MLflow.

    Does nothing when the module-level `mlflow` handle is falsy.

    Args:
        trainer: Training object providing `metrics` and `epoch`.
    """
    if not mlflow:
        return
    fit_metrics = sanitize_dict(trainer.metrics)
    mlflow.log_metrics(metrics=fit_metrics, step=trainer.epoch)


# 在训练结束时记录模型工件到 MLflow
def on_train_end(trainer):
    """
    Upload model artifacts to MLflow once training has finished, then end the run unless told otherwise.

    The run is kept open only when the MLFLOW_KEEP_RUN_ACTIVE environment variable equals "true"
    (case-insensitive). Does nothing when the module-level `mlflow` handle is falsy.

    Args:
        trainer: Training object providing `best` (path of the best checkpoint) and `save_dir`.
    """
    if not mlflow:
        return

    loggable_suffixes = {".png", ".jpg", ".csv", ".pt", ".yaml"}

    # Directory that holds the best checkpoint
    mlflow.log_artifact(str(trainer.best.parent))
    # Top-level entries of the save directory, restricted to the suffixes above
    for path in trainer.save_dir.glob("*"):
        if path.suffix not in loggable_suffixes:
            continue
        mlflow.log_artifact(str(path))

    if os.environ.get("MLFLOW_KEEP_RUN_ACTIVE", "False").lower() == "true":
        LOGGER.info(f"{PREFIX}mlflow run still alive, remember to close it using mlflow.end_run()")
    else:
        mlflow.end_run()
        LOGGER.debug(f"{PREFIX}mlflow run ended")

    # Tell the user where the results went and how to switch the integration off
    tracking_note = (
        f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n{PREFIX}disable with 'yolo settings mlflow=False'"
    )
    LOGGER.info(tracking_note)


# 如果启用了 MLflow,则配置相应的回调函数
# Callback table for this integration: stays empty unless the `mlflow` handle is truthy.
callbacks = {}
if mlflow:
    callbacks = {
        "on_pretrain_routine_end": on_pretrain_routine_end,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_train_end": on_train_end,
    }

.\yolov8\ultralytics\utils\callbacks\neptune.py

代码语言:javascript
复制
# Ultralytics YOLO 🚀, AGPL-3.0 license

# 从 ultralytics.utils 模块导入 LOGGER、SETTINGS 和 TESTS_RUNNING
from ultralytics.utils import LOGGER, SETTINGS, TESTS_RUNNING

# Enable the Neptune integration only when every check below passes; otherwise `neptune` becomes None
try:
    assert not TESTS_RUNNING  # do not log pytest runs
    # The integration must be switched on in SETTINGS (identity check: only a literal True passes)
    assert SETTINGS["neptune"] is True  
    import neptune
    from neptune.types import File

    # NOTE(review): presumably rejects an unrelated module/directory that happens to be named 'neptune' — confirm
    assert hasattr(neptune, "__version__")

    run = None  # NeptuneAI experiment logger instance (only bound when all checks above succeed)

except (ImportError, AssertionError):
    # A failed import or assertion disables the integration
    neptune = None


def _log_scalars(scalars, step=0):
    """
    Append scalar values to the NeptuneAI run, one series per key.

    Does nothing when the module-level `run` is falsy.

    Args:
        scalars (dict): Mapping of series name to value.
        step (int): Step passed along with every appended value.
    """
    if not run:
        return
    for series_name, value in scalars.items():
        run[series_name].append(value=value, step=step)


def _log_images(imgs_dict, group=""):
    """
    Upload images to the NeptuneAI run under "<group>/<key>".

    Does nothing when the module-level `run` is falsy.

    Args:
        imgs_dict (dict): Mapping of image name to the value handed to `File(...)`.
        group (str): Prefix placed before each image name in the upload key.
    """
    if not run:
        return
    for image_name, image in imgs_dict.items():
        target = f"{group}/{image_name}"
        run[target].upload(File(image))


def _log_plot(title, plot_path):
    """
    Upload a saved plot image to the NeptuneAI run under "Plots/<title>".

    The image is read from disk, drawn on a frameless, tick-free axes that fills the whole figure,
    and the figure is uploaded. The figure is always closed afterwards so repeated calls do not
    accumulate open pyplot figures. Does nothing when the module-level `run` is falsy.

    Args:
        title (str): Title of the plot; used as the last component of the upload key.
        plot_path (PosixPath | str): Path to the image file.
    """
    if not run:  # same guard as the other logging helpers; avoids creating a figure for nothing
        return

    import matplotlib.image as mpimg
    import matplotlib.pyplot as plt

    img = mpimg.imread(plot_path)
    fig = plt.figure()
    try:
        ax = fig.add_axes([0, 0, 1, 1], frameon=False, aspect="auto", xticks=[], yticks=[])  # no ticks
        ax.imshow(img)
        run[f"Plots/{title}"].upload(fig)
    finally:
        # pyplot keeps every figure it creates alive until it is closed explicitly
        plt.close(fig)


def on_pretrain_routine_start(trainer):
    """
    Create the NeptuneAI run and store the training hyperparameters before the training routine starts.

    Any exception raised while doing so is logged as a warning instead of being propagated.

    Args:
        trainer: Training object whose `args` provide `project`, `name` and the hyperparameters.
    """
    global run
    try:
        args = trainer.args
        run = neptune.init_run(project=args.project or "YOLOv8", name=args.name, tags=["YOLOv8"])
        # None values are stored as empty strings
        hyperparameters = {}
        for key, value in vars(args).items():
            hyperparameters[key] = "" if value is None else value
        run["Configuration/Hyperparameters"] = hyperparameters
    except Exception as e:
        LOGGER.warning(f"WARNING ⚠️ NeptuneAI installed but not initialized correctly, not logging this run. {e}")


def on_train_epoch_end(trainer):
    """
    Log training losses and learning rates to NeptuneAI at the end of a training epoch.

    Args:
        trainer: Training object providing `tloss`, `lr`, `epoch`, `save_dir` and `label_loss_items()`.
    """
    train_losses = trainer.label_loss_items(trainer.tloss, prefix="train")
    step = trainer.epoch + 1  # scalars are logged at epoch + 1
    _log_scalars(train_losses, step)
    _log_scalars(trainer.lr, step)

    # Training-batch mosaics are uploaded only when the epoch index equals 1.
    # NOTE(review): if epochs are 0-based this is the second epoch, not the first — confirm intended.
    if trainer.epoch == 1:
        mosaics = {}
        for image_path in trainer.save_dir.glob("train_batch*.jpg"):
            mosaics[image_path.stem] = str(image_path)
        _log_images(mosaics, "Mosaic")


def on_fit_epoch_end(trainer):
    """
    Log the metrics of a finished fit (train+val) epoch to NeptuneAI.

    When a run exists and the epoch index is 0, the model information is stored as well.

    Args:
        trainer: Training object providing `epoch` and `metrics`.
    """
    epoch = trainer.epoch
    if run and epoch == 0:
        from ultralytics.utils.torch_utils import model_info_for_loggers

        model_info = model_info_for_loggers(trainer)
        run["Configuration/Model"] = model_info
    _log_scalars(trainer.metrics, epoch + 1)


def on_val_end(validator):
    """
    Upload validation images to NeptuneAI at the end of a validation pass.

    Does nothing when the module-level `run` is falsy.

    Args:
        validator: Validation object whose `save_dir` is searched for "val*.jpg" files.
    """
    if not run:
        return
    val_images = {}
    for image_path in validator.save_dir.glob("val*.jpg"):
        val_images[image_path.stem] = str(image_path)
    _log_images(val_images, "Validation")


def on_train_end(trainer):
    """
    Upload the result plots and the best model weights to NeptuneAI when training ends.

    Does nothing when the module-level `run` is falsy.

    Args:
        trainer: Training object providing `save_dir`, `best` and `args` (`name`, `task`).
    """
    if not run:
        return

    # Candidate plot files: results, confusion matrices and the F1/PR/P/R curves
    plot_names = ["results.png", "confusion_matrix.png", "confusion_matrix_normalized.png"]
    plot_names += [f"{x}_curve.png" for x in ("F1", "PR", "P", "R")]

    # Keep only the plots that actually exist in the save directory
    candidates = [trainer.save_dir / name for name in plot_names]
    existing_plots = [path for path in candidates if path.exists()]
    for plot_file in existing_plots:
        _log_plot(title=plot_file.stem, plot_path=plot_file)  # title is the file name without suffix

    # Finally upload the best checkpoint
    weights_key = f"weights/{trainer.args.name or trainer.args.task}/{trainer.best.name}"
    run[weights_key].upload(File(str(trainer.best)))
# 如果 neptune 变量为真,则定义一个包含多个回调函数的字典;否则定义一个空字典。
# Callback table for this integration: stays empty unless the `neptune` handle is truthy.
callbacks = {}
if neptune:
    callbacks = {
        "on_pretrain_routine_start": on_pretrain_routine_start,
        "on_train_epoch_end": on_train_epoch_end,
        "on_fit_epoch_end": on_fit_epoch_end,
        "on_val_end": on_val_end,
        "on_train_end": on_train_end,
    }
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2024-09-08,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • .\yolov8\ultralytics\utils\benchmarks.py
  • .\yolov8\ultralytics\utils\callbacks\base.py
  • .\yolov8\ultralytics\utils\callbacks\clearml.py
  • .\yolov8\ultralytics\utils\callbacks\comet.py
  • .\yolov8\ultralytics\utils\callbacks\dvc.py
  • .\yolov8\ultralytics\utils\callbacks\hub.py
  • .\yolov8\ultralytics\utils\callbacks\mlflow.py
  • .\yolov8\ultralytics\utils\callbacks\neptune.py
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档