
HAMi Source Code Analysis: HAMi-Core (2)

Original
Author: DifficultWork
Last modified: 2025-07-09 14:54:21
Published in the column: 阶梯计划

3 GPU Memory Limiting

HAMi enforces its GPU limits in two parts:

3.1 NVML Interception

When you run nvidia-smi inside a Pod to inspect the device, the Memory section shows only the requested amount rather than the device's real memory. This is achieved by intercepting the NVML API.

For example, when 8 GiB of device memory is requested, the Pod sees exactly 8 GiB.

[Figure: nvidia-smi inside the Pod showing the capped GPU memory]
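
HAMi-Core's hook library is loaded ahead of the real libnvidia-ml.so inside the container (e.g. via LD_PRELOAD or /etc/ld.so.preload), so the application's NVML calls bind to HAMi's symbols first. Below is a minimal, illustrative sketch of that interposition pattern; it is not HAMi's actual code, which dispatches through its own nvml_library_entry table (shown further below) rather than dlsym(RTLD_NEXT):

// A minimal interposition sketch (not HAMi's actual code): the preloaded
// library defines nvmlDeviceGetMemoryInfo itself, so the dynamic linker
// binds the application's call here first; the hook forwards to the real
// symbol found via dlsym(RTLD_NEXT) and may rewrite the result.
#define _GNU_SOURCE
#include <dlfcn.h>
#include <nvml.h>

nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t* memory) {
    static nvmlReturn_t (*real_fn)(nvmlDevice_t, nvmlMemory_t*) = NULL;
    if (real_fn == NULL)
        real_fn = (nvmlReturn_t (*)(nvmlDevice_t, nvmlMemory_t*))
                      dlsym(RTLD_NEXT, "nvmlDeviceGetMemoryInfo");
    nvmlReturn_t ret = real_fn(device, memory);
    // ...a hook could rewrite memory->total / memory->free / memory->used here...
    return ret;
}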

Concretely, this is implemented by intercepting the _nvmlDeviceGetMemoryInfo API in NVML:

// src/nvml/hook.c
nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,nvmlMemory_t* memory,int version) {
    unsigned int dev_id;
    LOG_DEBUG("into nvmlDeviceGetMemoryInfo");

    switch (version){
        case 1:
            CHECK_NVML_API(NVML_OVERRIDE_CALL(nvml_library_entry,nvmlDeviceGetMemoryInfo, device, memory));
            break;
        case 2:
            CHECK_NVML_API(NVML_OVERRIDE_CALL(nvml_library_entry,nvmlDeviceGetMemoryInfo_v2, device, (nvmlMemory_v2_t *)memory));
    }
    LOG_DEBUG("origin_free=%lld total=%lld\n",memory->free,memory->total);
    CHECK_NVML_API(nvmlDeviceGetIndex(device, &dev_id));
    int cudadev = nvml_to_cuda_map(dev_id);
    if (cudadev < 0)
        return NVML_SUCCESS;
    // the core part
    size_t usage = get_current_device_memory_usage(cudadev);
    size_t monitor = get_current_device_memory_monitor(cudadev);
    // limit is the total requested device memory, i.e. the 8 GiB above
    size_t limit = get_current_device_memory_limit(cudadev);
    LOG_DEBUG("usage=%ld limit=%ld monitor=%ld",usage,limit,monitor);
    if ( memory == NULL) {
        return NVML_SUCCESS;
    }
    if (limit == 0){
        switch (version){
        case 1:
            memory->used = usage;
            return NVML_SUCCESS;
        case 2:
            ((nvmlMemory_v2_t *)memory)->used = usage;
            return NVML_SUCCESS;
        }
    } else {
        switch (version){
        case 1:
            memory->free = (limit-usage);
            memory->total = limit;
            memory->used = usage;
            return NVML_SUCCESS;
        case 2:
            ((nvmlMemory_v2_t *)memory)->free = (limit-usage);
            ((nvmlMemory_v2_t *)memory)->total = limit;
            ((nvmlMemory_v2_t *)memory)->used = usage;
            return NVML_SUCCESS;
        } 
    }
    return NVML_SUCCESS;
}
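
Note that NVML_OVERRIDE_CALL does not re-enter the hooked symbol: it dispatches through nvml_library_entry, a table of function pointers resolved from the real library at startup. A simplified sketch of this dispatch pattern follows; the table layout and helper name are assumptions, not HAMi's verbatim definitions:

// Simplified sketch of the override-call dispatch (layout assumed).
#include <dlfcn.h>
#include <stddef.h>

typedef struct {
    const char* name;    // e.g. "nvmlDeviceGetMemoryInfo"
    void*       fn_ptr;  // address resolved from the real libnvidia-ml.so
} entry_t;

static entry_t nvml_entries[] = {
    { "nvmlDeviceGetMemoryInfo",    NULL },
    { "nvmlDeviceGetMemoryInfo_v2", NULL },
    // ...one slot per intercepted symbol...
};

// Filled once at startup; NVML_OVERRIDE_CALL(...) then calls through the
// stored pointer, bypassing the preloaded hook entirely.
static void resolve_entries_sketch(void) {
    void* lib = dlopen("libnvidia-ml.so.1", RTLD_NOW);
    for (size_t i = 0; i < sizeof(nvml_entries)/sizeof(nvml_entries[0]); ++i)
        nvml_entries[i].fn_ptr = dlsym(lib, nvml_entries[i].name);
}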

The implementation of get_current_device_memory_limit:

// src/multiprocess/multiprocess_memory_limit.c
uint64_t get_current_device_memory_limit(const int dev) {
    ensure_initialized();
    if (dev < 0 || dev >= CUDA_DEVICE_MAX_COUNT) {
        LOG_ERROR("Illegal device id: %d", dev);
    }
    // read directly from region_info (the mmap'ed shared region)
    return region_info.shared_region->limit[dev];       
}

Although the file name suggests it only does memory limiting, HAMi's multi-process resource accounting is built on shared memory: besides the memory-limit bookkeeping it also hosts utility functions such as host/container pid translation and shared-region locking (lock_shrreg, unlock_shrreg). vcuda-controller's implementation is comparatively simple: it records each process's pid in a file, and every API call invokes nvmlDeviceGetComputeRunningProcesses and then matches against those pids (binary-searching the pids to save time), so the overall overhead remains fairly high. HAMi instead creates one resource-usage statistics file shared by all processes, which acts as a cache and cuts down the number of NVML API calls. This shared file is mmap'ed into every process as region_info.shared_region, of type shared_region_t (ref: HAMI源码阅读).
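
To make the following snippets easier to read, here is an approximate sketch of that shared-region layout, reconstructed only from the fields referenced in this article; the real shared_region_t (declared alongside src/multiprocess/multiprocess_memory_limit.c) carries more fields, such as the per-process usage slots:

// Approximate layout of the mmap'ed shared region, reconstructed from the
// fields used in the snippets in this article; not the verbatim definition.
#include <semaphore.h>
#include <stdint.h>
#include <time.h>

#define CUDA_DEVICE_MAX_COUNT 16   // illustrative; the real value comes from HAMi's headers

typedef struct {
    int initialized_flag;                      // MULTIPROCESS_SHARED_REGION_MAGIC_FLAG when ready
    int major_version;
    int minor_version;
    sem_t sem;                                 // process-shared semaphore: sem_init(&sem, 1, 1)
    uint64_t limit[CUDA_DEVICE_MAX_COUNT];     // per-device memory limit in bytes
    uint64_t sm_limit[CUDA_DEVICE_MAX_COUNT];  // per-device SM utilization limit
    int sm_init_flag;
    int utilization_switch;
    int recent_kernel;
    int priority;
    time_t last_kernel_time;
    // ...plus per-process pid/usage slots and other bookkeeping...
} shared_region_t;

// Each process holds a private handle to the mapping:
typedef struct {
    int32_t pid;
    int fd;                          // fd of the shared file; -1 means "not mapped yet"
    time_t last_kernel_time;
    shared_region_t* shared_region;  // the mmap'ed view; this is region_info.shared_region
} shared_region_info_t;              // type name of the global region_info (assumed)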

The limit array inside region_info is filled in by do_init_device_memory_limits:

// src/multiprocess/multiprocess_memory_limit.c
void do_init_device_memory_limits(uint64_t* arr, int len) {
    // read the global default from an environment variable
    size_t fallback_limit = get_limit_from_env(CUDA_DEVICE_MEMORY_LIMIT);
    int i;
    for (i = 0; i < len; ++i) {
        char env_name[CUDA_DEVICE_MEMORY_LIMIT_KEY_LENGTH] = CUDA_DEVICE_MEMORY_LIMIT;
        char index_name[8];
        snprintf(index_name, 8, "_%d", i);
        strcat(env_name, index_name);
        // per-device override, also read from an environment variable
        size_t cur_limit = get_limit_from_env(env_name);
        if (cur_limit > 0) {
            arr[i] = cur_limit;
        } else if (fallback_limit > 0) {
            arr[i] = fallback_limit;
        } else {
            arr[i] = 0;
        }
    }
}

These environment variables correspond to nvidia.com/gpumem in the Pod's YAML.
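
The HAMi device plugin injects these variables into the container: CUDA_DEVICE_MEMORY_LIMIT sets a default for all devices, and CUDA_DEVICE_MEMORY_LIMIT_0, _1, ... override it per device, with values like 8g or 8192m. Below is a hedged sketch of what get_limit_from_env plausibly does; the parsing details are an assumption, not the verbatim implementation:

// Hedged sketch of get_limit_from_env (illustrative, not verbatim):
// read e.g. CUDA_DEVICE_MEMORY_LIMIT=8g / 8192m / 8388608k and convert
// the value to bytes; returns 0 when the variable is unset.
#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>

static uint64_t get_limit_from_env_sketch(const char* name) {
    const char* val = getenv(name);
    if (val == NULL)
        return 0;
    char* end = NULL;
    uint64_t n = strtoull(val, &end, 10);
    switch (tolower((unsigned char)*end)) {
        case 'k': return n << 10;
        case 'm': return n << 20;
        case 'g': return n << 30;
        default:  return n;   // no suffix: raw bytes
    }
}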

region_info itself is initialized from the initialized function:

// src/multiprocess/multiprocess_memory_limit.c
void initialized() {
    pthread_mutex_init(&_kernel_mutex, NULL);
    char* _record_kernel_interval_env = getenv("RECORD_KERNEL_INTERVAL");
    if (_record_kernel_interval_env) {
        _record_kernel_interval = atoi(_record_kernel_interval_env);
    }
    try_create_shrreg();
    init_proc_slot_withlock();
}

void try_create_shrreg() {
    LOG_DEBUG("Try create shrreg");
    if (region_info.fd == -1) {
        // use .fd to indicate whether a reinit after fork happen
        // no need to register exit handler after fork
        if (0 != atexit(exit_handler)) {
            LOG_ERROR("Register exit handler failed: %d", errno);
        }
    }

    enable_active_oom_killer = set_active_oom_killer();
    env_utilization_switch = set_env_utilization_switch();
    pthread_atfork(NULL, NULL, child_reinit_flag);

    region_info.pid = getpid();
    region_info.fd = -1;
    region_info.last_kernel_time = time(NULL);

    umask(0);

    char* shr_reg_file = getenv(MULTIPROCESS_SHARED_REGION_CACHE_ENV);
    if (shr_reg_file == NULL) {
        shr_reg_file = MULTIPROCESS_SHARED_REGION_CACHE_DEFAULT;
    }
    // Initialize NVML BEFORE!! open it
    //nvmlInit();

    /* If you need sm modification, do it here */
    /* ... set_sm_scale */

    ...

    //put_device_info();
    if (region->initialized_flag != 
          MULTIPROCESS_SHARED_REGION_MAGIC_FLAG) {
        region->major_version = MAJOR_VERSION;
        region->minor_version = MINOR_VERSION;
        //init_device_info();
        // initialize the limit array; this one is shared across processes
        do_init_device_memory_limits(
            region->limit, CUDA_DEVICE_MAX_COUNT);
        do_init_device_sm_limits(
            region->sm_limit,CUDA_DEVICE_MAX_COUNT);
        if (sem_init(&region->sem, 1, 1) != 0) {
            LOG_ERROR("Fail to init sem %s: errno=%d", shr_reg_file, errno);
        }
        __sync_synchronize();
        region->sm_init_flag = 0;
        region->utilization_switch = 1;
        region->recent_kernel = 2;
        region->priority = 1;
        if (getenv(CUDA_TASK_PRIORITY_ENV)!=NULL)
            region->priority = atoi(getenv(CUDA_TASK_PRIORITY_ENV));
        region->initialized_flag = MULTIPROCESS_SHARED_REGION_MAGIC_FLAG;
    } else {
        if (region->major_version != MAJOR_VERSION || 
                region->minor_version != MINOR_VERSION) {
            LOG_ERROR("The current version number %d.%d"
                    " is different from the file's version number %d.%d",
                    MAJOR_VERSION, MINOR_VERSION,
                    region->major_version, region->minor_version);
        }
        uint64_t local_limits[CUDA_DEVICE_MAX_COUNT];
        // initialize a local limit array, private to this process, for consistency checking
        do_init_device_memory_limits(local_limits, CUDA_DEVICE_MAX_COUNT);
        int i;
        for (i = 0; i < CUDA_DEVICE_MAX_COUNT; ++i) {
            if (local_limits[i] != region->limit[i]) {
                LOG_ERROR("Limit inconsistency detected for %dth device"
                    ", %lu expected, get %lu", 
                    i, local_limits[i], region->limit[i]);
            }
        }
        do_init_device_sm_limits(local_limits,CUDA_DEVICE_MAX_COUNT);
        for (i = 0; i < CUDA_DEVICE_MAX_COUNT; ++i) {
            if (local_limits[i] != region->sm_limit[i]) {
                LOG_INFO("SM limit inconsistency detected for %dth device"
                    ", %lu expected, get %lu", 
                    i, local_limits[i], region->sm_limit[i]);
            //    exit(1); 
            }
        }
    }
    region->last_kernel_time = region_info.last_kernel_time;
    if (lockf(fd, F_ULOCK, SHARED_REGION_SIZE_MAGIC) != 0) {
        LOG_ERROR("Fail to unlock shrreg %s: errno=%d", shr_reg_file, errno);
    }
    LOG_DEBUG("shrreg created");
}
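
The "..." in the listing elides the part where the shared file is actually opened and mapped, which is also where the fd used by the final lockf(F_ULOCK) comes from. Assuming the standard POSIX open-lock-map sequence, that part looks roughly like this (a sketch, not the verbatim source):

// Sketch of the elided open -> lock -> map sequence in try_create_shrreg
// (an assumption based on the surrounding code, not the verbatim source).
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static void map_shrreg_sketch(const char* shr_reg_file) {
    int fd = open(shr_reg_file, O_RDWR | O_CREAT, 0777);
    if (fd == -1)
        LOG_ERROR("Fail to open shrreg %s: errno=%d", shr_reg_file, errno);
    // Serialize initialization across processes; the matching
    // lockf(fd, F_ULOCK, SHARED_REGION_SIZE_MAGIC) sits at the end of
    // try_create_shrreg above.
    if (lockf(fd, F_LOCK, SHARED_REGION_SIZE_MAGIC) != 0)
        LOG_ERROR("Fail to lock shrreg %s: errno=%d", shr_reg_file, errno);
    if (ftruncate(fd, SHARED_REGION_SIZE_MAGIC) != 0)
        LOG_ERROR("Fail to truncate shrreg %s: errno=%d", shr_reg_file, errno);
    region_info.fd = fd;
    region_info.shared_region = (shared_region_t*)mmap(
        NULL, SHARED_REGION_SIZE_MAGIC,
        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}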

3.2 CUDA Interception

cuMemAlloc_v2 is the CUDA driver API for allocating device memory on the GPU; HAMi re-implements it, together with its own wrapper cuMemoryAllocate:

// src/cuda/memory.c
CUresult cuMemoryAllocate(CUdeviceptr* dptr, size_t bytesize, size_t* bytesallocated,void* data){
    CUresult res;
    if (bytesallocated!=NULL)
        *bytesallocated = bytesize;
    res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemAlloc_v2,dptr,bytesize);
    return res;
}

CUresult cuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize) {
    LOG_INFO("into cuMemAllocing_v2 dptr=%p bytesize=%ld",dptr,bytesize);
    ENSURE_RUNNING();
    // the actual allocation request goes through allocate_raw
    CUresult res = allocate_raw(dptr,bytesize);
    if (res!=CUDA_SUCCESS)
        return res;
    LOG_INFO("res=%d, cuMemAlloc_v2 success dptr=%p bytesize=%lu",0,(void *)*dptr,bytesize);
    return CUDA_SUCCESS;
}

allocate_raw is implemented as follows:

// src/allocator/allocator.c
int allocate_raw(CUdeviceptr *dptr, size_t size){
    int tmp;
    pthread_mutex_lock(&mutex);
    // the key step
    tmp = add_chunk(dptr,size);
    pthread_mutex_unlock(&mutex);
    return tmp;
}

add_chunk is implemented as follows:

// src/allocator/allocator.c
int add_chunk(CUdeviceptr *address,size_t size){
    size_t addr=0;
    size_t allocsize;
    CUresult res = CUDA_SUCCESS;
    CUdevice dev;
    cuCtxGetDevice(&dev);
    // custom check for whether this allocation would exceed the quota
    if (oom_check(dev,size))
        return CUDA_ERROR_OUT_OF_MEMORY;
    
    allocated_list_entry *e;
    INIT_ALLOCATED_LIST_ENTRY(e,addr,size);
    if (size <= IPCSIZE)
        res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemAlloc_v2,&e->entry->address,size);
    else{
        //size = round_up(size,ALIGN);
        e->entry->length = size;
        res = cuMemoryAllocate(&e->entry->address,size,&e->entry->length,e->entry->allocHandle);
    }
    if (res!=CUDA_SUCCESS){
        LOG_ERROR("cuMemoryAllocate failed res=%d",res);
        return res;
    }
    LIST_ADD(device_overallocated,e);
    //uint64_t t_size;
    *address = e->entry->address;
    allocsize = size;
    cuCtxGetDevice(&dev);
    add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
    return 0;
}
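
add_gpu_device_memory_usage records the allocation in the per-process usage slots of the shared region from section 3.1, which is what get_gpu_memory_usage later reads back in oom_check. A rough sketch of that bookkeeping follows; the slot layout (procs, proc_num, used) is an assumed illustration, not HAMi's real structures:

// Rough sketch of the bookkeeping behind add_gpu_device_memory_usage
// (assumed slot layout; the real code lives in
// src/multiprocess/multiprocess_memory_limit.c).
#include <stddef.h>
#include <stdint.h>

typedef struct {
    int32_t pid;
    uint64_t used[CUDA_DEVICE_MAX_COUNT];  // per-device bytes owned by this pid
} proc_slot_t;                             // assumed name

int add_usage_sketch(proc_slot_t* procs, int proc_num,
                     int32_t pid, int dev, size_t bytes) {
    lock_shrreg();                         // shared-region lock from section 3.1
    for (int i = 0; i < proc_num; ++i) {
        if (procs[i].pid == pid) {
            procs[i].used[dev] += bytes;   // accumulate this process's usage
            break;
        }
    }
    unlock_shrreg();
    return 0;
}
// get_gpu_memory_usage(dev) would then sum used[dev] over all live slots.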

The oom_check implementation:

// src/allocator/allocator.c
int oom_check(const int dev,size_t addon) {
    int count1=0;
    CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetCount,&count1);
    CUdevice d;
    if (dev==-1)
        cuCtxGetDevice(&d);
    else
        d=dev;
    uint64_t limit = get_current_device_memory_limit(d);
    size_t _usage = get_gpu_memory_usage(d);

    if (limit == 0) {
        return 0;
    }

    size_t new_allocated = _usage + addon;
    LOG_INFO("_usage=%lu limit=%lu new_allocated=%lu",_usage,limit,new_allocated);
    // if the new allocation would exceed the limit, clean up exited processes once and retry recursively
    if (new_allocated > limit) {
        LOG_ERROR("Device %d OOM %lu / %lu", d, new_allocated, limit);

        if (rm_quitted_process() > 0)
            return oom_check(dev,addon);
        return 1;
    }
    return 0;
}

Conjecture: to implement oversubscription, would it be enough to change just this one spot, i.e. whether to allow oversubscription and by how much?
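
As a sketch of that conjecture only (nothing like this exists in HAMi today), oom_check could scale the limit by a ratio read from a hypothetical environment variable before comparing:

// Hypothetical sketch of the conjecture above; CUDA_OVERSUBSCRIBE_RATIO
// is a made-up variable, not part of HAMi.
#include <stdint.h>
#include <stdlib.h>

static uint64_t effective_limit(uint64_t limit) {
    const char* env = getenv("CUDA_OVERSUBSCRIBE_RATIO");  // hypothetical knob
    if (env != NULL) {
        double ratio = atof(env);
        if (ratio > 1.0)
            return (uint64_t)(limit * ratio);  // permit exceeding the quota by `ratio`
    }
    return limit;
}

// oom_check would then test:
//     if (new_allocated > effective_limit(limit)) { /* OOM path */ }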

Original statement: this article was published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.

For infringement concerns, please contact cloudcommunity@tencent.com for removal.

