Docs/Others/根据gpu使用率调度/获取gpu使用率脚本.md at db19836f354d9f257853c209812c10dda6dc8b34

iProbe 44c0449f12 更新 Others/根据gpu使用率调度/获取gpu使用率脚本.md

2024-01-26 16:34:46 +08:00

2.8 KiB

Raw Blame History

lb.py

#!/usr/bin/env python

import pynvml
import redis

# Binding between each large-model process (identified by its port) and the
# GPUs it runs on: port -> [gpu_device_id, ...]. Device ids use the same
# "g<index>" form that getGpuInfo() uses for its keys.
# NOTE(review): no port references 'g2' -- confirm that is intentional.
gpu = {
    '1001': ['g0', 'g1'],
    '1002': ['g3'],
    '1003': ['g4'],
}

def getGpuInfo():
    """Collect free memory and idle headroom for every GPU NVML reports.

    Returns:
        dict: maps "g<index>" (index as enumerated by NVML) to a dict with
            'free': the device's free memory divided by 1024**2
                    (MiB, given that NVML reports bytes), and
            'util': 100 minus the reported GPU utilization percentage,
                    i.e. the idle share of the GPU.

    Raises:
        pynvml.NVMLError: if NVML cannot be initialised or a query fails.
    """
    gpuUsageInfo={}
    pynvml.nvmlInit()
    # nvmlInit() succeeded, so from here on the session must always be
    # released -- even when one of the per-device queries raises.
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpuUtil = pynvml.nvmlDeviceGetUtilizationRates(handle)
            gpuUsageInfo[f"g{i}"] = {
                'free': memInfo.free/1024**2,
                'util': 100-gpuUtil.gpu
            }
    finally:
        pynvml.nvmlShutdown()
    return gpuUsageInfo

def getBanlanceRate(memory,gpu):
    """Fold a memory figure and a GPU figure into one integer score.

    Args:
        memory: memory amount; it is divided by 100 before being combined.
        gpu: GPU figure, added as-is.

    Returns:
        int: ``memory / 100 + gpu`` truncated toward zero.
    """
    weighted = memory / 100 + gpu
    return int(weighted)

def zsetData(**kwargs):
    """Compute one balance score per keyword argument.

    Each keyword is a port and its value an iterable of device ids. For every
    port the 'free' and 'util' entries of its devices are summed from the
    module-level ``gpuUsageInfo`` dict (assigned in the ``__main__`` section)
    and passed to getBanlanceRate().

    Returns:
        dict: port -> integer score, in the order the ports were given.

    Raises:
        KeyError: if a device id is not present in ``gpuUsageInfo``.
    """
    scores = {}
    for port, devices in kwargs.items():
        free_total = 0
        idle_total = 0
        for dev in devices:
            stats = gpuUsageInfo[dev]
            free_total += stats['free']
            idle_total += stats['util']
        scores[port] = getBanlanceRate(free_total, idle_total)
    return scores


if __name__ == '__main__':
    # Address used as the prefix of every sorted-set member ("<local>:<port>").
    # NOTE(review): presumably this host's own IP -- confirm per deployment.
    local="172.20.1.3"
    # NOTE(review): the Redis endpoint and password are hard-coded in source;
    # they should be moved to configuration/secret storage and rotated.
    pool=redis.ConnectionPool(host="60.204.148.84",port=6379,password='xm!redis123',db=15)
    r=redis.Redis(connection_pool=pool)
    pipe=r.pipeline(transaction=True)
    # Must remain a module-level name: zsetData() reads it as a global.
    gpuUsageInfo=getGpuInfo()
    data=zsetData(**gpu)
    # Queue every score on the transactional pipeline and commit them in one
    # round trip, so readers of "danceai" never observe a half-updated set.
    # (Previously the pipeline was created but each ZADD was sent separately.)
    for key, score in data.items():
        pipe.zadd("danceai",{f"{local}:{key}":score})
    pipe.execute()
(smi) spwang@k8s-worker-003:~$ cat smi-v1.py
#!/usr/bin/env python

import pynvml
import redis

# Binding between each large-model process (identified by its port) and the
# GPUs it runs on: port -> [gpu_device_id, ...]. Device ids use the same
# "g<index>" form that getGpuInfo() uses for its keys.
# NOTE(review): no port references 'g2' -- confirm that is intentional.
gpu = {
    '1001': ['g0', 'g1'],
    '1002': ['g3'],
    '1003': ['g4'],
}

def getGpuInfo():
    """Collect memory and compute load for every GPU NVML reports.

    Returns:
        dict: maps "g<index>" (index as enumerated by NVML) to a dict with
            'used': used device memory as a percentage of total memory, and
            'util': the GPU utilization percentage reported by NVML.

    Raises:
        pynvml.NVMLError: if NVML cannot be initialised or a query fails.
    """
    gpuUsageInfo={}
    pynvml.nvmlInit()
    # nvmlInit() succeeded, so from here on the session must always be
    # released -- even when one of the per-device queries raises.
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpuUtil = pynvml.nvmlDeviceGetUtilizationRates(handle)
            gpuUsageInfo[f"g{i}"] = {
                'used': (memInfo.used/memInfo.total)*100,
                'util': gpuUtil.gpu
            }
    finally:
        pynvml.nvmlShutdown()
    return gpuUsageInfo

def getBanlanceRate(memory,gpu):
    """Fold a memory figure and a GPU figure into one integer score.

    Args:
        memory: memory figure (the caller passes summed used-memory percentages).
        gpu: GPU figure (the caller passes summed utilization percentages).

    Returns:
        int: ``memory + gpu`` truncated toward zero.
    """
    combined = memory + gpu
    return int(combined)

def zsetData(**kwargs):
    """Compute one load score per keyword argument.

    Each keyword is a port and its value an iterable of device ids. For every
    port the 'used' and 'util' entries of its devices are summed from the
    module-level ``gpuUsageInfo`` dict (assigned in the ``__main__`` section)
    and passed to getBanlanceRate().

    Returns:
        dict: port -> integer score, in the order the ports were given.

    Raises:
        KeyError: if a device id is not present in ``gpuUsageInfo``.
    """
    scores = {}
    for port, devices in kwargs.items():
        used_total = 0
        util_total = 0
        for dev in devices:
            stats = gpuUsageInfo[dev]
            used_total += stats['used']
            util_total += stats['util']
        scores[port] = getBanlanceRate(used_total, util_total)
    return scores


if __name__ == '__main__':
    # Address used as the prefix of every sorted-set member ("<local>:<port>").
    # NOTE(review): presumably this host's own IP -- confirm per deployment.
    local="172.20.1.3"
    # NOTE(review): the Redis endpoint and password are hard-coded in source;
    # they should be moved to configuration/secret storage and rotated.
    pool=redis.ConnectionPool(host="60.204.148.84",port=6379,password='xm!redis123',db=15)
    r=redis.Redis(connection_pool=pool)
    pipe=r.pipeline(transaction=True)
    # Must remain a module-level name: zsetData() reads it as a global.
    gpuUsageInfo=getGpuInfo()
    data=zsetData(**gpu)
    # Queue every score on the transactional pipeline and commit them in one
    # round trip, so readers of "danceai" never observe a half-updated set.
    # (Previously the pipeline was created but each ZADD was sent separately.)
    for key, score in data.items():
        pipe.zadd("danceai",{f"{local}:{key}":score})
    pipe.execute()

2.8 KiB Raw Blame History Unescape Escape

2.8 KiB

Raw Blame History