From fa5340a68fb44b8da93e3823aa53e21368e29e9f Mon Sep 17 00:00:00 2001 From: iProbe Date: Fri, 26 Jan 2024 16:52:36 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20Others/=E6=A0=B9=E6=8D=AEg?= =?UTF-8?q?pu=E4=BD=BF=E7=94=A8=E7=8E=87=E8=B0=83=E5=BA=A6/=E8=8E=B7?= =?UTF-8?q?=E5=8F=96gpu=E4=BD=BF=E7=94=A8=E7=8E=87=E8=84=9A=E6=9C=AC.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Others/根据gpu使用率调度/获取gpu使用率脚本.md | 55 +------------------ 1 file changed, 2 insertions(+), 53 deletions(-) diff --git a/Others/根据gpu使用率调度/获取gpu使用率脚本.md b/Others/根据gpu使用率调度/获取gpu使用率脚本.md index ef7dffb..6efa1b1 100644 --- a/Others/根据gpu使用率调度/获取gpu使用率脚本.md +++ b/Others/根据gpu使用率调度/获取gpu使用率脚本.md @@ -8,57 +8,6 @@ import redis # 大模型进程(端口)与gpu绑定关系:port: [gpu_device_id..] gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']} -def getGpuInfo(): - gpuUsageInfo={} - pynvml.nvmlInit() - deviceCount = pynvml.nvmlDeviceGetCount() - for i in range(deviceCount): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - gpuUtil = pynvml.nvmlDeviceGetUtilizationRates(handle) - gpuUsageInfo[f"g{i}"] = { - 'free': memInfo.free/1024**2, - 'util': 100-gpuUtil.gpu - } - - pynvml.nvmlShutdown() - return gpuUsageInfo - -def getBanlanceRate(memory,gpu): - return int(memory/100+gpu) - -def zsetData(**kwargs): - data={} - for key in kwargs: - memory=0 - gpu=0 - for d in kwargs[key]: - memory+=gpuUsageInfo[d]['free'] - gpu+=gpuUsageInfo[d]['util'] - banlance=getBanlanceRate(memory,gpu) - data[key]=banlance - return data - - -if __name__ == '__main__': - local="172.20.1.3" - pool=redis.ConnectionPool(host="60.204.148.84",port=6379,password='xm!redis123',db=15) - r=redis.Redis(connection_pool=pool) - pipe=r.pipeline(transaction=True) - gpuUsageInfo=getGpuInfo() - data=zsetData(**gpu) - for key in data: - #print(data[key],f"{local}:{key}") - r.zadd("danceai",{f"{local}:{key}":data[key]}) -(smi) spwang@k8s-worker-003:~$ cat smi-v1.py -#!/usr/bin/env python - -import pynvml -import redis - -# 大模型进程(端口)与gpu绑定关系:port: [gpu_device_id..] -gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']} - def getGpuInfo(): gpuUsageInfo={} pynvml.nvmlInit() @@ -76,7 +25,7 @@ def getGpuInfo(): return gpuUsageInfo def getBanlanceRate(memory,gpu): - return int(memory+gpu) + return memory+gpu def zsetData(**kwargs): data={} @@ -86,7 +35,7 @@ def zsetData(**kwargs): for d in kwargs[key]: memory+=gpuUsageInfo[d]['used'] gpu+=gpuUsageInfo[d]['util'] - banlance=getBanlanceRate(memory,gpu) + banlance=int(getBanlanceRate(memory,gpu)/len(kwargs[key])) data[key]=banlance return data