更新 Others/根据gpu使用率调度/获取gpu使用率脚本.md
This commit is contained in:
parent
7feb11409e
commit
fa5340a68f
1 changed files with 2 additions and 53 deletions
|
@ -8,57 +8,6 @@ import redis
|
||||||
# 大模型进程(端口)与gpu绑定关系:port: [gpu_device_id..]
|
# 大模型进程(端口)与gpu绑定关系:port: [gpu_device_id..]
|
||||||
gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']}
|
gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']}
|
||||||
|
|
||||||
def getGpuInfo():
    """Take a snapshot of free memory and idle percentage for every GPU.

    Queries NVML through pynvml for each device index in
    ``range(nvmlDeviceGetCount())``.

    Returns:
        dict: maps ``"g<index>"`` to a dict with two entries:
            ``'free'`` - the device's free memory divided by 1024**2
            (MiB, assuming NVML reports bytes);
            ``'util'`` - ``100 - utilization.gpu``, i.e. the share of the
            GPU that is *not* busy, so a larger value means a less loaded
            device.
    """
    gpuUsageInfo = {}
    pynvml.nvmlInit()
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpuUtil = pynvml.nvmlDeviceGetUtilizationRates(handle)
            gpuUsageInfo[f"g{i}"] = {
                'free': memInfo.free/1024**2,
                'util': 100-gpuUtil.gpu
            }
    finally:
        # Always release NVML, even when one of the queries above raises.
        pynvml.nvmlShutdown()
    return gpuUsageInfo
|
|
||||||
|
|
||||||
def getBanlanceRate(memory,gpu):
    """Fold a memory figure and a GPU figure into one integer score.

    The memory figure is scaled down by 100 before being added to the GPU
    figure; the sum is truncated toward zero by ``int``.
    """
    scaled_memory = memory / 100
    score = scaled_memory + gpu
    return int(score)
|
|
||||||
|
|
||||||
def zsetData(*args, **kwargs):
    """Compute one balance score per port from the GPUs bound to it.

    Args:
        *args: optionally a single positional mapping of device id to a
            dict with ``'free'`` and ``'util'`` entries. When omitted, the
            module-level ``gpuUsageInfo`` is used, which is how the script
            entry point calls this function.
        **kwargs: port -> list of device ids bound to that port.

    Returns:
        dict: port -> score, where the score is
        ``getBanlanceRate(sum of 'free', sum of 'util')`` over the port's
        devices.

    Raises:
        TypeError: if more than one positional argument is given.
        KeyError: if a bound device id is absent from the usage snapshot.
    """
    if len(args) > 1:
        raise TypeError(
            f"zsetData() takes at most 1 positional argument ({len(args)} given)"
        )
    # Fall back to the global only when no snapshot was handed in, so the
    # function is also usable outside the script entry point.
    usage = args[0] if args else gpuUsageInfo

    data = {}
    for key, devices in kwargs.items():
        memory = 0
        gpu = 0
        for d in devices:
            if d not in usage:
                raise KeyError(
                    f"device {d!r} bound to {key!r} is missing from the GPU usage snapshot"
                )
            memory += usage[d]['free']
            gpu += usage[d]['util']
        data[key] = getBanlanceRate(memory, gpu)
    return data
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: snapshot GPU usage, score each port, and publish
    # the scores to the "danceai" sorted set as "<local ip>:<port>" members.
    import os

    local="172.20.1.3"
    # NOTE(review): the Redis password is committed in plain text. The
    # REDIS_PASSWORD environment variable takes precedence; the literal
    # fallback only preserves existing behavior and should be removed
    # (and the credential rotated).
    password=os.environ.get("REDIS_PASSWORD",'xm!redis123')
    pool=redis.ConnectionPool(host="60.204.148.84",port=6379,password=password,db=15)
    r=redis.Redis(connection_pool=pool)
    # Must stay a module-level name: zsetData reads it as a global.
    gpuUsageInfo=getGpuInfo()
    data=zsetData(**gpu)
    members={f"{local}:{key}":score for key,score in data.items()}
    # One ZADD carries every member, so all scores are updated together;
    # redis-py rejects an empty mapping, hence the guard.
    if members:
        r.zadd("danceai",members)
|
|
||||||
(smi) spwang@k8s-worker-003:~$ cat smi-v1.py
|
|
||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
import pynvml
|
|
||||||
import redis
|
|
||||||
|
|
||||||
# 大模型进程(端口)与gpu绑定关系:port: [gpu_device_id..]
|
|
||||||
gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']}
|
|
||||||
|
|
||||||
def getGpuInfo():
|
def getGpuInfo():
|
||||||
gpuUsageInfo={}
|
gpuUsageInfo={}
|
||||||
pynvml.nvmlInit()
|
pynvml.nvmlInit()
|
||||||
|
@ -76,7 +25,7 @@ def getGpuInfo():
|
||||||
return gpuUsageInfo
|
return gpuUsageInfo
|
||||||
|
|
||||||
def getBanlanceRate(memory,gpu):
|
def getBanlanceRate(memory,gpu):
|
||||||
return int(memory+gpu)
|
return memory+gpu
|
||||||
|
|
||||||
def zsetData(**kwargs):
|
def zsetData(**kwargs):
|
||||||
data={}
|
data={}
|
||||||
|
@ -86,7 +35,7 @@ def zsetData(**kwargs):
|
||||||
for d in kwargs[key]:
|
for d in kwargs[key]:
|
||||||
memory+=gpuUsageInfo[d]['used']
|
memory+=gpuUsageInfo[d]['used']
|
||||||
gpu+=gpuUsageInfo[d]['util']
|
gpu+=gpuUsageInfo[d]['util']
|
||||||
banlance=getBanlanceRate(memory,gpu)
|
banlance=int(getBanlanceRate(memory,gpu)/len(kwargs[key]))
|
||||||
data[key]=banlance
|
data[key]=banlance
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue