2.8 KiB
2.8 KiB
lb.py
#!/usr/bin/env python
import pynvml
import redis
# Binding between large-model processes (identified by port) and GPUs:
# port: [gpu_device_id, ...]
gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']}
def getGpuInfo():
    """Take a per-GPU snapshot of free memory and idle compute through NVML.

    Returns:
        dict: maps "g<index>" (the NVML device index) to a dict with
            'free': the device's free memory divided by 1024**2
                (MiB, given that NVML reports memory in bytes), and
            'util': 100 minus the GPU utilization value reported by NVML,
                i.e. the share of compute that is idle.

    Raises:
        pynvml.NVMLError: if NVML initialisation or any device query fails.
    """
    usage = {}
    pynvml.nvmlInit()
    try:
        for index in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            usage[f"g{index}"] = {
                'free': mem_info.free / 1024**2,
                'util': 100 - rates.gpu,
            }
    finally:
        # Release NVML even when a device query raises, so a failed probe
        # does not leave the library initialised.
        pynvml.nvmlShutdown()
    return usage
def getBanlanceRate(memory, gpu):
    """Fold a memory figure and a GPU figure into one integer score.

    Args:
        memory: numeric memory figure; it is divided by 100 before being
            combined.
        gpu: numeric GPU figure, added as-is.

    Returns:
        int: ``memory / 100 + gpu`` truncated toward zero.
    """
    weighted = memory / 100 + gpu
    return int(weighted)
def zsetData(**kwargs):
    """Compute one balance score per key from the GPUs bound to it.

    Reads the module-level ``gpuUsageInfo`` dict (assigned in the
    ``__main__`` block before this function is called) rather than taking
    it as an argument.

    Args:
        **kwargs: each keyword maps to an iterable of device ids that are
            keys of ``gpuUsageInfo`` (e.g. ``'g0'``).

    Returns:
        dict: keyword -> ``getBanlanceRate(total_free, total_util)``, where
        the totals are the sums of the 'free' and 'util' entries of the
        listed devices.

    Raises:
        KeyError: if a listed device id is not present in ``gpuUsageInfo``.
    """
    scores = {}
    for port, device_ids in kwargs.items():
        free_total = 0
        util_total = 0
        for device_id in device_ids:
            free_total += gpuUsageInfo[device_id]['free']
            util_total += gpuUsageInfo[device_id]['util']
        scores[port] = getBanlanceRate(free_total, util_total)
    return scores
if __name__ == '__main__':
    # Address prefix used to build the sorted-set member names
    # ("<local>:<port>").
    local = "172.20.1.3"
    # NOTE(review): the Redis host and password are hard-coded in source;
    # consider loading them from the environment or a config file.
    pool = redis.ConnectionPool(host="60.204.148.84", port=6379, password='xm!redis123', db=15)
    r = redis.Redis(connection_pool=pool)
    # zsetData() reads this module-level name, so it has to be assigned
    # before the call below.
    gpuUsageInfo = getGpuInfo()
    data = zsetData(**gpu)
    # Queue every score update on one transactional pipeline so all members
    # of the "danceai" sorted set are refreshed together in a single
    # MULTI/EXEC round trip instead of one request per port.
    with r.pipeline(transaction=True) as pipe:
        for key, score in data.items():
            pipe.zadd("danceai", {f"{local}:{key}": score})
        pipe.execute()
(smi) spwang@k8s-worker-003:~$ cat smi-v1.py
#!/usr/bin/env python
import pynvml
import redis
# Binding between large-model processes (identified by port) and GPUs:
# port: [gpu_device_id, ...]
gpu={'1001':['g0','g1'],'1002':['g3'],'1003':['g4']}
def getGpuInfo():
    """Take a per-GPU snapshot of memory and compute load through NVML.

    Returns:
        dict: maps "g<index>" (the NVML device index) to a dict with
            'used': used device memory as a percentage of total memory, and
            'util': the GPU utilization value reported by NVML.

    Raises:
        pynvml.NVMLError: if NVML initialisation or any device query fails.
    """
    usage = {}
    pynvml.nvmlInit()
    try:
        for index in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            usage[f"g{index}"] = {
                'used': (mem_info.used / mem_info.total) * 100,
                'util': rates.gpu,
            }
    finally:
        # Release NVML even when a device query raises, so a failed probe
        # does not leave the library initialised.
        pynvml.nvmlShutdown()
    return usage
def getBanlanceRate(memory, gpu):
    """Fold a memory figure and a GPU figure into one integer score.

    Args:
        memory: numeric memory figure.
        gpu: numeric GPU figure.

    Returns:
        int: ``memory + gpu`` truncated toward zero.
    """
    combined = memory + gpu
    return int(combined)
def zsetData(**kwargs):
    """Compute one balance score per key from the GPUs bound to it.

    Reads the module-level ``gpuUsageInfo`` dict (assigned in the
    ``__main__`` block before this function is called) rather than taking
    it as an argument.

    Args:
        **kwargs: each keyword maps to an iterable of device ids that are
            keys of ``gpuUsageInfo`` (e.g. ``'g0'``).

    Returns:
        dict: keyword -> ``getBanlanceRate(total_used, total_util)``, where
        the totals are the sums of the 'used' and 'util' entries of the
        listed devices.

    Raises:
        KeyError: if a listed device id is not present in ``gpuUsageInfo``.
    """
    scores = {}
    for port, device_ids in kwargs.items():
        used_total = 0
        util_total = 0
        for device_id in device_ids:
            used_total += gpuUsageInfo[device_id]['used']
            util_total += gpuUsageInfo[device_id]['util']
        scores[port] = getBanlanceRate(used_total, util_total)
    return scores
if __name__ == '__main__':
    # Address prefix used to build the sorted-set member names
    # ("<local>:<port>").
    local = "172.20.1.3"
    # NOTE(review): the Redis host and password are hard-coded in source;
    # consider loading them from the environment or a config file.
    pool = redis.ConnectionPool(host="60.204.148.84", port=6379, password='xm!redis123', db=15)
    r = redis.Redis(connection_pool=pool)
    # zsetData() reads this module-level name, so it has to be assigned
    # before the call below.
    gpuUsageInfo = getGpuInfo()
    data = zsetData(**gpu)
    # Queue every score update on one transactional pipeline so all members
    # of the "danceai" sorted set are refreshed together in a single
    # MULTI/EXEC round trip instead of one request per port.
    with r.pipeline(transaction=True) as pipe:
        for key, score in data.items():
            pipe.zadd("danceai", {f"{local}:{key}": score})
        pipe.execute()