"""Keep under-utilized GPUs busy with a synthetic matrix workload.

Each monitored GPU is polled once per second via nvidia-smi.  When the
average utilization over the last 30 samples drops below 90%, a worker
thread is started that repeatedly multiplies large random matrices on
that GPU to occupy it.
"""

import subprocess
import threading
import time
from collections import deque

import torch

# Samples (1 per second) averaged before making a start/keep decision.
WINDOW_SIZE = 30
# Average utilization (%) at or above which a GPU counts as busy.
BUSY_THRESHOLD = 90
# Square matrix side for the occupying matmul (~12 GB per fp32 tensor —
# sized for large-memory GPUs; adjust for smaller cards).
MATRIX_SIZE = 55000


def get_gpu_details(gpu_id):
    """Return (utilization %, used memory MiB, total memory MiB) for one GPU.

    Raises subprocess.CalledProcessError if nvidia-smi exits non-zero,
    instead of failing later with a confusing unpack error on empty output.
    """
    cmd = [
        'nvidia-smi',
        '--id=' + str(gpu_id),
        '--query-gpu=utilization.gpu,memory.used,memory.total',
        '--format=csv,noheader,nounits',
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    utilization, used_memory, total_memory = result.stdout.strip().split(', ')
    return int(utilization), int(used_memory), int(total_memory)


def matrix_calculation_task(gpu_id, stop_event, task_running):
    """Occupy *gpu_id* with repeated large matmuls until *stop_event* is set.

    Keeps task_running[gpu_id] True while active and resets it on exit so
    the monitor thread can tell whether an occupying task is alive.
    """
    torch.cuda.set_device(gpu_id)
    task_running[gpu_id] = True
    try:
        while not stop_event.is_set():
            # 'cuda' resolves to the device selected by set_device above.
            a = torch.rand(MATRIX_SIZE, MATRIX_SIZE, device='cuda')
            b = torch.rand(MATRIX_SIZE, MATRIX_SIZE, device='cuda')
            torch.matmul(a, b)  # result discarded; the load is the point
    finally:
        # Reset even if the loop dies (e.g. CUDA OOM), so the monitor
        # does not believe a dead task is still running.
        task_running[gpu_id] = False


def monitor_and_manage_gpu(gpu_id, stop_event, task_running):
    """Poll *gpu_id* once per second and start an occupying task when idle.

    Decisions use the average of the last WINDOW_SIZE samples; once the
    window is full it stays full, so the decision is re-evaluated every
    second (not once per 30 s).  NOTE(review): stop_event is never set
    anywhere — once started, the occupying task runs until the process
    exits; presumably intentional, confirm.
    """
    utilization_data = deque(maxlen=WINDOW_SIZE)  # sliding 30 s window
    while True:
        utilization, _, _ = get_gpu_details(gpu_id)
        utilization_data.append(utilization)
        if len(utilization_data) == WINDOW_SIZE:  # window warmed up
            avg_utilization = round(sum(utilization_data) / len(utilization_data), 1)
            if avg_utilization < BUSY_THRESHOLD and not task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is underutilized, starting task.")
                stop_event.clear()
                # Set the flag here, before spawning, to close the race
                # where a second task could be launched on the next 1-s
                # poll before the worker thread sets the flag itself.
                task_running[gpu_id] = True
                threading.Thread(
                    target=matrix_calculation_task,
                    args=(gpu_id, stop_event, task_running),
                ).start()
            elif avg_utilization >= BUSY_THRESHOLD and task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal, keep running.")
            elif task_running[gpu_id]:
                print(f"Occupying task just starts, and average GPU {gpu_id} ({avg_utilization}%) is increasing, keep monitoring.")
            else:
                print(f"No occupying task running, but average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal.")
        time.sleep(1)  # sample once per second


num_gpus = 8
stop_events = [threading.Event() for _ in range(num_gpus)]
task_running = [False] * num_gpus

if __name__ == "__main__":
    # NOTE(review): starts at gpu_id 1, leaving GPU 0 unmanaged —
    # presumably reserved for other work; confirm this is intended.
    for gpu_id in range(1, num_gpus):
        threading.Thread(
            target=monitor_and_manage_gpu,
            args=(gpu_id, stop_events[gpu_id], task_running),
        ).start()