diff --git a/manager/manager/components/run_job.py b/manager/manager/components/run_job.py index 465812c..f5301f2 100644 --- a/manager/manager/components/run_job.py +++ b/manager/manager/components/run_job.py @@ -14,7 +14,7 @@ def __init__(self) -> None: def find_gpu(self, partition): gpu = Gpu.objects.filter( - status="WAITING", + status="PENDING", # speed=partition #TEMPORARY DISABLED ).first() if gpu is None: diff --git a/manager/manager/models.py b/manager/manager/models.py index bca942d..1bf4659 100644 --- a/manager/manager/models.py +++ b/manager/manager/models.py @@ -6,12 +6,10 @@ from django.db.models.signals import post_save from django.dispatch import receiver from django.utils import timezone -from django.utils.translation import gettext_lazy as _ class Node(models.Model): class NodeStatus(Enum): - WAITING = "WAITING" PENDING = "PENDING" RESERVED = "RESERVED" UNAVAILABLE = "UNAVAILABLE" @@ -26,7 +24,7 @@ class ConnectionStatus(Enum): status = models.CharField( max_length=50, choices=[(choice.name, choice.value) for choice in NodeStatus], - default=NodeStatus.WAITING.name, + default=NodeStatus.PENDING.name, ) connection_status = models.CharField( max_length=50, @@ -40,16 +38,16 @@ def __str__(self): class Gpu(models.Model): - class GPUStatus(models.TextChoices): - WAITING = "WAITING", _("WAITING") - PENDING = "PENDING", _("PENDING") - RESERVED = "RESERVED", _("RESERVED") - UNAVAILABLE = "UNAVAILABLE", _("UNAVAILABLE") + class GPUStatus(Enum): + RUNNING = "RUNNING" + PENDING = "PENDING" + RESERVED = "RESERVED" # not implemented + UNAVAILABLE = "UNAVAILABLE" # High usage not from job or error - class GPUSpeed(models.TextChoices): - SLOW = "SLOW", _("SLOW") - NORMAL = "NORMAL", _("NORMAL") - FAST = "FAST", _("FAST") + class GPUSpeed(Enum): + SLOW = "SLOW" + NORMAL = "NORMAL" + FAST = "FAST" device_id = models.PositiveSmallIntegerField() uuid = models.UUIDField(unique=True) @@ -65,7 +63,7 @@ class GPUSpeed(models.TextChoices): status = models.CharField( max_length=50, choices=[(choice.name, choice.value) for choice in GPUStatus], - default=GPUStatus.WAITING.name, + default=GPUStatus.PENDING.name, ) last_update = models.DateTimeField(default=timezone.now) @@ -80,7 +78,6 @@ class JobPriority(Enum): HIGH = "HIGH" class JobStatus(Enum): - WAITING = "WAITING" PENDING = "PENDING" FINISHED = "FINISHED" INTERRUPTED = "INTERRUPTED" @@ -89,6 +86,8 @@ class GPUPartition(Enum): SLOW = "SLOW" NORMAL = "NORMAL" FAST = "FAST" + # This next field is only to remove the enum conflict with the GPU speed + UNDEF = "UNDEF" path = models.CharField(max_length=500) port = models.PositiveIntegerField(null=True, blank=True) @@ -110,7 +109,7 @@ class GPUPartition(Enum): status = models.CharField( max_length=50, choices=[(choice.name, choice.value) for choice in JobStatus], - default=JobStatus.WAITING.name, + default=JobStatus.PENDING.name, ) node = models.ForeignKey( Node, on_delete=models.SET_NULL, null=True, blank=True, related_name="node" diff --git a/manager/manager/serializers.py b/manager/manager/serializers.py index 2b68899..e63f466 100644 --- a/manager/manager/serializers.py +++ b/manager/manager/serializers.py @@ -22,16 +22,6 @@ class RefreshNodeSerializer(serializers.Serializer): class GpusSerializer(serializers.ModelSerializer): - speed = serializers.ChoiceField( - choices=Gpu.GPUSpeed.choices, - default=Gpu.GPUSpeed.NORMAL, - help_text="The speed of the GPU.", - ) - status = serializers.ChoiceField( - choices=Gpu.GPUStatus.choices, - default=Gpu.GPUStatus.WAITING, - help_text="The current status of the GPU.", - ) node = serializers.PrimaryKeyRelatedField( queryset=Node.objects.all(), help_text="The associated node ID." ) diff --git a/manager/manager/tests/test_job_management.py b/manager/manager/tests/test_job_management.py index c575fd2..a8a60cc 100644 --- a/manager/manager/tests/test_job_management.py +++ b/manager/manager/tests/test_job_management.py @@ -44,7 +44,7 @@ def test_login_and_create_jobs(self): "priority": random.choice(["LOW", "NORMAL", "HIGH"]), "gpu_partition": random.choice(["SLOW", "NORMAL", "FAST"]), "duration": random.randint(1, 100), - "status": "WAITING", + "status": "PENDING", } response = self.client.post(self.job_url, job_data, format="json") self.assertEqual(response.status_code, status.HTTP_201_CREATED) diff --git a/manager/manager/views.py b/manager/manager/views.py index 81d39a6..9126d2a 100644 --- a/manager/manager/views.py +++ b/manager/manager/views.py @@ -6,9 +6,11 @@ from asgiref.sync import async_to_sync from channels.layers import get_channel_layer +from drf_spectacular.utils import extend_schema, inline_serializer from rest_framework import permissions, status, viewsets from rest_framework.decorators import action from rest_framework.exceptions import ValidationError +from rest_framework.fields import CharField from rest_framework.response import Response from .components.run_job import RunJob @@ -64,7 +66,7 @@ def create(self, request, *_args, **_kwargs): def start(self, _request, pk=None): try: job = self.get_object() - gpu = Gpu.objects.filter(status="WAITING").first() + gpu = Gpu.objects.filter(status="PENDING").first() if not gpu: return Response( {"error": "Gpu unavalible."}, status=status.HTTP_400_BAD_REQUEST @@ -82,8 +84,14 @@ def start(self, _request, pk=None): {"error": "Job not found."}, status=status.HTTP_404_NOT_FOUND ) - @action(detail=True, methods=["get"], url_path="output") - def job_output(self, *_args, **_kwargs): + @extend_schema( + responses=inline_serializer( + name="output", + fields={"output": CharField()}, + ), + ) + @action(detail=True, methods=["get"]) + def output(self, *_args, **_kwargs): try: job = self.get_object() return Response({"output": job.output}, status=status.HTTP_200_OK) diff --git a/node/amuman_node/gpu_monitor.py b/node/amuman_node/gpu_monitor.py index 1592177..b8fdaf5 100644 --- a/node/amuman_node/gpu_monitor.py +++ b/node/amuman_node/gpu_monitor.py @@ -17,7 +17,7 @@ class GPU: uuid: str = field(default="") gpu_util: int = field(default=0) mem_util: int = field(default=0) - status: str = field(default="Waiting") # Use default value from GPUStatus + status: str = field(default="PENDING") # Use default value from GPUStatus is_running_amumax: bool = field(default=False) refresh_time: datetime = field( default_factory=lambda: datetime.now() @@ -63,7 +63,7 @@ def query_nvidia_smi(self, query: str) -> str: def get_gpu_load_status(self, threshold: int = 30) -> str: if self.gpu_util < threshold and self.mem_util < threshold: - status = "WAITING" + status = "PENDING" else: status = "UNAVAILABLE" log.debug(f"GPU {self.device_id} status: {status}") diff --git a/node/amuman_node/job.py b/node/amuman_node/job.py index dbf95f6..5ada9b6 100644 --- a/node/amuman_node/job.py +++ b/node/amuman_node/job.py @@ -1,5 +1,4 @@ import asyncio -import json import logging from dataclasses import asdict, dataclass, field from enum import Enum @@ -15,7 +14,6 @@ class JobPriority(Enum): class JobStatus(Enum): - WAITING = "WAITING" PENDING = "PENDING" FINISHED = "FINISHED" INTERRUPTED = "INTERRUPTED" @@ -43,7 +41,7 @@ class Job: priority: JobPriority = JobPriority.NORMAL gpu_partition: GPUPartition = GPUPartition.NORMAL duration: int = 1 - status: JobStatus = JobStatus.WAITING + status: JobStatus = JobStatus.PENDING output: Optional[str] = None error: Optional[str] = None flags: Optional[str] = None