Replace WAITING with PENDING
MathieuMoalic committed Mar 4, 2024
1 parent 11f26ef commit e45a2e1
Showing 7 changed files with 30 additions and 35 deletions.
2 changes: 1 addition & 1 deletion manager/manager/components/run_job.py
@@ -14,7 +14,7 @@ def __init__(self) -> None:

def find_gpu(self, partition):
gpu = Gpu.objects.filter(
status="WAITING",
status="PENDING",
# speed=partition #TEMPORARY DISABLED
).first()
if gpu is None:
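The scheduler's `find_gpu` now looks for idle GPUs under the new name: a GPU is schedulable when its status is PENDING. A minimal sketch of the selection logic in isolation (the import path and the re-enabled partition filter are assumptions, not part of this commit):

```python
from typing import Optional

from manager.models import Gpu  # assumed import path for the manager app


def find_idle_gpu(partition: Optional[str] = None) -> Optional[Gpu]:
    # A GPU is considered schedulable when its status is PENDING.
    qs = Gpu.objects.filter(status="PENDING")
    if partition is not None:
        # The speed/partition filter is commented out in the actual code
        # ("TEMPORARY DISABLED"); shown here only to illustrate the intent.
        qs = qs.filter(speed=partition)
    return qs.first()  # None when every GPU is busy or unavailable
```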
29 changes: 14 additions & 15 deletions manager/manager/models.py
@@ -6,12 +6,10 @@
from django.db.models.signals import post_save
from django.dispatch import receiver
from django.utils import timezone
from django.utils.translation import gettext_lazy as _


class Node(models.Model):
class NodeStatus(Enum):
WAITING = "WAITING"
PENDING = "PENDING"
RESERVED = "RESERVED"
UNAVAILABLE = "UNAVAILABLE"
@@ -26,7 +24,7 @@ class ConnectionStatus(Enum):
status = models.CharField(
max_length=50,
choices=[(choice.name, choice.value) for choice in NodeStatus],
default=NodeStatus.WAITING.name,
default=NodeStatus.PENDING.name,
)
connection_status = models.CharField(
max_length=50,
@@ -40,16 +38,16 @@ def __str__(self):


class Gpu(models.Model):
class GPUStatus(models.TextChoices):
WAITING = "WAITING", _("WAITING")
PENDING = "PENDING", _("PENDING")
RESERVED = "RESERVED", _("RESERVED")
UNAVAILABLE = "UNAVAILABLE", _("UNAVAILABLE")
class GPUStatus(Enum):
RUNNING = "RUNNING"
PENDING = "PENDING"
RESERVED = "RESERVED" # not implemented
UNAVAILABLE = "UNAVAILABLE" # High usage not from job or error

class GPUSpeed(models.TextChoices):
SLOW = "SLOW", _("SLOW")
NORMAL = "NORMAL", _("NORMAL")
FAST = "FAST", _("FAST")
class GPUSpeed(Enum):
SLOW = "SLOW"
NORMAL = "NORMAL"
FAST = "FAST"

device_id = models.PositiveSmallIntegerField()
uuid = models.UUIDField(unique=True)
@@ -65,7 +63,7 @@ class GPUSpeed(models.TextChoices):
status = models.CharField(
max_length=50,
choices=[(choice.name, choice.value) for choice in GPUStatus],
default=GPUStatus.WAITING.name,
default=GPUStatus.PENDING.name,
)
last_update = models.DateTimeField(default=timezone.now)

@@ -80,7 +78,6 @@ class JobPriority(Enum):
HIGH = "HIGH"

class JobStatus(Enum):
WAITING = "WAITING"
PENDING = "PENDING"
FINISHED = "FINISHED"
INTERRUPTED = "INTERRUPTED"
@@ -89,6 +86,8 @@ class GPUPartition(Enum):
SLOW = "SLOW"
NORMAL = "NORMAL"
FAST = "FAST"
# This next field is only to remove the enum conflict with the GPU speed
UNDEF = "UNDEF"

path = models.CharField(max_length=500)
port = models.PositiveIntegerField(null=True, blank=True)
@@ -110,7 +109,7 @@ class GPUPartition(Enum):
status = models.CharField(
max_length=50,
choices=[(choice.name, choice.value) for choice in JobStatus],
default=JobStatus.WAITING.name,
default=JobStatus.PENDING.name,
)
node = models.ForeignKey(
Node, on_delete=models.SET_NULL, null=True, blank=True, related_name="node"
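The model-level status enums move from `models.TextChoices` (with `gettext_lazy` labels) to plain `Enum` classes, and WAITING disappears in favour of the already-existing PENDING. A minimal, self-contained sketch of the resulting pattern (field names mirror the diff; everything else is illustrative):

```python
from enum import Enum

from django.db import models


class Gpu(models.Model):
    class GPUStatus(Enum):
        RUNNING = "RUNNING"
        PENDING = "PENDING"
        RESERVED = "RESERVED"        # not implemented
        UNAVAILABLE = "UNAVAILABLE"  # high usage not caused by a job, or an error

    # The comprehension expands to [("RUNNING", "RUNNING"), ("PENDING", "PENDING"), ...],
    # i.e. the same (value, label) pairs TextChoices used to provide, minus translation.
    status = models.CharField(
        max_length=50,
        choices=[(choice.name, choice.value) for choice in GPUStatus],
        default=GPUStatus.PENDING.name,
    )
```

The extra UNDEF member on Job.GPUPartition seems to exist, per the in-code comment, only so that GPUPartition and GPUSpeed are not two enums with identical members, which schema generators such as drf-spectacular would otherwise merge or flag as a naming collision.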
10 changes: 0 additions & 10 deletions manager/manager/serializers.py
@@ -22,16 +22,6 @@ class RefreshNodeSerializer(serializers.Serializer):


class GpusSerializer(serializers.ModelSerializer):
speed = serializers.ChoiceField(
choices=Gpu.GPUSpeed.choices,
default=Gpu.GPUSpeed.NORMAL,
help_text="The speed of the GPU.",
)
status = serializers.ChoiceField(
choices=Gpu.GPUStatus.choices,
default=Gpu.GPUStatus.WAITING,
help_text="The current status of the GPU.",
)
node = serializers.PrimaryKeyRelatedField(
queryset=Node.objects.all(), help_text="The associated node ID."
)
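With the explicit `speed` and `status` ChoiceFields deleted, `GpusSerializer` lets `ModelSerializer` derive those fields (and their choices) straight from the `Gpu` model. Roughly, the serializer reduces to the sketch below (the `Meta` block and import path are outside this hunk and therefore assumed):

```python
from rest_framework import serializers

from manager.models import Gpu, Node  # assumed import path


class GpusSerializer(serializers.ModelSerializer):
    # speed/status no longer need explicit ChoiceFields: ModelSerializer
    # builds them from the CharField choices declared on the Gpu model,
    # so the enum lives in exactly one place.
    node = serializers.PrimaryKeyRelatedField(
        queryset=Node.objects.all(), help_text="The associated node ID."
    )

    class Meta:  # assumed, not shown in the diff
        model = Gpu
        fields = "__all__"
```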
2 changes: 1 addition & 1 deletion manager/manager/tests/test_job_management.py
@@ -44,7 +44,7 @@ def test_login_and_create_jobs(self):
"priority": random.choice(["LOW", "NORMAL", "HIGH"]),
"gpu_partition": random.choice(["SLOW", "NORMAL", "FAST"]),
"duration": random.randint(1, 100),
"status": "WAITING",
"status": "PENDING",
}
response = self.client.post(self.job_url, job_data, format="json")
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
14 changes: 11 additions & 3 deletions manager/manager/views.py
@@ -6,9 +6,11 @@

from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from drf_spectacular.utils import extend_schema, inline_serializer
from rest_framework import permissions, status, viewsets
from rest_framework.decorators import action
from rest_framework.exceptions import ValidationError
from rest_framework.fields import CharField
from rest_framework.response import Response

from .components.run_job import RunJob
@@ -64,7 +66,7 @@ def create(self, request, *_args, **_kwargs):
def start(self, _request, pk=None):
try:
job = self.get_object()
gpu = Gpu.objects.filter(status="WAITING").first()
gpu = Gpu.objects.filter(status="PENDING").first()
if not gpu:
return Response(
{"error": "Gpu unavalible."}, status=status.HTTP_400_BAD_REQUEST
@@ -82,8 +84,14 @@ def start(self, _request, pk=None):
{"error": "Job not found."}, status=status.HTTP_404_NOT_FOUND
)

@action(detail=True, methods=["get"], url_path="output")
def job_output(self, *_args, **_kwargs):
@extend_schema(
responses=inline_serializer(
name="output",
fields={"output": CharField()},
),
)
@action(detail=True, methods=["get"])
def output(self, *_args, **_kwargs):
try:
job = self.get_object()
return Response({"output": job.output}, status=status.HTTP_200_OK)
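Renaming `job_output` to `output` and dropping `url_path="output"` keeps the same route, because DRF defaults `url_path` to the action's method name; the new `extend_schema` block additionally documents the response shape for drf-spectacular. A hedged usage sketch (the `/api/jobs/` prefix, an existing job with pk=1, and open permissions are assumptions):

```python
from rest_framework.test import APIClient

client = APIClient()
# Route is still .../jobs/<pk>/output/ even though url_path was removed,
# since DRF uses the action's method name ("output") by default.
response = client.get("/api/jobs/1/output/")
assert response.status_code == 200
# Shape documented by the inline_serializer: a single "output" string field.
assert "output" in response.json()
```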
4 changes: 2 additions & 2 deletions node/amuman_node/gpu_monitor.py
@@ -17,7 +17,7 @@ class GPU:
uuid: str = field(default="")
gpu_util: int = field(default=0)
mem_util: int = field(default=0)
status: str = field(default="Waiting") # Use default value from GPUStatus
status: str = field(default="PENDING") # Use default value from GPUStatus
is_running_amumax: bool = field(default=False)
refresh_time: datetime = field(
default_factory=lambda: datetime.now()
@@ -63,7 +63,7 @@ def query_nvidia_smi(self, query: str) -> str:

def get_gpu_load_status(self, threshold: int = 30) -> str:
if self.gpu_util < threshold and self.mem_util < threshold:
status = "WAITING"
status = "PENDING"
else:
status = "UNAVAILABLE"
log.debug(f"GPU {self.device_id} status: {status}")
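On the node side, the monitor now labels an idle GPU as PENDING instead of WAITING. The classification itself is a simple threshold check; a standalone sketch mirroring `get_gpu_load_status` (the 30% figure is the method's default threshold):

```python
def classify_gpu_load(gpu_util: int, mem_util: int, threshold: int = 30) -> str:
    # Mirrors GPU.get_gpu_load_status(): a GPU only counts as idle (PENDING)
    # when both compute and memory utilisation are below the threshold.
    if gpu_util < threshold and mem_util < threshold:
        return "PENDING"
    return "UNAVAILABLE"


# 10% compute / 5% memory -> idle, schedulable
assert classify_gpu_load(10, 5) == "PENDING"
# 80% compute             -> busy outside of a managed job
assert classify_gpu_load(80, 5) == "UNAVAILABLE"
```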
4 changes: 1 addition & 3 deletions node/amuman_node/job.py
@@ -1,5 +1,4 @@
import asyncio
import json
import logging
from dataclasses import asdict, dataclass, field
from enum import Enum
@@ -15,7 +14,6 @@ class JobPriority(Enum):


class JobStatus(Enum):
WAITING = "WAITING"
PENDING = "PENDING"
FINISHED = "FINISHED"
INTERRUPTED = "INTERRUPTED"
@@ -43,7 +41,7 @@ class Job:
priority: JobPriority = JobPriority.NORMAL
gpu_partition: GPUPartition = GPUPartition.NORMAL
duration: int = 1
status: JobStatus = JobStatus.WAITING
status: JobStatus = JobStatus.PENDING
output: Optional[str] = None
error: Optional[str] = None
flags: Optional[str] = None
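After this change the node-side `JobStatus` enum no longer contains WAITING at all, so a freshly created `Job` starts out as PENDING and any stored payload still carrying the old status would fail to deserialize. A quick sketch of the consequence (module path taken from node/amuman_node/job.py):

```python
from amuman_node.job import JobStatus

assert JobStatus("PENDING") is JobStatus.PENDING  # still valid
try:
    JobStatus("WAITING")
except ValueError:
    # WAITING was removed from the enum, so old payloads using it must be
    # migrated (or re-created) before the node will accept them.
    print("WAITING is no longer a valid job status")
```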
