diff --git a/cmsmanage/urls.py b/cmsmanage/urls.py
index 2e98f6b..9750f47 100644
--- a/cmsmanage/urls.py
+++ b/cmsmanage/urls.py
@@ -25,6 +25,8 @@ from rest_framework import routers
 from membershipworks.api import router as membershipworks_router
 from paperwork.api import router as paperwork_router
 
+from . import views
+
 router = routers.DefaultRouter()
 router.registry.extend(paperwork_router.registry)
 router.registry.extend(membershipworks_router.registry)
@@ -59,6 +61,7 @@ urlpatterns = [
         ),
     ),
     path("api-auth/", include("rest_framework.urls")),
+    path("healthcheck", views.healthcheck),
     # path("markdownx/", include("markdownx.urls")),
 ]
 
diff --git a/cmsmanage/views.py b/cmsmanage/views.py
new file mode 100644
index 0000000..11a885e
--- /dev/null
+++ b/cmsmanage/views.py
@@ -0,0 +1,105 @@
+import dataclasses
+import datetime
+from collections.abc import Iterable
+
+from django.http import HttpRequest, HttpResponse
+from django.utils import timezone
+
+from django_q.conf import Conf
+from django_q.models import Schedule, Task
+from django_q.status import Stat
+
+# How far past its next_run a schedule may be before it is reported as stale.
+STALE_THRESHOLD = datetime.timedelta(hours=2)
+
+
+@dataclasses.dataclass
+class CheckResult:
+    """Outcome of a single health check: a readable message and a pass flag."""
+
+    message: str
+    ok: bool
+
+
+@dataclasses.dataclass
+class CheckResultOk(CheckResult):
+    """A passing check; ``ok`` defaults to True."""
+
+    ok: bool = True
+
+
+@dataclasses.dataclass
+class CheckResultFailure(CheckResult):
+    """A failing check; ``ok`` defaults to False."""
+
+    ok: bool = False
+
+
+def _check_clusters() -> CheckResult:
+    """Report whether every cluster returned by Stat.get_all() is idle or working."""
+    clusters = Stat.get_all()
+    if not clusters:
+        return CheckResultFailure("No clusters running!")
+
+    happy_clusters = [
+        cluster for cluster in clusters if cluster.status in (Conf.IDLE, Conf.WORKING)
+    ]
+    summary = f"{len(happy_clusters)}/{len(clusters)} clusters up"
+    if len(happy_clusters) != len(clusters):
+        return CheckResultFailure(summary)
+    return CheckResultOk(summary)
+
+
+def _check_tasks() -> Iterable[CheckResult]:
+    """Yield one result per schedule, judged by its most recent task."""
+    now = timezone.now()
+    for schedule in Schedule.objects.all():
+        if not schedule.task:
+            yield CheckResultFailure(f"Scheduled task {schedule} has never run!")
+            continue
+
+        try:
+            task = Task.objects.get(id=schedule.task)
+        except Task.DoesNotExist:
+            yield CheckResultFailure(
+                f"Scheduled task {schedule}'s last task doesn't exist!"
+            )
+            continue
+
+        if not task.success:
+            yield CheckResultFailure(
+                f"Scheduled task {schedule} failed at {task.started}"
+            )
+        elif (
+            # django-q declares next_run nullable; without this guard the
+            # subtraction raises TypeError and the whole endpoint errors out.
+            schedule.next_run is not None
+            and now - schedule.next_run > STALE_THRESHOLD
+        ):
+            yield CheckResultFailure(
+                f"Scheduled task {schedule} stale, last run at {task.started}"
+            )
+        else:
+            yield CheckResultOk(
+                f"Scheduled task {schedule} ok, last run at {task.started}"
+            )
+
+
+def healthcheck(request: HttpRequest) -> HttpResponse:
+    """Plain-text health endpoint: 200 when every check passes, else 500.
+
+    Failing checks are listed first so they are visible at the top of the
+    response body.
+    """
+    checks: list[CheckResult] = [_check_clusters(), *_check_tasks()]
+
+    all_ok = all(check.ok for check in checks)
+    # False sorts before True, so failures lead; sorted() is stable, so the
+    # original order is kept within each group.
+    messages = (check.message for check in sorted(checks, key=lambda c: c.ok))
+
+    return HttpResponse(
+        ("OK: " if all_ok else "CRITICAL: ") + "\n".join(messages),
+        content_type="text/plain",
+        status=200 if all_ok else 500,
+    )