cmsmanage/cmsmanage/views.py
Adam Goldsmith 0e486babb7
All checks were successful
Ruff / ruff (push) Successful in 1m14s
Test / test (push) Successful in 4m57s
Use header to expose healthcheck status instead of status code
This is mostly to prevent it sending me emails on every GET of the
page when erroring
2024-08-29 21:35:12 -04:00

93 lines
2.7 KiB
Python

import dataclasses
import datetime
from collections.abc import Iterable
from django.http import HttpRequest, HttpResponse
from django.utils import timezone
from django_q.conf import Conf
from django_q.models import Schedule, Task
from django_q.status import Stat
@dataclasses.dataclass
class CheckResult:
    """Outcome of a single health check."""

    # Human-readable description of what the check found.
    message: str
    # True when the check passed, False when it detected a problem.
    ok: bool
@dataclasses.dataclass
class CheckResultOk(CheckResult):
    """A passing check result; ``ok`` defaults to ``True``.

    ``message`` is inherited from ``CheckResult`` and remains the first,
    required field, so ``CheckResultOk("...")`` builds a passing result.
    """

    # Only the default needs overriding; the field keeps its inherited position.
    ok: bool = True
@dataclasses.dataclass
class CheckResultFailure(CheckResult):
    """A failing check result; ``ok`` defaults to ``False``.

    ``message`` is inherited from ``CheckResult`` and remains the first,
    required field, so ``CheckResultFailure("...")`` builds a failing result.
    """

    # Only the default needs overriding; the field keeps its inherited position.
    ok: bool = False
def _check_clusters() -> CheckResult:
    """Report whether every cluster returned by ``Stat.get_all()`` is up.

    A cluster counts as up when its status is ``Conf.IDLE`` or
    ``Conf.WORKING``.

    Returns:
        A failure when there are no clusters at all or when at least one
        cluster is in another state; an OK result otherwise.
    """
    all_clusters = Stat.get_all()
    total = len(all_clusters)
    if total == 0:
        return CheckResultFailure("No clusters running!")

    healthy = sum(
        1 for cluster in all_clusters if cluster.status in (Conf.IDLE, Conf.WORKING)
    )
    # Same summary text is used for both outcomes; only the ok flag differs.
    summary = f"{healthy}/{total} clusters up"
    if healthy != total:
        return CheckResultFailure(summary)
    return CheckResultOk(summary)
def _check_task(now: datetime.datetime, schedule: Schedule) -> CheckResult:
    """Evaluate the health of one schedule based on its most recent task.

    Args:
        now: Reference time used for the overdue/staleness comparisons.
        schedule: The schedule to inspect; ``schedule.task`` is used as the
            id of its last ``Task``.

    Returns:
        A failure when the schedule has never run, when its last task is
        missing and the next run is more than 5 minutes overdue, when the
        last task was unsuccessful, or when the next run is more than
        2 hours overdue; an OK result otherwise.
    """
    # NOTE(review): the comparisons below assume schedule.next_run is not
    # None -- confirm against the Schedule model.
    if not schedule.task:
        return CheckResultFailure(f"Scheduled task {schedule} has never run!")

    try:
        last_task = Task.objects.get(id=schedule.task)
    except Task.DoesNotExist:
        # No task record: tolerate it only shortly after the scheduled time.
        overdue_by = now - schedule.next_run
        if overdue_by > datetime.timedelta(minutes=5):
            return CheckResultFailure(
                f"Scheduled task {schedule}'s last task doesn't exist, and is probably not still running!"
            )
        return CheckResultOk(
            f"Schedule {schedule} has no task, but probably running now"
        )

    if not last_task.success:
        return CheckResultFailure(
            f"Scheduled task {schedule} failed at {last_task.started}"
        )

    if now - schedule.next_run > datetime.timedelta(hours=2):
        return CheckResultFailure(
            f"Scheduled task {schedule} stale, last run at {last_task.started}"
        )

    return CheckResultOk(
        f"Scheduled task {schedule} ok, last run at {last_task.started}"
    )
def _check_tasks() -> Iterable[CheckResult]:
    """Lazily yield one ``CheckResult`` per ``Schedule`` row.

    The current time is sampled once, when iteration starts, so every
    schedule is judged against the same instant.
    """
    reference_time = timezone.now()
    yield from (
        _check_task(reference_time, entry) for entry in Schedule.objects.all()
    )
def healthcheck(request: HttpRequest) -> HttpResponse:
    """Run all health checks and report the combined result as plain text.

    The overall status is exposed both as the body prefix and in the
    ``X-CMSManage-Status`` response header. No explicit HTTP status code is
    set, so monitoring clients should read the header (or body prefix) to
    distinguish a healthy from a failing system.

    Args:
        request: The incoming request (unused).

    Returns:
        A ``text/plain`` response whose body is ``"OK: "`` or
        ``"CRITICAL: "`` followed by one message per check, failing checks
        first, and whose ``X-CMSManage-Status`` header carries the same
        ``OK``/``CRITICAL`` status.
    """
    checks: list[CheckResult] = [_check_clusters(), *_check_tasks()]
    all_ok = all(check.ok for check in checks)
    # Single source of truth so the header can never disagree with the body
    # (the header was previously hard-coded to "OK").
    status = "OK" if all_ok else "CRITICAL"
    # False sorts before True, so failing checks are listed first; the sort
    # is stable, so checks keep their relative order within each group.
    messages = (check.message for check in sorted(checks, key=lambda c: c.ok))
    return HttpResponse(
        f"{status}: " + "\n".join(messages),
        content_type="text/plain",
        headers={"X-CMSManage-Status": status},
    )