"""HTTP health check for django_q clusters and scheduled tasks.

The ``healthcheck`` view collects one ``CheckResult`` for the cluster state
and one per ``Schedule`` row, then renders them as a plain-text report whose
HTTP status reflects whether every check passed.
"""

import dataclasses
import datetime
from collections.abc import Iterable

from django.http import HttpRequest, HttpResponse
from django.utils import timezone
from django_q.conf import Conf
from django_q.models import Schedule, Task
from django_q.status import Stat

# How far past ``next_run`` a schedule may be while its task record is missing
# before it is no longer assumed to be "running right now".
_RUNNING_GRACE = datetime.timedelta(minutes=5)

# How far past ``next_run`` a schedule may be before it is reported as stale.
_STALE_AFTER = datetime.timedelta(hours=2)


@dataclasses.dataclass
class CheckResult:
    """Outcome of a single health check."""

    # Human-readable line included in the health report.
    message: str
    # True when the check passed.
    ok: bool


@dataclasses.dataclass
class CheckResultOk(CheckResult):
    """A passing check; ``ok`` defaults to ``True``."""

    message: str
    ok: bool = True


@dataclasses.dataclass
class CheckResultFailure(CheckResult):
    """A failing check; ``ok`` defaults to ``False``."""

    message: str
    ok: bool = False


def _check_clusters() -> CheckResult:
    """Report whether every known cluster is idle or working.

    Returns:
        A failure when ``Stat.get_all()`` yields no clusters or when at least
        one cluster has a status other than ``Conf.IDLE`` / ``Conf.WORKING``;
        otherwise an OK result.
    """
    clusters = Stat.get_all()
    if not clusters:
        return CheckResultFailure("No clusters running!")

    healthy_states = (Conf.IDLE, Conf.WORKING)
    happy_clusters = [
        cluster for cluster in clusters if cluster.status in healthy_states
    ]
    summary = f"{len(happy_clusters)}/{len(clusters)} clusters up"
    if len(happy_clusters) != len(clusters):
        return CheckResultFailure(summary)
    return CheckResultOk(summary)


def _check_task(now: datetime.datetime, schedule: Schedule) -> CheckResult:
    """Evaluate the health of one schedule from its most recent task.

    Args:
        now: Reference time that ``schedule.next_run`` is compared against.
        schedule: The schedule to inspect.

    Returns:
        A failure when the schedule has never run, when its last task record
        is missing and the schedule is more than five minutes overdue, when
        the last task did not succeed, when the schedule is more than two
        hours overdue, or when ``next_run`` is unset so lateness cannot be
        judged. Otherwise an OK result.
    """
    if not schedule.task:
        return CheckResultFailure(f"Scheduled task {schedule} has never run!")

    try:
        task = Task.objects.get(id=schedule.task)
    except Task.DoesNotExist:
        # ``next_run`` may be None (the django_q field is nullable). Subtracting
        # it would raise TypeError and take the whole endpoint down, so a
        # missing value is treated like an overdue schedule.
        next_run = schedule.next_run
        if next_run is None or now - next_run > _RUNNING_GRACE:
            return CheckResultFailure(
                f"Scheduled task {schedule}'s last task doesn't exist, and is probably not still running!"
            )
        return CheckResultOk(
            f"Schedule {schedule} has no task, but probably running now"
        )

    if not task.success:
        return CheckResultFailure(
            f"Scheduled task {schedule} failed at {task.started}"
        )

    next_run = schedule.next_run
    if next_run is None:
        # Without a next run there is nothing to measure staleness against.
        # Report it as a failure so the condition is visible in the report
        # rather than crashing the request.
        return CheckResultFailure(
            f"Scheduled task {schedule} has no next run scheduled, last run at {task.started}"
        )
    if now - next_run > _STALE_AFTER:
        return CheckResultFailure(
            f"Scheduled task {schedule} stale, last run at {task.started}"
        )
    return CheckResultOk(
        f"Scheduled task {schedule} ok, last run at {task.started}"
    )


def _check_tasks() -> Iterable[CheckResult]:
    """Yield one check result per ``Schedule`` row.

    The current time is sampled once, when iteration starts, so every
    schedule is judged against the same instant.
    """
    now = timezone.now()
    for schedule in Schedule.objects.all():
        yield _check_task(now, schedule)


def healthcheck(request: HttpRequest) -> HttpResponse:
    """Render the combined cluster and schedule health as plain text.

    Args:
        request: The incoming request; it is not inspected.

    Returns:
        A ``text/plain`` response. When every check passes the body starts
        with ``"OK: "`` and the status is 200; otherwise it starts with
        ``"CRITICAL: "`` and the status is 500. Messages are newline-joined
        with failing checks listed first.
    """
    checks: list[CheckResult] = [_check_clusters(), *_check_tasks()]
    all_ok = all(check.ok for check in checks)

    # False sorts before True, so failures lead the report; the sort is
    # stable, which keeps the original order within each group.
    ordered = sorted(checks, key=lambda c: c.ok)
    body = "\n".join(check.message for check in ordered)

    return HttpResponse(
        ("OK: " if all_ok else "CRITICAL: ") + body,
        content_type="text/plain",
        status=200 if all_ok else 500,
    )