2024-05-11 01:00:23 -04:00
|
|
|
import dataclasses
|
|
|
|
import datetime
|
|
|
|
from collections.abc import Iterable
|
|
|
|
|
|
|
|
from django.http import HttpRequest, HttpResponse
|
|
|
|
from django.utils import timezone
|
|
|
|
|
|
|
|
from django_q.conf import Conf
|
|
|
|
from django_q.models import Schedule, Task
|
|
|
|
from django_q.status import Stat
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass
class CheckResult:
    """Outcome of a single health check: a human-readable message plus a pass flag."""

    # Text describing what was checked and what was found.
    message: str
    # True when the check passed, False when it failed.
    ok: bool
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass
class CheckResultOk(CheckResult):
    """A ``CheckResult`` whose ``ok`` flag defaults to ``True``.

    ``message`` is inherited from ``CheckResult`` and remains the first,
    required field, so ``CheckResultOk("text")`` is enough to build one.
    """

    ok: bool = True
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass
class CheckResultFailure(CheckResult):
    """A ``CheckResult`` whose ``ok`` flag defaults to ``False``.

    ``message`` is inherited from ``CheckResult`` and remains the first,
    required field, so ``CheckResultFailure("text")`` is enough to build one.
    """

    ok: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
def _check_clusters() -> CheckResult:
    """Check that at least one cluster is reported and that all are idle or working.

    Returns:
        A failure when ``Stat.get_all()`` reports no clusters, or when any
        reported cluster has a status other than ``Conf.IDLE`` or
        ``Conf.WORKING``; otherwise an ok result. Apart from the
        "no clusters" case, the message is the ``up/total`` count.
    """
    all_clusters = Stat.get_all()
    total = len(all_clusters)
    if total == 0:
        return CheckResultFailure("No clusters running!")

    healthy_states = [Conf.IDLE, Conf.WORKING]
    up_count = sum(1 for stat in all_clusters if stat.status in healthy_states)

    # Both remaining outcomes report the same "up/total" text; only the flag differs.
    summary = f"{up_count}/{total} clusters up"
    if up_count != total:
        return CheckResultFailure(summary)
    return CheckResultOk(summary)
|
|
|
|
|
|
|
|
|
2024-05-13 21:43:40 -04:00
|
|
|
def _check_task(now: datetime.datetime, schedule: Schedule) -> CheckResult:
    """Judge the health of one schedule from its most recent task.

    Args:
        now: The reference time that ``schedule.next_run`` is compared against.
        schedule: The schedule to inspect.

    Returns:
        A failure when:
          * the schedule has no task id recorded,
          * the recorded task id has no ``Task`` row and ``next_run`` is more
            than 5 minutes before ``now``,
          * the task exists but its ``success`` flag is falsy, or
          * the task succeeded but ``next_run`` is more than 2 hours before
            ``now``.
        An ok result otherwise.
    """
    if not schedule.task:
        return CheckResultFailure(f"Scheduled task {schedule} has never run!")

    try:
        last_task = Task.objects.get(id=schedule.task)
    except Task.DoesNotExist:
        # No Task row for the recorded id: tolerate it only for a short
        # window after the schedule's next_run time.
        if now - schedule.next_run > datetime.timedelta(minutes=5):
            return CheckResultFailure(
                f"Scheduled task {schedule}'s last task doesn't exist, and is probably not still running!"
            )
        return CheckResultOk(
            f"Schedule {schedule} has no task, but probably running now"
        )

    if not last_task.success:
        return CheckResultFailure(
            f"Scheduled task {schedule} failed at {last_task.started}"
        )

    # next_run is only read once the task is known to have succeeded,
    # matching the order in which the checks are applied.
    if now - schedule.next_run > datetime.timedelta(hours=2):
        return CheckResultFailure(
            f"Scheduled task {schedule} stale, last run at {last_task.started}"
        )

    return CheckResultOk(
        f"Scheduled task {schedule} ok, last run at {last_task.started}"
    )
|
|
|
|
|
|
|
|
|
|
|
|
def _check_tasks() -> Iterable[CheckResult]:
    """Lazily yield one ``CheckResult`` per ``Schedule`` row.

    The current time is captured once, when iteration starts, and that same
    value is passed to ``_check_task`` for every schedule.
    """
    checked_at = timezone.now()
    yield from (
        _check_task(checked_at, entry) for entry in Schedule.objects.all()
    )
|
|
|
|
|
2024-05-11 01:00:23 -04:00
|
|
|
|
|
|
|
def healthcheck(request: HttpRequest):
    """Run the cluster check and all schedule checks and report them as plain text.

    The body starts with ``OK: `` when every check passed and ``CRITICAL: ``
    otherwise, followed by one message per check separated by newlines, with
    failing checks listed first.

    Args:
        request: The incoming request; it is not inspected.

    Returns:
        A ``text/plain`` ``HttpResponse`` carrying the summary.
    """
    results: list[CheckResult] = [_check_clusters()]
    results.extend(_check_tasks())

    prefix = "CRITICAL: "
    if all(result.ok for result in results):
        prefix = "OK: "

    # False sorts before True, so failures come first; the sort is stable,
    # so checks with the same outcome keep their original order.
    ordered = sorted(results, key=lambda result: result.ok)
    body = prefix + "\n".join(result.message for result in ordered)

    # NOTE(review): this header is "OK" regardless of the check outcomes —
    # confirm whether it should only signal that the endpoint responded.
    return HttpResponse(
        body,
        content_type="text/plain",
        headers={"X-CMSManage-Status": "OK"},
    )
|