Adam Goldsmith
0e486babb7
This is mostly to prevent it sending me emails on every GET of the page when erroring
93 lines
2.7 KiB
Python
93 lines
2.7 KiB
Python
import dataclasses
|
|
import datetime
|
|
from collections.abc import Iterable
|
|
|
|
from django.http import HttpRequest, HttpResponse
|
|
from django.utils import timezone
|
|
|
|
from django_q.conf import Conf
|
|
from django_q.models import Schedule, Task
|
|
from django_q.status import Stat
|
|
|
|
|
|
@dataclasses.dataclass
class CheckResult:
    """Outcome of a single health check: a readable message plus a pass/fail flag."""

    # Human-readable description of what was checked and what was found.
    message: str
    # True when the check passed, False when it failed.
    ok: bool
|
|
|
|
|
|
@dataclasses.dataclass
class CheckResultOk(CheckResult):
    """A passing ``CheckResult``; ``ok`` defaults to ``True``."""

    # Redeclared from the base class; stays the first positional field.
    message: str
    # Only the message needs to be supplied when constructing a passing result.
    ok: bool = True
|
|
|
|
|
|
@dataclasses.dataclass
class CheckResultFailure(CheckResult):
    """A failing ``CheckResult``; ``ok`` defaults to ``False``."""

    # Redeclared from the base class; stays the first positional field.
    message: str
    # Only the message needs to be supplied when constructing a failing result.
    ok: bool = False
|
|
|
|
|
|
def _check_clusters() -> CheckResult:
    """Check that at least one cluster is reported and that all of them are up.

    A cluster counts as up when its status is ``Conf.IDLE`` or ``Conf.WORKING``.

    Returns:
        A failure when ``Stat.get_all()`` reports no clusters or when any
        reported cluster is not up; otherwise a success. Apart from the
        no-clusters case, the message is the "<up>/<total> clusters up" summary.
    """
    all_clusters = Stat.get_all()
    total = len(all_clusters)
    if total == 0:
        return CheckResultFailure("No clusters running!")

    up_statuses = (Conf.IDLE, Conf.WORKING)
    up_count = sum(1 for cluster in all_clusters if cluster.status in up_statuses)
    summary = f"{up_count}/{total} clusters up"

    if up_count != total:
        return CheckResultFailure(summary)
    return CheckResultOk(summary)
|
|
|
|
|
|
def _check_task(now: datetime.datetime, schedule: Schedule) -> CheckResult:
    """Assess one schedule by looking at the task it last recorded.

    Args:
        now: Reference time that ``schedule.next_run`` is compared against.
        schedule: The schedule to inspect; ``schedule.task`` is used as the id
            of the ``Task`` to look up.

    Returns:
        A failure when the schedule has no recorded task, when the recorded
        task row is missing and ``next_run`` is more than 5 minutes in the
        past, when the task did not succeed, or when ``next_run`` is more than
        2 hours in the past. Otherwise a success.
    """
    if not schedule.task:
        return CheckResultFailure(f"Scheduled task {schedule} has never run!")

    try:
        last_task = Task.objects.get(id=schedule.task)
    except Task.DoesNotExist:
        # The referenced task row is absent. Within a short window after the
        # scheduled time this is treated as "probably still running".
        if now - schedule.next_run > datetime.timedelta(minutes=5):
            return CheckResultFailure(
                f"Scheduled task {schedule}'s last task doesn't exist, and is probably not still running!"
            )
        return CheckResultOk(
            f"Schedule {schedule} has no task, but probably running now"
        )

    if not last_task.success:
        return CheckResultFailure(
            f"Scheduled task {schedule} failed at {last_task.started}"
        )

    # The last run succeeded, but the next run is long overdue.
    if now - schedule.next_run > datetime.timedelta(hours=2):
        return CheckResultFailure(
            f"Scheduled task {schedule} stale, last run at {last_task.started}"
        )

    return CheckResultOk(
        f"Scheduled task {schedule} ok, last run at {last_task.started}"
    )
|
|
|
|
|
|
def _check_tasks() -> Iterable[CheckResult]:
    """Lazily yield one ``CheckResult`` per ``Schedule`` row.

    The current time is captured once, when iteration starts, so every
    schedule is compared against the same instant.
    """
    reference_time = timezone.now()
    yield from (
        _check_task(reference_time, entry) for entry in Schedule.objects.all()
    )
|
|
|
|
|
|
def healthcheck(request: HttpRequest) -> HttpResponse:
    """Run every health check and report the combined result as plain text.

    No explicit HTTP status code is set on the response; the outcome is
    conveyed by the body prefix (``OK: `` or ``CRITICAL: ``) and by the
    ``X-CMSManage-Status`` header, which now carries the same ``OK`` /
    ``CRITICAL`` value instead of always reading ``OK``.

    Args:
        request: The incoming request; it is not inspected.

    Returns:
        A ``text/plain`` response with one check message per line, failing
        checks listed first.
    """
    checks: list[CheckResult] = [_check_clusters(), *_check_tasks()]

    all_ok = all(check.ok for check in checks)
    status = "OK" if all_ok else "CRITICAL"

    # sorted() is stable and False sorts before True, so failures come first.
    messages = (check.message for check in sorted(checks, key=lambda c: c.ok))

    return HttpResponse(
        f"{status}: " + "\n".join(messages),
        content_type="text/plain",
        # Keep the header in agreement with the body prefix.
        headers={"X-CMSManage-Status": status},
    )
|