Add healthcheck endpoint for django-q2 clusters and scheduled tasks
This commit is contained in:
parent
72cf436e50
commit
5f3836dc73
@ -25,6 +25,8 @@ from rest_framework import routers
|
||||
from membershipworks.api import router as membershipworks_router
|
||||
from paperwork.api import router as paperwork_router
|
||||
|
||||
from . import views
|
||||
|
||||
router = routers.DefaultRouter()
|
||||
router.registry.extend(paperwork_router.registry)
|
||||
router.registry.extend(membershipworks_router.registry)
|
||||
@ -59,6 +61,7 @@ urlpatterns = [
|
||||
),
|
||||
),
|
||||
path("api-auth/", include("rest_framework.urls")),
|
||||
path("healthcheck", views.healthcheck),
|
||||
# path("markdownx/", include("markdownx.urls")),
|
||||
]
|
||||
|
||||
|
84
cmsmanage/views.py
Normal file
84
cmsmanage/views.py
Normal file
@ -0,0 +1,84 @@
|
||||
import dataclasses
|
||||
import datetime
|
||||
from collections.abc import Iterable
|
||||
|
||||
from django.http import HttpRequest, HttpResponse
|
||||
from django.utils import timezone
|
||||
|
||||
from django_q.conf import Conf
|
||||
from django_q.models import Schedule, Task
|
||||
from django_q.status import Stat
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CheckResult:
    """Outcome of a single health check.

    Pairs a human-readable message with a flag saying whether the check
    passed. Both fields are required positional constructor arguments.
    """

    # Text describing what was checked and what was found.
    message: str
    # True when the check passed, False when it failed.
    ok: bool
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CheckResultOk(CheckResult):
    """A ``CheckResult`` whose ``ok`` flag defaults to ``True``."""

    # ``message`` is inherited from CheckResult as the first, required field,
    # so only the new default for ``ok`` has to be declared here.
    ok: bool = True
|
||||
|
||||
|
||||
@dataclasses.dataclass
class CheckResultFailure(CheckResult):
    """A ``CheckResult`` whose ``ok`` flag defaults to ``False``."""

    # ``message`` is inherited from CheckResult as the first, required field,
    # so only the new default for ``ok`` has to be declared here.
    ok: bool = False
|
||||
|
||||
|
||||
def _check_clusters() -> CheckResult:
    """Summarise the state of the clusters reported by ``Stat.get_all()``.

    Returns:
        A failure result when no clusters are reported at all, or when at
        least one cluster has a status other than ``Conf.IDLE`` or
        ``Conf.WORKING``; an OK result otherwise. Apart from the "no
        clusters" case, the message is always ``"<healthy>/<total> clusters up"``.
    """
    stats = Stat.get_all()
    total = len(stats)

    # Nothing reported at all is its own failure, with a dedicated message.
    if total == 0:
        return CheckResultFailure("No clusters running!")

    acceptable_statuses = [Conf.IDLE, Conf.WORKING]
    healthy = sum(1 for stat in stats if stat.status in acceptable_statuses)

    # The same summary text is used whether or not every cluster is healthy.
    summary = f"{healthy}/{total} clusters up"
    if healthy == total:
        return CheckResultOk(summary)
    return CheckResultFailure(summary)
|
||||
|
||||
|
||||
def _check_tasks(
    max_overdue: datetime.timedelta = datetime.timedelta(hours=2),
) -> Iterable[CheckResult]:
    """Yield one health result for every ``Schedule`` row.

    A schedule is reported as failed when any of the following holds
    (checked in this order):

    * it has no recorded last task (``schedule.task`` is empty),
    * the recorded last task no longer exists in the ``Task`` table,
    * the last task did not succeed,
    * it has no ``next_run`` time at all,
    * its ``next_run`` lies more than ``max_overdue`` in the past.

    Otherwise an OK result is yielded.

    Args:
        max_overdue: How far in the past ``next_run`` may lie before the
            schedule is considered stale. Defaults to two hours.

    Yields:
        A ``CheckResultOk`` or ``CheckResultFailure`` per schedule.
    """
    now = timezone.now()

    for schedule in Schedule.objects.all():
        if not schedule.task:
            yield CheckResultFailure(f"Scheduled task {schedule} has never run!")
            continue

        try:
            task = Task.objects.get(id=schedule.task)
        except Task.DoesNotExist:
            yield CheckResultFailure(
                f"Scheduled task {schedule}'s last task doesn't exist!"
            )
            continue

        if not task.success:
            yield CheckResultFailure(
                f"Scheduled task {schedule} failed at {task.started}"
            )
        elif schedule.next_run is None:
            # next_run is a nullable column in django-q's Schedule model.
            # Subtracting None from a datetime would raise TypeError and take
            # the whole healthcheck down, so report it as a failed check.
            yield CheckResultFailure(
                f"Scheduled task {schedule} has no next run time!"
            )
        elif now - schedule.next_run > max_overdue:
            yield CheckResultFailure(
                f"Scheduled task {schedule} stale, last run at {task.started}"
            )
        else:
            yield CheckResultOk(
                f"Scheduled task {schedule} ok, last run at {task.started}"
            )
|
||||
|
||||
|
||||
def healthcheck(request: HttpRequest):
    """Run every health check and report the combined result as plain text.

    The cluster check runs first, followed by all scheduled-task checks.
    The response body starts with ``"OK: "`` when every check passed and
    ``"CRITICAL: "`` otherwise, followed by one message per line with the
    failed checks listed first. The HTTP status is 200 when everything
    passed and 500 otherwise.

    Args:
        request: The incoming request; it is not inspected.

    Returns:
        An ``HttpResponse`` with content type ``text/plain``.
    """
    results: list[CheckResult] = [_check_clusters()]
    results.extend(_check_tasks())

    healthy = all(result.ok for result in results)

    # False sorts before True and the sort is stable, so failures come first
    # while the original relative order is kept within each group.
    ordered = sorted(results, key=lambda result: result.ok)
    body = "\n".join(result.message for result in ordered)

    if healthy:
        prefix, status_code = "OK: ", 200
    else:
        prefix, status_code = "CRITICAL: ", 500

    return HttpResponse(
        prefix + body,
        content_type="text/plain",
        status=status_code,
    )
|
Loading…
Reference in New Issue
Block a user