Compare commits

...

2 Commits

Author SHA1 Message Date
02aee01d20 Remove accidental .env inclusion from Prod configuration
All checks were successful
Ruff / ruff (push) Successful in 36s
Test / test (push) Successful in 4m19s
2024-05-11 01:47:57 -04:00
5f3836dc73 Add healthcheck endpoint for django-q2 clusters and scheduled tasks 2024-05-11 01:47:57 -04:00
3 changed files with 87 additions and 2 deletions

View File

@ -223,8 +223,6 @@ class Prod(NonCIBase):
import ldap
from django_auth_ldap.config import LDAPGroupQuery, LDAPSearch, PosixGroupType
DOTENV = BASE_DIR / "settings.dev.env"
DEBUG = False
# LDAP Authentication

View File

@ -25,6 +25,8 @@ from rest_framework import routers
from membershipworks.api import router as membershipworks_router
from paperwork.api import router as paperwork_router
from . import views
router = routers.DefaultRouter()
router.registry.extend(paperwork_router.registry)
router.registry.extend(membershipworks_router.registry)
@ -59,6 +61,7 @@ urlpatterns = [
),
),
path("api-auth/", include("rest_framework.urls")),
path("healthcheck", views.healthcheck),
# path("markdownx/", include("markdownx.urls")),
]

84
cmsmanage/views.py Normal file
View File

@ -0,0 +1,84 @@
import dataclasses
import datetime
from collections.abc import Iterable
from django.http import HttpRequest, HttpResponse
from django.utils import timezone
from django_q.conf import Conf
from django_q.models import Schedule, Task
from django_q.status import Stat
@dataclasses.dataclass
class CheckResult:
    """Outcome of a single health check.

    Pairs a human-readable status line with a pass/fail flag. Field order
    matters: instances are built positionally as ``CheckResult(message, ok)``.
    """

    # Text describing what was checked and what was found.
    message: str
    # True when the check passed, False when it failed.
    ok: bool
@dataclasses.dataclass
class CheckResultOk(CheckResult):
    """A passing ``CheckResult``: ``ok`` defaults to ``True``.

    Allows the short form ``CheckResultOk("message")``.
    """

    # ``message`` is inherited from CheckResult and stays the first field;
    # only the default of ``ok`` is overridden here.
    ok: bool = True
@dataclasses.dataclass
class CheckResultFailure(CheckResult):
    """A failing ``CheckResult``: ``ok`` defaults to ``False``.

    Allows the short form ``CheckResultFailure("message")``.
    """

    # ``message`` is inherited from CheckResult and stays the first field;
    # only the default of ``ok`` is overridden here.
    ok: bool = False
def _check_clusters() -> CheckResult:
    """Check that at least one cluster is reported and all of them are up.

    A cluster counts as "up" when its status is ``Conf.IDLE`` or
    ``Conf.WORKING``.

    Returns:
        A failure result when ``Stat.get_all()`` reports no clusters or when
        at least one reported cluster is not up; an OK result otherwise.
    """
    all_clusters = Stat.get_all()
    if not all_clusters:
        return CheckResultFailure("No clusters running!")

    up_statuses = [Conf.IDLE, Conf.WORKING]
    total = len(all_clusters)
    up_count = sum(1 for cluster in all_clusters if cluster.status in up_statuses)

    # The same "<up>/<total>" summary is used for both outcomes.
    summary = f"{up_count}/{total} clusters up"
    if up_count == total:
        return CheckResultOk(summary)
    return CheckResultFailure(summary)
def _check_tasks() -> Iterable[CheckResult]:
    """Yield one health-check result for every ``Schedule`` row.

    For each schedule, the first matching condition decides the result:

    1. ``schedule.task`` is empty: failure, the schedule has never run.
    2. No ``Task`` exists with that id: failure.
    3. The task did not succeed: failure.
    4. ``schedule.next_run`` is unset: failure. Without this guard the
       staleness arithmetic below raises ``TypeError`` and the whole
       healthcheck dies with an unhandled exception instead of a report.
    5. ``next_run`` lies more than two hours in the past: failure (stale).
    6. Otherwise: OK.

    Yields:
        ``CheckResultFailure`` or ``CheckResultOk`` instances, one per
        schedule, in queryset order.
    """
    now = timezone.now()
    # A schedule whose next run is overdue by more than this is stale.
    stale_after = datetime.timedelta(hours=2)

    for schedule in Schedule.objects.all():
        if not schedule.task:
            yield CheckResultFailure(f"Scheduled task {schedule} has never run!")
            continue

        try:
            task = Task.objects.get(id=schedule.task)
        except Task.DoesNotExist:
            yield CheckResultFailure(
                f"Scheduled task {schedule}'s last task doesn't exist!"
            )
            continue

        if not task.success:
            yield CheckResultFailure(
                f"Scheduled task {schedule} failed at {task.started}"
            )
        elif schedule.next_run is None:
            # NOTE(review): reported as a failure so the response stays
            # non-200, as it was when this case crashed; confirm that a
            # schedule without a next run should count as unhealthy.
            yield CheckResultFailure(
                f"Scheduled task {schedule} has no next run time, "
                f"last run at {task.started}"
            )
        elif now - schedule.next_run > stale_after:
            yield CheckResultFailure(
                f"Scheduled task {schedule} stale, last run at {task.started}"
            )
        else:
            yield CheckResultOk(
                f"Scheduled task {schedule} ok, last run at {task.started}"
            )
def healthcheck(request: HttpRequest) -> HttpResponse:
    """Run the cluster and scheduled-task checks and report them as text.

    Args:
        request: The incoming request; not inspected.

    Returns:
        A ``text/plain`` response. When every check passed, the body starts
        with ``"OK: "`` and the status is 200; otherwise the body starts with
        ``"CRITICAL: "`` and the status is 500. Check messages follow, one
        per line, with failing checks listed first.
    """
    results: list[CheckResult] = [_check_clusters()]
    results.extend(_check_tasks())

    healthy = all(result.ok for result in results)

    # False sorts before True, so failures come first; the sort is stable,
    # so checks keep their relative order within each group.
    ordered = sorted(results, key=lambda result: result.ok)
    body = "\n".join(result.message for result in ordered)

    if healthy:
        prefix, status_code = "OK: ", 200
    else:
        prefix, status_code = "CRITICAL: ", 500

    return HttpResponse(
        prefix + body,
        content_type="text/plain",
        status=status_code,
    )