From 4d66f76a02c862bb2f5637cca5238a8ed08aeca7 Mon Sep 17 00:00:00 2001
From: Adam Goldsmith
Date: Mon, 13 May 2024 21:43:40 -0400
Subject: [PATCH] healthcheck: Don't report error for tasks that are probably running

---
Review notes (free text placed between the "---" separator and the diff;
it is not part of the commit message):

* _check_task() now holds the per-schedule logic that used to live inline
  in _check_tasks(); it returns one CheckResult where the old loop body
  yielded one, and _check_tasks() simply yields that result.
* New behaviour: when Task.objects.get() raises Task.DoesNotExist, a
  failure is reported only if now - schedule.next_run is more than
  5 minutes; otherwise the schedule is reported ok as "probably running
  now". Previously this case was always a failure.
* The never-run, failed, stale (more than 2 hours past next_run) and ok
  branches are carried over with their messages unchanged.
* NOTE(review): both time comparisons assume schedule.next_run is set;
  confirm it can never be None for the schedules being checked.

 cmsmanage/views.py | 56 ++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/cmsmanage/views.py b/cmsmanage/views.py
index a7e5993..9b8c4be 100644
--- a/cmsmanage/views.py
+++ b/cmsmanage/views.py
@@ -42,33 +42,41 @@ def _check_clusters() -> CheckResult:
     return CheckResultOk(f"{len(happy_clusters)}/{len(clusters)} clusters up")
 
 
+def _check_task(now: datetime.datetime, schedule: Schedule) -> CheckResult:
+    if not schedule.task:
+        return CheckResultFailure(f"Scheduled task {schedule} has never run!")
+
+    else:
+        try:
+            task = Task.objects.get(id=schedule.task)
+        except Task.DoesNotExist:
+            if now - schedule.next_run > datetime.timedelta(minutes=5):
+                return CheckResultFailure(
+                    f"Scheduled task {schedule}'s last task doesn't exist, and is probably not still running!"
+                )
+            else:
+                return CheckResultOk(
+                    f"Schedule {schedule} has no task, but probably running now"
+                )
+
+        if not task.success:
+            return CheckResultFailure(
+                f"Scheduled task {schedule} failed at {task.started}"
+            )
+        elif now - schedule.next_run > datetime.timedelta(hours=2):
+            return CheckResultFailure(
+                f"Scheduled task {schedule} stale, last run at {task.started}"
+            )
+        else:
+            return CheckResultOk(
+                f"Scheduled task {schedule} ok, last run at {task.started}"
+            )
+
+
 def _check_tasks() -> Iterable[CheckResult]:
     now = timezone.now()
     for schedule in Schedule.objects.all():
-        if not schedule.task:
-            yield CheckResultFailure(f"Scheduled task {schedule} has never run!")
-
-        else:
-            try:
-                task = Task.objects.get(id=schedule.task)
-            except Task.DoesNotExist:
-                yield CheckResultFailure(
-                    f"Scheduled task {schedule}'s last task doesn't exist!"
-                )
-                continue
-
-            if not task.success:
-                yield CheckResultFailure(
-                    f"Scheduled task {schedule} failed at {task.started}"
-                )
-            elif now - schedule.next_run > datetime.timedelta(hours=2):
-                yield CheckResultFailure(
-                    f"Scheduled task {schedule} stale, last run at {task.started}"
-                )
-            else:
-                yield CheckResultOk(
-                    f"Scheduled task {schedule} ok, last run at {task.started}"
-                )
+        yield _check_task(now, schedule)
 
 
 def healthcheck(request: HttpRequest):