simulation_runner: added failed job cleanup

2026-04-25 16:36:22 +02:00 · 2025-02-20 18:51:49 +05:30 · 2025-02-20 18:51:49 +05:30 · 7bc3203ed2
commit 7bc3203ed2
parent ee20f7c6e3
4 changed files with 113 additions and 29 deletions
--- a/apps/simulation_runner/service.py
+++ b/apps/simulation_runner/service.py
@ -1,7 +1,7 @@
 import asyncio
 import logging
 from typing import List
-from db import get_pending_simulation_run, get_scenarios_for_run, set_simulation_run_to_completed, get_api_key
+from db import get_pending_simulation_run, get_scenarios_for_run, set_simulation_run_to_completed, get_api_key, mark_stale_jobs_as_failed, update_simulation_run_heartbeat
 from scenario_types import SimulationRun, Scenario
 from simulation import simulate_scenarios
 logging.basicConfig(level=logging.INFO)
@ -15,14 +15,16 @@ class JobService:
        """
        Periodically checks for new jobs in MongoDB and processes them.
        """
+
+        # Start the stale-job check in the background
+        asyncio.create_task(self.fail_stale_jobs_loop())
+
        iterations = 0
        while True:
            job = get_pending_simulation_run()
            if job:
                logging.info(f"Found new job: {job}. Processing...")
                asyncio.create_task(self.process_job(job))
-            else:
-                logging.info("No new jobs found. Checking again in 5 seconds...")

            iterations += 1
            if max_iterations is not None and iterations >= max_iterations:
@ -34,18 +36,52 @@ class JobService:
        """
        Calls the simulation function and updates job status upon completion.
        """
+
        async with self.semaphore:
-            scenarios = get_scenarios_for_run(job)
-            if not scenarios or len(scenarios) == 0:
-                logging.info(f"No scenarios found for job {job.id}")
-                return
+            # Start heartbeat in background
+            stop_heartbeat_event = asyncio.Event()
+            heartbeat_task = asyncio.create_task(self.heartbeat_loop(job.id, stop_heartbeat_event))

-            api_key = get_api_key(job.projectId)
-            result = await simulate_scenarios(scenarios, job.id, job.workflowId, api_key)
+            try:
+                scenarios = get_scenarios_for_run(job)
+                if not scenarios or len(scenarios) == 0:
+                    logging.info(f"No scenarios found for job {job.id}")
+                    return
+
+                api_key = get_api_key(job.projectId)
+                result = await simulate_scenarios(scenarios, job.id, job.workflowId, api_key)


-            set_simulation_run_to_completed(job, result)
-            logging.info(f"Job {job.id} completed.")
+                set_simulation_run_to_completed(job, result)
+                logging.info(f"Job {job.id} completed.")
+            except Exception as exc:
+                logging.error(f"Job {job.id} failed: {exc}")
+            finally:
+                stop_heartbeat_event.set()
+                await heartbeat_task
+
+    async def fail_stale_jobs_loop(self):
+        """Runs forever: every 60 seconds, marks stale jobs as 'failed' and logs how many were marked."""
+        while True:
+            try:  # A failed check must not end this loop, or stale jobs would never be cleaned up again
+                count = mark_stale_jobs_as_failed()
+                if count > 0:
+                    logging.warning(f"Marked {count} stale jobs as failed.")
+            except Exception:  # CancelledError is not an Exception (Python 3.8+), so cancel still works
+                logging.exception("Stale-job check failed; will retry on the next cycle.")
+            await asyncio.sleep(60)  # Check every 60 seconds
+
+    async def heartbeat_loop(self, job_id: str, stop_event: asyncio.Event):
+        """Updates 'last_heartbeat' for the given job every 10 seconds until 'stop_event' is set."""
+        try:
+            while not stop_event.is_set():
+                update_simulation_run_heartbeat(job_id)
+                try:  # Wait on the event instead of sleeping so the loop ends as soon as it is set
+                    await asyncio.wait_for(stop_event.wait(), timeout=10)  # Heartbeat interval in seconds
+                except asyncio.TimeoutError:
+                    pass  # Interval elapsed without a stop request; send the next heartbeat
+        except asyncio.CancelledError:
+            pass  # A cancelled heartbeat just stops; the task finishes without raising

    def start(self):
        """