From 840a0184d94cfe4a5a6d5fdb184461794d0d549e Mon Sep 17 00:00:00 2001
From: cln-io <7887972+cln-io@users.noreply.github.com>
Date: Sat, 21 Mar 2026 11:11:20 +0100
Subject: [PATCH 1/3] Clear zombie captures from lacus:ongoing on startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When capture_manager is killed (SIGKILL, OOM, container restart)
while captures are in-flight, the finally blocks in _capture()
never run. UUIDs persist in the lacus:ongoing Redis sorted set
because Valkey writes to disk and reloads on restart.

These zombie UUIDs occupy concurrent_captures slots, blocking
all new captures until clear_dead_captures() eventually ages
them out (max_capture_time * 1.1 seconds later).

Fix: flush lacus:ongoing before the event loop starts. At process
init, self.captures is empty — every UUID in Redis is a zombie
(this assumes a single capture_manager per Redis instance).
---
 bin/capture_manager.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bin/capture_manager.py b/bin/capture_manager.py
index e6f0523..a3c07fb 100755
--- a/bin/capture_manager.py
+++ b/bin/capture_manager.py
@@ -25,6 +25,14 @@ def __init__(self, loglevel: int | None=None) -> None:
         self.captures: set[Task[None]] = set()
         self.lacus = Lacus()
 
+    def _clear_ongoing_on_startup(self) -> None:
+        # At process start, self.captures is empty — no task can be running.
+        # Any UUID left in lacus:ongoing is a zombie from a previous crash.
+        zombie_count = self.lacus.redis.zcard('lacus:ongoing')
+        if zombie_count:
+            self.logger.warning(f'Startup cleanup: clearing {zombie_count} zombie capture(s) from lacus:ongoing')
+            self.lacus.redis.delete('lacus:ongoing')
+
     async def clear_dead_captures(self) -> None:
         ongoing = {capture.get_name(): capture for capture in self.captures}
         max_capture_time = get_config('generic', 'max_capture_time')
@@ -82,6 +90,10 @@ def main() -> None:
     loop.add_signal_handler(signal.SIGTERM, lambda: loop.create_task(p.stop_async()))
 
     try:
+        # Flush stale captures before the event loop starts.
+        # Valkey persists across container restarts, so lacus:ongoing
+        # may contain UUIDs from a process that was killed mid-capture.
+        p._clear_ongoing_on_startup()
         loop.run_until_complete(p.run_async(sleep_in_sec=1))
     finally:
         loop.close()

From 83027cef9c312d58723c851d10a41946d438c51c Mon Sep 17 00:00:00 2001
From: cln-io <7887972+cln-io@users.noreply.github.com>
Date: Sat, 21 Mar 2026 11:11:34 +0100
Subject: [PATCH 2/3] Force-clear Redis entry when task cancellation fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a Playwright subprocess hangs, task.cancel() may not take
effect — the task awaiting the hung browser can remain not done
after cancellation has been requested.
After 5 failed cancel attempts, the code logged an error but
left the UUID in lacus:ongoing, permanently blocking the slot.

Fix: call clear_capture() after exhausting cancel retries to
free the Redis entry even if the asyncio task is still stuck.
---
 bin/capture_manager.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bin/capture_manager.py b/bin/capture_manager.py
index a3c07fb..9ddecd2 100755
--- a/bin/capture_manager.py
+++ b/bin/capture_manager.py
@@ -55,6 +55,11 @@ async def clear_dead_captures(self) -> None:
                         if not capture.done():
                             self.logger.error(f'{expected_uuid} is not done after canceling, trying {max_cancel} more times.')
                             await asyncio.sleep(1)
+                # All cancel attempts exhausted but the task is still stuck.
+                # Free the Redis slot so new captures aren't blocked.
+                if not capture.done():
+                    self.logger.error(f'{expected_uuid} could not be canceled after 5 attempts, force-clearing from Redis.')
+                    self.lacus.core.clear_capture(expected_uuid, 'Force-cleared: task could not be canceled.')
 
     async def _to_run_forever_async(self) -> None:
 

From 93bb5a3f593daf3ea50754e55f7a649596a659f3 Mon Sep 17 00:00:00 2001
From: cln-io <7887972+cln-io@users.noreply.github.com>
Date: Sat, 21 Mar 2026 11:39:13 +0100
Subject: [PATCH 3/3] Address review feedback on zombie capture fix

- Startup cleanup: iterate with clear_capture() per UUID instead
  of bulk-deleting lacus:ongoing. This stores a proper error result
  for each zombie and cleans up capture_settings, so clients polling
  a UUID across a restart get an explicit failure instead of UNKNOWN.

- Force-clear: also discard the task from self.captures so the
  in-memory slot is freed for max_new_captures computation.
---
 bin/capture_manager.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/bin/capture_manager.py b/bin/capture_manager.py
index 9ddecd2..75946cb 100755
--- a/bin/capture_manager.py
+++ b/bin/capture_manager.py
@@ -28,10 +28,13 @@ def __init__(self, loglevel: int | None=None) -> None:
     def _clear_ongoing_on_startup(self) -> None:
         # At process start, self.captures is empty — no task can be running.
         # Any UUID left in lacus:ongoing is a zombie from a previous crash.
-        zombie_count = self.lacus.redis.zcard('lacus:ongoing')
-        if zombie_count:
-            self.logger.warning(f'Startup cleanup: clearing {zombie_count} zombie capture(s) from lacus:ongoing')
-            self.lacus.redis.delete('lacus:ongoing')
+        # Use clear_capture() per UUID so each gets a proper error result
+        # and capture_settings are cleaned up.
+        ongoing = self.lacus.monitoring.get_ongoing_captures()
+        if ongoing:
+            self.logger.warning(f'Startup cleanup: clearing {len(ongoing)} zombie capture(s) from lacus:ongoing')
+            for uuid, _ in ongoing:
+                self.lacus.core.clear_capture(uuid, 'Cleared on startup: previous process died.')
 
     async def clear_dead_captures(self) -> None:
         ongoing = {capture.get_name(): capture for capture in self.captures}
@@ -60,6 +63,7 @@ async def clear_dead_captures(self) -> None:
                 if not capture.done():
                     self.logger.error(f'{expected_uuid} could not be canceled after 5 attempts, force-clearing from Redis.')
                     self.lacus.core.clear_capture(expected_uuid, 'Force-cleared: task could not be canceled.')
+                    self.captures.discard(capture)
 
     async def _to_run_forever_async(self) -> None: