From 840a0184d94cfe4a5a6d5fdb184461794d0d549e Mon Sep 17 00:00:00 2001 From: cln-io <7887972+cln-io@users.noreply.github.com> Date: Sat, 21 Mar 2026 11:11:20 +0100 Subject: [PATCH 1/3] Clear zombie captures from lacus:ongoing on startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When capture_manager is killed (SIGKILL, OOM, container restart) while captures are in-flight, the finally blocks in _capture() never run. UUIDs persist in the lacus:ongoing Redis sorted set because Valkey writes to disk and reloads on restart. These zombie UUIDs occupy concurrent_captures slots, blocking all new captures until clear_dead_captures() eventually ages them out (max_capture_time * 1.1 seconds later). Fix: flush lacus:ongoing before the event loop starts. At process init, self.captures is empty — every UUID in Redis is a zombie. --- bin/capture_manager.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/capture_manager.py b/bin/capture_manager.py index e6f0523..a3c07fb 100755 --- a/bin/capture_manager.py +++ b/bin/capture_manager.py @@ -25,6 +25,14 @@ def __init__(self, loglevel: int | None=None) -> None: self.captures: set[Task[None]] = set() self.lacus = Lacus() + def _clear_ongoing_on_startup(self) -> None: + # At process start, self.captures is empty — no task can be running. + # Any UUID left in lacus:ongoing is a zombie from a previous crash. 
+ zombie_count = self.lacus.redis.zcard('lacus:ongoing') + if zombie_count: + self.logger.warning(f'Startup cleanup: clearing {zombie_count} zombie capture(s) from lacus:ongoing') + self.lacus.redis.delete('lacus:ongoing') + async def clear_dead_captures(self) -> None: ongoing = {capture.get_name(): capture for capture in self.captures} max_capture_time = get_config('generic', 'max_capture_time') @@ -82,6 +90,10 @@ def main() -> None: loop.add_signal_handler(signal.SIGTERM, lambda: loop.create_task(p.stop_async())) try: + # Flush stale captures before the event loop starts. + # Valkey persists across container restarts, so lacus:ongoing + # may contain UUIDs from a process that was killed mid-capture. + p._clear_ongoing_on_startup() loop.run_until_complete(p.run_async(sleep_in_sec=1)) finally: loop.close() From 83027cef9c312d58723c851d10a41946d438c51c Mon Sep 17 00:00:00 2001 From: cln-io <7887972+cln-io@users.noreply.github.com> Date: Sat, 21 Mar 2026 11:11:34 +0100 Subject: [PATCH 2/3] Force-clear Redis entry when task cancellation fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a Playwright subprocess hangs, task.cancel() may not take effect — cancel() only requests that CancelledError be raised inside the asyncio task, and a task stuck on the hung browser may never finish. After 5 failed cancel attempts, the code logged an error but left the UUID in lacus:ongoing, permanently blocking the slot. Fix: call clear_capture() after exhausting cancel retries to free the Redis entry even if the asyncio task is still stuck. 
--- bin/capture_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/capture_manager.py b/bin/capture_manager.py index a3c07fb..9ddecd2 100755 --- a/bin/capture_manager.py +++ b/bin/capture_manager.py @@ -55,6 +55,11 @@ async def clear_dead_captures(self) -> None: if not capture.done(): self.logger.error(f'{expected_uuid} is not done after canceling, trying {max_cancel} more times.') await asyncio.sleep(1) + # All cancel attempts exhausted but the task is still stuck. + # Free the Redis slot so new captures aren't blocked. + if not capture.done(): + self.logger.error(f'{expected_uuid} could not be canceled after 5 attempts, force-clearing from Redis.') + self.lacus.core.clear_capture(expected_uuid, 'Force-cleared: task could not be canceled.') async def _to_run_forever_async(self) -> None: From 93bb5a3f593daf3ea50754e55f7a649596a659f3 Mon Sep 17 00:00:00 2001 From: cln-io <7887972+cln-io@users.noreply.github.com> Date: Sat, 21 Mar 2026 11:39:13 +0100 Subject: [PATCH 3/3] Address review feedback on zombie capture fix - Startup cleanup: iterate with clear_capture() per UUID instead of bulk-deleting lacus:ongoing. This stores a proper error result for each zombie and cleans up capture_settings, so clients polling a UUID across a restart get an explicit failure instead of UNKNOWN. - Force-clear: also discard the task from self.captures so the in-memory slot is freed for max_new_captures computation. --- bin/capture_manager.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/capture_manager.py b/bin/capture_manager.py index 9ddecd2..75946cb 100755 --- a/bin/capture_manager.py +++ b/bin/capture_manager.py @@ -28,10 +28,13 @@ def __init__(self, loglevel: int | None=None) -> None: def _clear_ongoing_on_startup(self) -> None: # At process start, self.captures is empty — no task can be running. # Any UUID left in lacus:ongoing is a zombie from a previous crash. 
- zombie_count = self.lacus.redis.zcard('lacus:ongoing') - if zombie_count: - self.logger.warning(f'Startup cleanup: clearing {zombie_count} zombie capture(s) from lacus:ongoing') - self.lacus.redis.delete('lacus:ongoing') + # Use clear_capture() per UUID so each gets a proper error result + # and capture_settings are cleaned up. + ongoing = self.lacus.monitoring.get_ongoing_captures() + if ongoing: + self.logger.warning(f'Startup cleanup: clearing {len(ongoing)} zombie capture(s) from lacus:ongoing') + for uuid, _ in ongoing: + self.lacus.core.clear_capture(uuid, 'Cleared on startup: previous process died.') async def clear_dead_captures(self) -> None: ongoing = {capture.get_name(): capture for capture in self.captures} @@ -60,6 +63,7 @@ async def clear_dead_captures(self) -> None: if not capture.done(): self.logger.error(f'{expected_uuid} could not be canceled after 5 attempts, force-clearing from Redis.') self.lacus.core.clear_capture(expected_uuid, 'Force-cleared: task could not be canceled.') + self.captures.discard(capture) async def _to_run_forever_async(self) -> None: