From b9dd2d419c89f39972d03843e7e48fd93f5cc57e Mon Sep 17 00:00:00 2001 From: Ghost Scripter Date: Thu, 4 Jun 2026 15:15:22 +0530 Subject: [PATCH] fix(cron): halt agent-job retry loop on backend session-expired (TAURI-RUST-N) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the user's OpenHuman backend JWT lapses, the inference layer's `api_error` already publishes `DomainEvent::SessionExpired` for the 401 and intentionally skips its own `report_error` call. But the cron retry loop in `execute_job_with_retry` doesn't consult that signal — it sleeps with exponential backoff, retries the same job N times (every attempt hitting the same global 401), then unconditionally calls `report_error` with `failure=retries_exhausted`. That generated TAURI-RUST-N: 7,038 events / 5 users, all `domain=cron operation=agent_job` from a `morning_briefing` agent grinding through retries after one lapse. Fix mirrors PR #3334's halt-on-first-occurrence pattern, but at the cron retry layer instead of the agent tool loop: - New `is_session_expired_failure` predicate consults the existing `core::observability::is_session_expired_message` classifier (the `OpenHuman API error (401` + `"error":"Invalid token"` conjunction was already added for OPENHUMAN-TAURI-4P0). Matches on `last_agent_error` first (carries the raw provider wire chain), falls back to `last_output` for defense-in-depth. - `execute_job_with_retry` breaks out of the loop on the first occurrence — no further attempts, no `report_error` call. The inference layer's `SessionExpired` publish already drives the credentials/scheduler-gate handshake; retries can't recover until the user re-auths. - Restricted to `JobType::Agent`: shell jobs that happen to echo a 401-shaped string keep their existing retry semantics (no `SessionExpired` publish from shell stdout, no reason to flip the gate). Five focused unit tests pin the predicate behaviour: the 401 wire shape trips the halt via `last_agent_error` or `last_output`; the canned user message, ordinary provider errors (incl. third-party BYO-key 401s), and shell-job invocations all stay on the retry path. Sentry-Issue: TAURI-RUST-N --- src/openhuman/cron/scheduler.rs | 66 +++++++++++++++++++++- src/openhuman/cron/scheduler_tests.rs | 81 +++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/src/openhuman/cron/scheduler.rs b/src/openhuman/cron/scheduler.rs index 8202dd0d19..bc6ffba5f1 100644 --- a/src/openhuman/cron/scheduler.rs +++ b/src/openhuman/cron/scheduler.rs @@ -192,6 +192,55 @@ pub async fn execute_job_now(config: &Config, job: &CronJob) -> (bool, String) { execute_job_with_retry(config, &security, job).await } +/// Did this failed agent-job attempt hit the backend session-expired state? +/// +/// When the OpenHuman backend returns 401 because the user's app JWT has +/// lapsed, [`inference::provider::ops::api_error`] already publishes +/// [`crate::core::event_bus::DomainEvent::SessionExpired`] (via +/// `publish_backend_session_expired`) and the credentials subscriber clears +/// the stored session + flips the scheduler-gate `signed_out` override. The +/// gate then halts downstream LLM work until the user re-auths. +/// +/// The cron retry loop pre-dates that gate handshake: it sleeps with +/// exponential backoff and retries the same job N times, every attempt +/// hitting the same global 401, then calls `report_error` with +/// `failure=retries_exhausted`. That generated TAURI-RUST-N (7,038 events / +/// 5 users): a cron-fired `morning_briefing` agent grinding through retries +/// after a single JWT lapse, every retries-exhausted capture pointing at a +/// problem the user can only fix from the UI. +/// +/// The right move is the same halt-on-first-occurrence pattern as +/// `agent::harness::tool_loop::BACKEND_USER_STATE_MARKER` (#3334): the +/// condition is global and retries can't recover it, so we stop after the +/// first attempt. Skipping the `report_error` call too is correct because +/// the existing classifier +/// [`crate::core::observability::is_session_expired_message`] already +/// considers this expected user state (`observability.rs` — anchored on +/// `OpenHuman API error (401` + `"error":"Invalid token"`). +/// +/// We match on `last_agent_error` first because cron's `run_agent_job` +/// routes the raw anyhow chain there (containing the provider's wire +/// message), while `last_output` only carries the canned user-facing +/// notification (`AGENT_JOB_USER_FAILURE_MESSAGE` / per-variant copy). For +/// the canned-message branch we still fall back to `last_output` so a +/// future code path that surfaces the raw error there isn't a silent miss. +/// +/// Restricted to `JobType::Agent`: shell jobs that happen to echo a +/// 401-shaped string don't go through the inference layer's +/// `SessionExpired` publish, so halting them based on stdout would skip +/// retries the operator may want. +fn is_session_expired_failure( + job_type: &JobType, + last_agent_error: Option<&str>, + last_output: &str, +) -> bool { + if !matches!(job_type, JobType::Agent) { + return false; + } + let signal = last_agent_error.unwrap_or(last_output); + crate::core::observability::is_session_expired_message(signal) +} + async fn execute_job_with_retry( config: &Config, security: &SecurityPolicy, @@ -201,6 +250,7 @@ async fn execute_job_with_retry( let mut last_agent_error: Option = None; let retries = config.reliability.scheduler_retries; let mut backoff_ms = config.reliability.provider_backoff_ms.max(200); + let mut session_expired = false; for attempt in 0..=retries { let (success, output, agent_error) = match job.job_type { @@ -224,6 +274,20 @@ async fn execute_job_with_retry( return (false, last_output); } + if is_session_expired_failure( + &job.job_type, + last_agent_error.as_deref(), + last_output.as_str(), + ) { + // Halt on the first occurrence — the inference layer already + // published `SessionExpired`, retries cannot recover until the + // user re-auths, and the classifier considers this expected + // user state (TAURI-RUST-N). See `is_session_expired_failure` + // for the full rationale. + session_expired = true; + break; + } + if attempt < retries { let jitter_ms = u64::from(Utc::now().timestamp_subsec_millis() % 250); time::sleep(Duration::from_millis(backoff_ms + jitter_ms)).await; @@ -231,7 +295,7 @@ async fn execute_job_with_retry( } } - if matches!(job.job_type, JobType::Agent) { + if matches!(job.job_type, JobType::Agent) && !session_expired { let report_message = last_agent_error .as_deref() .unwrap_or_else(|| last_output.as_str()); diff --git a/src/openhuman/cron/scheduler_tests.rs b/src/openhuman/cron/scheduler_tests.rs index 31e82d8a13..f547d5ac5e 100644 --- a/src/openhuman/cron/scheduler_tests.rs +++ b/src/openhuman/cron/scheduler_tests.rs @@ -302,6 +302,87 @@ async fn execute_job_with_retry_exhausts_attempts() { assert!(output.contains("always_missing_for_retry_test")); } +// TAURI-RUST-N — backend 401 ("Invalid token") leaks from a cron-fired agent +// job through `last_agent_error` and the existing classifier in +// `core::observability::is_session_expired_message` matches it (the +// `OpenHuman API error (401` + `"error":"Invalid token"` conjunction was added +// for OPENHUMAN-TAURI-4P0). `is_session_expired_failure` MUST consult that +// classifier so the cron retry loop halts on the first occurrence instead of +// retrying N times and reporting `failure=retries_exhausted` to Sentry. +#[test] +fn is_session_expired_failure_matches_openhuman_backend_401_in_agent_error() { + let wire = + r#"OpenHuman API error (401 Unauthorized): {"success":false,"error":"Invalid token"}"#; + assert!( + is_session_expired_failure(&JobType::Agent, Some(wire), AGENT_JOB_USER_FAILURE_MESSAGE), + "raw agent error carrying the 401 wire shape must trip the halt" + ); +} + +// Defense-in-depth: if a future code path ever surfaces the raw error in +// `last_output` instead of `last_agent_error` (currently `run_agent_job` +// keeps the canned user message in `last_output`), the predicate should +// still classify. Falling back to `last_output` when `last_agent_error` is +// `None` is what guards against that silent-miss case. +#[test] +fn is_session_expired_failure_matches_when_only_output_carries_signal() { + let wire = + r#"OpenHuman API error (401 Unauthorized): {"success":false,"error":"Invalid token"}"#; + assert!(is_session_expired_failure(&JobType::Agent, None, wire)); +} + +// Negative guard: the canned user-facing message that `run_agent_job` +// routes into `last_output` today carries no session signal. The predicate +// must NOT trip on it — otherwise every generic agent failure (provider +// keys missing, tool error, network blip) would halt after one attempt and +// stop reporting to Sentry, defeating the retry semantics for non-401 +// failures. +#[test] +fn is_session_expired_failure_does_not_match_canned_user_message() { + assert!(!is_session_expired_failure( + &JobType::Agent, + Some(AGENT_JOB_USER_FAILURE_MESSAGE), + AGENT_JOB_USER_FAILURE_MESSAGE, + )); +} + +// Negative guard: ordinary provider-error wire text (e.g. a third-party +// model rejecting a request as 400 / 500 / 429) must not be misclassified +// as session expiry. Those failures are exactly what the retry loop + +// `failure=retries_exhausted` capture exist for. +#[test] +fn is_session_expired_failure_does_not_match_ordinary_provider_error() { + let wire = + r#"OpenHuman API error (500 Internal Server Error): {"error":"Internal server error"}"#; + assert!(!is_session_expired_failure(&JobType::Agent, Some(wire), "")); + + let byo_key = r#"OpenAI API error (401 Unauthorized): {"error":{"message":"Invalid API key","type":"invalid_request_error"}}"#; + assert!( + !is_session_expired_failure(&JobType::Agent, Some(byo_key), ""), + "third-party BYO-key 401 is actionable (user misconfigured their key) — must NOT classify as backend session expiry" + ); +} + +// Scope guard: the halt is restricted to `JobType::Agent` because the +// `SessionExpired` publish + scheduler-gate handshake only fires from the +// inference layer. A shell job that happens to echo the 401-shaped string +// (e.g. an operator's curl wrapper printing the backend response verbatim) +// MUST keep its existing retry semantics — the operator may want those +// retries, and the gate has no reason to be flipped from a shell exit. +#[test] +fn is_session_expired_failure_does_not_halt_shell_jobs() { + let wire = + r#"OpenHuman API error (401 Unauthorized): {"success":false,"error":"Invalid token"}"#; + assert!( + !is_session_expired_failure(&JobType::Shell, None, wire), + "shell jobs must retain retry semantics regardless of stdout content" + ); + assert!( + !is_session_expired_failure(&JobType::Shell, Some(wire), wire), + "shell jobs never populate last_agent_error — but even if a future path did, scope stays Agent-only" + ); +} + #[tokio::test] async fn run_agent_job_returns_error_without_provider_key() { let tmp = TempDir::new().unwrap();