From b9dd2d419c89f39972d03843e7e48fd93f5cc57e Mon Sep 17 00:00:00 2001
From: Ghost Scripter <ghostscripter@zerolend.xyz>
Date: Thu, 4 Jun 2026 15:15:22 +0530
Subject: [PATCH] fix(cron): halt agent-job retry loop on backend
 session-expired (TAURI-RUST-N)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the user's OpenHuman backend JWT lapses, the inference layer's
`api_error` already publishes `DomainEvent::SessionExpired` for the 401
and intentionally skips its own `report_error` call. But the cron retry
loop in `execute_job_with_retry` doesn't consult that signal — it sleeps
with exponential backoff, retries the same job N times (every attempt
hitting the same global 401), then unconditionally calls `report_error`
with `failure=retries_exhausted`. That generated TAURI-RUST-N: 7,038
events / 5 users, all `domain=cron operation=agent_job` from a
`morning_briefing` agent grinding through retries after one lapse.

Fix mirrors PR #3334's halt-on-first-occurrence pattern, but at the
cron retry layer instead of the agent tool loop:

- New `is_session_expired_failure` predicate consults the existing
  `core::observability::is_session_expired_message` classifier (the
  `OpenHuman API error (401` + `"error":"Invalid token"` conjunction
  was already added for OPENHUMAN-TAURI-4P0). Matches on
  `last_agent_error` first (carries the raw provider wire chain),
  falls back to `last_output` for defense-in-depth.
- `execute_job_with_retry` breaks out of the loop on the first
  occurrence — no further attempts, no `report_error` call. The
  inference layer's `SessionExpired` publish already drives the
  credentials/scheduler-gate handshake; retries can't recover until
  the user re-auths.
- Restricted to `JobType::Agent`: shell jobs that happen to echo a
  401-shaped string keep their existing retry semantics (no
  `SessionExpired` publish from shell stdout, no reason to flip the
  gate).

Five focused unit tests pin the predicate behaviour: the 401 wire
shape trips the halt via `last_agent_error` or `last_output`; the
canned user message, ordinary provider errors (incl. third-party
BYO-key 401s), and shell-job invocations all stay on the retry path.

Sentry-Issue: TAURI-RUST-N
---
 src/openhuman/cron/scheduler.rs       | 66 +++++++++++++++++++++-
 src/openhuman/cron/scheduler_tests.rs | 81 +++++++++++++++++++++++++++
 2 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/src/openhuman/cron/scheduler.rs b/src/openhuman/cron/scheduler.rs
index 8202dd0d19..bc6ffba5f1 100644
--- a/src/openhuman/cron/scheduler.rs
+++ b/src/openhuman/cron/scheduler.rs
@@ -192,6 +192,55 @@ pub async fn execute_job_now(config: &Config, job: &CronJob) -> (bool, String) {
     execute_job_with_retry(config, &security, job).await
 }
 
+/// Did this failed agent-job attempt hit the backend session-expired state?
+///
+/// When the OpenHuman backend returns 401 because the user's app JWT has
+/// lapsed, [`inference::provider::ops::api_error`] already publishes
+/// [`crate::core::event_bus::DomainEvent::SessionExpired`] (via
+/// `publish_backend_session_expired`) and the credentials subscriber clears
+/// the stored session + flips the scheduler-gate `signed_out` override. The
+/// gate then halts downstream LLM work until the user re-auths.
+///
+/// The cron retry loop pre-dates that gate handshake: it sleeps with
+/// exponential backoff and retries the same job N times, every attempt
+/// hitting the same global 401, then calls `report_error` with
+/// `failure=retries_exhausted`. That generated TAURI-RUST-N (7,038 events /
+/// 5 users): a cron-fired `morning_briefing` agent grinding through retries
+/// after a single JWT lapse, every retries-exhausted capture pointing at a
+/// problem the user can only fix from the UI.
+///
+/// The right move is the same halt-on-first-occurrence pattern as
+/// `agent::harness::tool_loop::BACKEND_USER_STATE_MARKER` (#3334): the
+/// condition is global and retries can't recover it, so we stop after the
+/// first attempt. Skipping the `report_error` call too is correct because
+/// the existing classifier
+/// [`crate::core::observability::is_session_expired_message`] already
+/// considers this expected user state (`observability.rs` — anchored on
+/// `OpenHuman API error (401` + `"error":"Invalid token"`).
+///
+/// We match on `last_agent_error` first because cron's `run_agent_job`
+/// routes the raw anyhow chain there (containing the provider's wire
+/// message), while `last_output` only carries the canned user-facing
+/// notification (`AGENT_JOB_USER_FAILURE_MESSAGE` / per-variant copy). For
+/// the canned-message branch we still fall back to `last_output` so a
+/// future code path that surfaces the raw error there isn't a silent miss.
+///
+/// Restricted to `JobType::Agent`: shell jobs that happen to echo a
+/// 401-shaped string don't go through the inference layer's
+/// `SessionExpired` publish, so halting them based on stdout would skip
+/// retries the operator may want.
+fn is_session_expired_failure(
+    job_type: &JobType,
+    last_agent_error: Option<&str>,
+    last_output: &str,
+) -> bool {
+    if !matches!(job_type, JobType::Agent) {
+        return false;
+    }
+    let signal = last_agent_error.unwrap_or(last_output);
+    crate::core::observability::is_session_expired_message(signal)
+}
+
 async fn execute_job_with_retry(
     config: &Config,
     security: &SecurityPolicy,
@@ -201,6 +250,7 @@ async fn execute_job_with_retry(
     let mut last_agent_error: Option<String> = None;
     let retries = config.reliability.scheduler_retries;
     let mut backoff_ms = config.reliability.provider_backoff_ms.max(200);
+    let mut session_expired = false;
 
     for attempt in 0..=retries {
         let (success, output, agent_error) = match job.job_type {
@@ -224,6 +274,20 @@ async fn execute_job_with_retry(
             return (false, last_output);
         }
 
+        if is_session_expired_failure(
+            &job.job_type,
+            last_agent_error.as_deref(),
+            last_output.as_str(),
+        ) {
+            // Halt on the first occurrence — the inference layer already
+            // published `SessionExpired`, retries cannot recover until the
+            // user re-auths, and the classifier considers this expected
+            // user state (TAURI-RUST-N). See `is_session_expired_failure`
+            // for the full rationale.
+            session_expired = true;
+            break;
+        }
+
         if attempt < retries {
             let jitter_ms = u64::from(Utc::now().timestamp_subsec_millis() % 250);
             time::sleep(Duration::from_millis(backoff_ms + jitter_ms)).await;
@@ -231,7 +295,7 @@ async fn execute_job_with_retry(
         }
     }
 
-    if matches!(job.job_type, JobType::Agent) {
+    if matches!(job.job_type, JobType::Agent) && !session_expired {
         let report_message = last_agent_error
             .as_deref()
             .unwrap_or_else(|| last_output.as_str());
diff --git a/src/openhuman/cron/scheduler_tests.rs b/src/openhuman/cron/scheduler_tests.rs
index 31e82d8a13..f547d5ac5e 100644
--- a/src/openhuman/cron/scheduler_tests.rs
+++ b/src/openhuman/cron/scheduler_tests.rs
@@ -302,6 +302,87 @@ async fn execute_job_with_retry_exhausts_attempts() {
     assert!(output.contains("always_missing_for_retry_test"));
 }
 
+// TAURI-RUST-N — backend 401 ("Invalid token") leaks from a cron-fired agent
+// job through `last_agent_error` and the existing classifier in
+// `core::observability::is_session_expired_message` matches it (the
+// `OpenHuman API error (401` + `"error":"Invalid token"` conjunction was added
+// for OPENHUMAN-TAURI-4P0). `is_session_expired_failure` MUST consult that
+// classifier so the cron retry loop halts on the first occurrence instead of
+// retrying N times and reporting `failure=retries_exhausted` to Sentry.
+#[test]
+fn is_session_expired_failure_matches_openhuman_backend_401_in_agent_error() {
+    let wire =
+        r#"OpenHuman API error (401 Unauthorized): {"success":false,"error":"Invalid token"}"#;
+    assert!(
+        is_session_expired_failure(&JobType::Agent, Some(wire), AGENT_JOB_USER_FAILURE_MESSAGE),
+        "raw agent error carrying the 401 wire shape must trip the halt"
+    );
+}
+
+// Defense-in-depth: if a future code path ever surfaces the raw error in
+// `last_output` instead of `last_agent_error` (currently `run_agent_job`
+// keeps the canned user message in `last_output`), the predicate should
+// still classify. Falling back to `last_output` when `last_agent_error` is
+// `None` is what guards against that silent-miss case.
+#[test]
+fn is_session_expired_failure_matches_when_only_output_carries_signal() {
+    let wire =
+        r#"OpenHuman API error (401 Unauthorized): {"success":false,"error":"Invalid token"}"#;
+    assert!(is_session_expired_failure(&JobType::Agent, None, wire));
+}
+
+// Negative guard: the canned user-facing message that `run_agent_job`
+// routes into `last_output` today carries no session signal. The predicate
+// must NOT trip on it — otherwise every generic agent failure (provider
+// keys missing, tool error, network blip) would halt after one attempt and
+// stop reporting to Sentry, defeating the retry semantics for non-401
+// failures.
+#[test]
+fn is_session_expired_failure_does_not_match_canned_user_message() {
+    assert!(!is_session_expired_failure(
+        &JobType::Agent,
+        Some(AGENT_JOB_USER_FAILURE_MESSAGE),
+        AGENT_JOB_USER_FAILURE_MESSAGE,
+    ));
+}
+
+// Negative guard: ordinary provider-error wire text (e.g. a third-party
+// model rejecting a request as 400 / 500 / 429) must not be misclassified
+// as session expiry. Those failures are exactly what the retry loop +
+// `failure=retries_exhausted` capture exist for.
+#[test]
+fn is_session_expired_failure_does_not_match_ordinary_provider_error() {
+    let wire =
+        r#"OpenHuman API error (500 Internal Server Error): {"error":"Internal server error"}"#;
+    assert!(!is_session_expired_failure(&JobType::Agent, Some(wire), ""));
+
+    let byo_key = r#"OpenAI API error (401 Unauthorized): {"error":{"message":"Invalid API key","type":"invalid_request_error"}}"#;
+    assert!(
+        !is_session_expired_failure(&JobType::Agent, Some(byo_key), ""),
+        "third-party BYO-key 401 is actionable (user misconfigured their key) — must NOT classify as backend session expiry"
+    );
+}
+
+// Scope guard: the halt is restricted to `JobType::Agent` because the
+// `SessionExpired` publish + scheduler-gate handshake only fires from the
+// inference layer. A shell job that happens to echo the 401-shaped string
+// (e.g. an operator's curl wrapper printing the backend response verbatim)
+// MUST keep its existing retry semantics — the operator may want those
+// retries, and the gate has no reason to be flipped from a shell exit.
+#[test]
+fn is_session_expired_failure_does_not_halt_shell_jobs() {
+    let wire =
+        r#"OpenHuman API error (401 Unauthorized): {"success":false,"error":"Invalid token"}"#;
+    assert!(
+        !is_session_expired_failure(&JobType::Shell, None, wire),
+        "shell jobs must retain retry semantics regardless of stdout content"
+    );
+    assert!(
+        !is_session_expired_failure(&JobType::Shell, Some(wire), wire),
+        "shell jobs never populate last_agent_error — but even if a future path did, scope stays Agent-only"
+    );
+}
+
 #[tokio::test]
 async fn run_agent_job_returns_error_without_provider_key() {
     let tmp = TempDir::new().unwrap();