tinyhumansai · CodeGhost21 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -305,17 +305,64 @@ pub(crate) async fn run_one_tool(
             }
         }
         Ok(Err(e)) => {
-            crate::core::observability::report_error(
-                &e,
-                "tool",
-                "execute",
-                &[
-                    ("tool", call.name.as_str()),
-                    ("outcome", "failed"),
-                    ("iteration", &(iteration + 1).to_string()),
-                ],
-            );
-            (format!("Error executing {}: {e}", call.name), false)
+            // Distinguish user-state failures (out of credits, missing
+            // required field, toolkit not enabled, …) from system / product
+            // failures. Tools that call the integrations backend attach a
+            // typed `BackendUserStateError` marker for the user-state case
+            // (see `openhuman::integrations::client`). For these:
+            //
+            // 1. Route through `report_error_or_expected` instead of the
+            //    always-capture `report_error`. The breadcrumb classifier
+            //    demotes the buckets we care about
+            //    (`BackendUserError` / `BudgetExhausted` /
+            //    `ProviderUserState`) to a `warn` — Sentry sees one
+            //    demoted breadcrumb per turn instead of the always-error
+            //    capture. We don't *silently* skip — observability still
+            //    sees the failure, it's just classified correctly.
+            //
+            // 2. Embed `BACKEND_USER_STATE_MARKER` in the LLM-visible
+            //    result text so the caller-side
+            //    [`crate::openhuman::agent::harness::tool_loop::RepeatFailureGuard`]
+            //    halts the agent loop on the FIRST occurrence with a
+            //    "user must act" message. Without this, the agent flooded
+            //    Sentry with ~19 retries per turn for ~9 users
+            //    (TAURI-RUST-5KG, ~1860 hits) because the per-(tool,args)
+            //    breaker can't catch varying queries and the consecutive
+            //    breaker resets on interleaved success. The real bug isn't
+            //    the Sentry noise — it's the runaway retry. Halting kills
+            //    both.
+            if crate::openhuman::integrations::is_backend_user_state_error(&e) {
+                crate::core::observability::report_error_or_expected(
+                    &e,
+                    "tool",
+                    "execute",
+                    &[
+                        ("tool", call.name.as_str()),
+                        ("outcome", "failed_user_state"),
+                        ("iteration", &(iteration + 1).to_string()),
+                    ],
+                );
+                (
+                    format!(
+                        "{} Error executing {}: {e}",
+                        super::super::tool_loop::BACKEND_USER_STATE_MARKER,
+                        call.name,
+                    ),
+                    false,
+                )
+            } else {
+                crate::core::observability::report_error(
+                    &e,
+                    "tool",
+                    "execute",
+                    &[
+                        ("tool", call.name.as_str()),
+                        ("outcome", "failed"),
+                        ("iteration", &(iteration + 1).to_string()),
+                    ],
+                );
+                (format!("Error executing {}: {e}", call.name), false)
+            }
         }
         Err(_) => {
             let msg = format!(

@@ -41,6 +41,29 @@ pub(crate) const NO_PROGRESS_FAILURE_THRESHOLD: u32 = 6;
 /// and pivot to a different, allowed approach.
 pub(crate) const HARD_REJECT_REPEAT_THRESHOLD: u32 = 2;
 
+/// Stable marker the tool runner embeds in a failed tool result when the
+/// underlying error carries the typed
+/// [`crate::openhuman::integrations::BackendUserStateError`] — i.e. the backend
+/// returned a deterministic *user-state* failure (insufficient balance, missing
+/// required field, toolkit not enabled, sign-in expired, …).
+///
+/// Unlike `(tool, args)`-coupled hard rejects, the underlying condition is
+/// **global**: the user's wallet is empty, or a toolkit they don't control is
+/// disabled. Retrying the same tool with a different query, or pivoting to a
+/// different paid integration, cannot resolve it — only the user can. The
+/// breaker therefore halts on the **first** occurrence rather than allowing
+/// the LLM to grind through a generic [`REPEAT_FAILURE_THRESHOLD`] of
+/// indistinguishable failures.
+///
+/// This is the actual fix for TAURI-RUST-5KG: `web_search_tool` flooded
+/// Sentry with ~19 events/turn × 9 users (1860 hits) because the agent kept
+/// retrying after a `400 Insufficient balance` — varying the search query so
+/// the per-`(tool,args)` breaker never tripped and interleaving the failures
+/// with succeeding tool calls so the consecutive-failure breaker reset. Halt
+/// on first occurrence stops the runaway directly instead of just routing
+/// the captured errors away from Sentry.
+pub(crate) const BACKEND_USER_STATE_MARKER: &str = "[backend-user-state]";
+
 /// Classification of a deterministic, recognizable policy rejection, detected via
 /// the stable markers the security/approval layers emit
 /// ([`crate::openhuman::security::POLICY_BLOCKED_MARKER`] /
@@ -108,6 +131,25 @@ impl RepeatFailureGuard {
             *c += 1;
             *c
         };
+        // Backend user-state failures (insufficient balance, toolkit disabled,
+        // missing required field, …) are a *global* deterministic condition
+        // the agent cannot resolve — only the user can. Halt on the FIRST
+        // occurrence rather than letting the model burn the generic
+        // identical-retry threshold by varying args, and rather than waiting
+        // for the consecutive-failure breaker (which resets on any interleaved
+        // success). See [`BACKEND_USER_STATE_MARKER`] for the TAURI-RUST-5KG
+        // background.
+        if result.contains(BACKEND_USER_STATE_MARKER) {
+            return Some(format!(
+                "Stopping: the `{tool}` call returned a backend user-state error \
+                 — this is a deterministic condition that requires user action \
+                 (e.g. add credits, enable the toolkit, sign in). Retrying \
+                 with different arguments or a different paid tool will not \
+                 help. Reason:\n{}\n\nReport this back to the user instead of \
+                 trying alternative tools.",
+                truncate_for_halt(result),
+            ));
+        }
         // Hard policy rejections trip on the first verbatim repeat; everything
         // else uses the generic identical-retry threshold.
         let hard = hard_reject_kind(result);

@@ -1107,6 +1107,109 @@ fn hard_reject_distinct_args_do_not_trip_repeat() {
     );
 }
 
+// -- Backend user-state marker (TAURI-RUST-5KG, halt on FIRST occurrence) -------
+
+#[test]
+fn backend_user_state_marker_halts_on_first_occurrence() {
+    // The actual fix for TAURI-RUST-5KG (`web_search_tool` flooded Sentry with
+    // ~1860 events because the agent retried 19× per turn on a 400 Insufficient
+    // balance). Unlike hard policy rejects (which let the model try once and
+    // halt on the verbatim repeat), the underlying condition is *global* —
+    // varying the query or pivoting to a different paid tool cannot resolve
+    // an empty wallet. Halt immediately.
+    // `BACKEND_USER_STATE_MARKER` is `pub(crate)` in the parent `tool_loop`
+    // module and reached here via `use super::*;` at the top of this file.
+    let mut g = RepeatFailureGuard::new();
+    let body = format!(
+        "{BACKEND_USER_STATE_MARKER} Error executing web_search_tool: \
+         Backend returned 400 Bad Request for POST /agent-integrations/parallel/search: \
+         Insufficient balance"
+    );
+    let halt = g.record(
+        "web_search_tool",
+        "{\"query\":\"latest news\"}",
+        false,
+        &body,
+    );
+    assert!(
+        halt.is_some(),
+        "first backend-user-state failure must halt — retries can never resolve a global condition"
+    );
+    let msg = halt.unwrap();
+    assert!(
+        msg.contains("backend user-state error"),
+        "halt summary should label the failure class; got: {msg}"
+    );
+    assert!(
+        msg.contains("requires user action"),
+        "halt summary should tell the model to surface it to the user; got: {msg}"
+    );
+    assert!(
+        msg.contains("Insufficient balance"),
+        "halt summary should preserve the actionable root cause; got: {msg}"
+    );
+}
+
+#[test]
+fn backend_user_state_marker_halts_regardless_of_args() {
+    // The whole point of the first-occurrence semantic: the failure is global,
+    // so a *different* query that re-hits the same condition is still doomed.
+    // Pin that the breaker doesn't wait for an identical `(tool, args)` repeat
+    // the way the generic [`REPEAT_FAILURE_THRESHOLD`] would.
+    // `BACKEND_USER_STATE_MARKER` is `pub(crate)` in the parent `tool_loop`
+    // module and reached here via `use super::*;` at the top of this file.
+    let mut g = RepeatFailureGuard::new();
+    let body = format!("{BACKEND_USER_STATE_MARKER} Error: Insufficient balance");
+    assert!(
+        g.record("web_search_tool", "{\"query\":\"q1\"}", false, &body)
+            .is_some(),
+        "first call with a backend-user-state marker must halt even though \
+         (tool, args) hasn't repeated yet"
+    );
+}
+
+#[test]
+fn backend_user_state_marker_takes_precedence_over_generic_threshold() {
+    // If we ever broke ordering and the marker check sat *after* the generic
+    // `(tool, args)` repeat-threshold gate, a first-occurrence user-state
+    // failure would silently fall through to "not enough repeats yet" and the
+    // agent would keep retrying — defeating the whole point. Lock the order
+    // in with a direct assertion.
+    // `BACKEND_USER_STATE_MARKER` is `pub(crate)` in the parent `tool_loop`
+    // module and reached here via `use super::*;` at the top of this file.
+    let mut g = RepeatFailureGuard::new();
+    // Same (tool, args, body) shape that the closed PR would have allowed
+    // through 2 more times. With the marker check first, count=1 already
+    // trips. count==1 < REPEAT_FAILURE_THRESHOLD==3 → without the new gate
+    // this test would fail.
+    let body = format!("{BACKEND_USER_STATE_MARKER} Insufficient balance");
+    let halt = g.record("web_search_tool", "{}", false, &body);
+    assert!(halt.is_some(), "count=1 user-state failure must halt now");
+}
+
+#[test]
+fn backend_user_state_unmarked_failures_use_normal_threshold() {
+    // Regression guard: a tool error that *doesn't* carry the marker must
+    // continue to use the generic 3-attempt repeat threshold. The new gate
+    // should not widen to affect ordinary tool failures.
+    let mut g = RepeatFailureGuard::new();
+    // Backend-shaped 5xx — no marker, since `classify_as_user_state` rejects
+    // these and `is_backend_user_state_error` is false → no prefix added.
+    let body = "Error executing tool_x: Backend returned 500 for POST /foo: upstream blew up";
+    assert!(
+        g.record("tool_x", "{}", false, body).is_none(),
+        "first non-marker failure should not halt"
+    );
+    assert!(
+        g.record("tool_x", "{}", false, body).is_none(),
+        "second non-marker failure should not halt"
+    );
+    assert!(
+        g.record("tool_x", "{}", false, body).is_some(),
+        "third identical non-marker failure must halt via the 3-attempt threshold"
+    );
+}
+
 /// Provider that records the tool-spec names of every `chat()` request
 /// it sees, then returns the next scripted response.
 struct CapturingProvider {