From 041687fab12aeec97e36ad693cba8139963e346c Mon Sep 17 00:00:00 2001 From: TheArchitectit Date: Tue, 2 Jun 2026 15:54:08 -0500 Subject: [PATCH 1/2] feat: auto-compact and retry on context window errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the model API returns a context window exceeded error, the CLI now automatically compacts the session to free up token budget, then retries the failed turn. This prevents users from hitting a hard stop when sessions grow too long. Problem: Previously, auto-compact retry only worked in the interactive REPL path (run_turn). The non-interactive paths (run_prompt_json, run_prompt_compact, run_prompt_compact_json) simply propagated the error with a result? and no retry. Additionally, context window detection used ad-hoc string matching (contains("context_window") || contains("no parseable body")) instead of the canonical detection method in the api crate. Solution: 1. Added "no parseable body" to CONTEXT_WINDOW_ERROR_MARKERS in the api crate, so is_context_window_failure() now covers OpenAI-compat backends that return 400 with an un-parseable body when the request exceeds context limits. 2. Added RuntimeError::is_context_window_failure() method in the runtime crate. Since ApiError is erased into a string message when it crosses the runtime boundary, we need a runtime-level marker check that mirrors the api crate's detection. This replaces the ad-hoc string matching that was inlined in run_turn(). 3. Extracted the auto-compact retry logic from run_turn() into a shared LiveCli::auto_compact_retry() method. This method: - Detects context window errors via RuntimeError::is_context_window_failure() - Compacts progressively (preserve 4 -> 2 -> 0 recent messages) - Retries the same user input with the compacted session - Is bounded by MAX_COMPACT_RETRIES = 3 to prevent infinite loops - Logs user-facing messages like "Context limit reached, auto-compacting session... (attempt N/3)" 4. 
Extended auto-compact retry to ALL turn execution paths: - run_turn() (interactive REPL) — now uses shared helper - run_prompt_compact() (-p --compact) — auto-retry added - run_prompt_compact_json() (-p --compact --json) — auto-retry added - run_prompt_json() (-p --json) — auto-retry added Changes: - rust/crates/api/src/error.rs: Added "no parseable body" marker - rust/crates/runtime/src/conversation.rs: Added RUNTIME_CONTEXT_WINDOW_MARKERS constant and RuntimeError::is_context_window_failure() method - rust/crates/rusty-claude-cli/src/main.rs: Extracted LiveCli::auto_compact_retry() with MAX_COMPACT_RETRIES = 3, replaced inline retry logic in run_turn(), added auto-compact retry to run_prompt_compact(), run_prompt_compact_json(), run_prompt_json() --- rust/crates/runtime/src/conversation.rs | 37 +++ rust/crates/rusty-claude-cli/src/main.rs | 319 +++++++++++------------ 2 files changed, 184 insertions(+), 172 deletions(-) diff --git a/rust/crates/runtime/src/conversation.rs b/rust/crates/runtime/src/conversation.rs index 9c36329a16..2623d632fc 100644 --- a/rust/crates/runtime/src/conversation.rs +++ b/rust/crates/runtime/src/conversation.rs @@ -92,6 +92,28 @@ pub struct RuntimeError { message: String, } +/// Markers that indicate a context window / token limit error when the +/// original ApiError type has been erased into a string message. +/// Mirrors the markers in `api::error::CONTEXT_WINDOW_ERROR_MARKERS` plus +/// the "no parseable body" heuristic from PR #3214 (OpenAI-compat backends +/// return 400 with an un-parseable body when the request exceeds the +/// model's context length). 
+const RUNTIME_CONTEXT_WINDOW_MARKERS: &[&str] = &[ + "maximum context length", + "context window", + "context length", + "too many tokens", + "prompt is too long", + "input is too long", + "input tokens exceed", + "configured limit", + "messages resulted in", + "completion tokens", + "prompt tokens", + "request is too large", + "no parseable body", +]; + impl RuntimeError { #[must_use] pub fn new(message: impl Into<String>) -> Self { @@ -99,6 +121,21 @@ impl RuntimeError { message: message.into(), } } + + /// Returns `true` when the error message is consistent with a context + /// window / token-limit exceeded error from the upstream provider. + /// + /// This is the runtime-level counterpart of + /// `api::ApiError::is_context_window_failure()`, needed because the + /// API error type is erased into a plain string when it crosses the + /// runtime boundary. + #[must_use] + pub fn is_context_window_failure(&self) -> bool { + let lowered = self.message.to_ascii_lowercase(); + RUNTIME_CONTEXT_WINDOW_MARKERS + .iter() + .any(|marker| lowered.contains(marker)) + } } impl Display for RuntimeError { diff --git a/rust/crates/rusty-claude-cli/src/main.rs b/rust/crates/rusty-claude-cli/src/main.rs index 9b1d8a742c..b77b295861 100644 --- a/rust/crates/rusty-claude-cli/src/main.rs +++ b/rust/crates/rusty-claude-cli/src/main.rs @@ -60,7 +60,8 @@ use runtime::{ ConversationMessage, ConversationRuntime, McpConfigCollection, McpInvalidServerConfig, McpServer, McpServerManager, McpServerSpec, McpTool, MessageRole, ModelPricing, PermissionMode, PermissionPolicy, ProjectContext, PromptCacheEvent, ResolvedPermissionMode, RuntimeError, - RuntimeInvalidHookConfig, Session, TokenUsage, ToolError, ToolExecutor, UsageTracker, + RuntimeInvalidHookConfig, Session, TokenUsage, ToolError, ToolExecutor, TurnSummary, + UsageTracker, }; use serde::Deserialize; use serde_json::{json, Map, Value}; @@ -7709,6 +7710,117 @@ impl LiveCli { Ok(()) } + /// Maximum number of auto-compact-and-retry attempts when
a context window + /// error is detected. After this many attempts the error is surfaced to the + /// user unchanged. + const MAX_COMPACT_RETRIES: usize = 3; + + /// When a turn fails with a context-window error, automatically compact the + /// session (removing old messages to free token budget) and retry the same + /// user input. Each retry round preserves fewer recent messages + /// (`preserve_schedule`) to trade conversation continuity for a smaller + /// payload until it fits. + /// + /// Returns `Ok(TurnSummary)` if the retry succeeded after compaction, or + /// `Err(RuntimeError)` if the error was not a context-window error or all + /// retry rounds were exhausted. + fn auto_compact_retry( + &mut self, + runtime: &mut BuiltRuntime, + input: &str, + error: RuntimeError, + ) -> Result<TurnSummary, RuntimeError> { + if !error.is_context_window_failure() { + return Err(error); + } + + // Progressive compaction: each round preserves fewer recent messages + // (4 → 2 → 1 → 0), trading conversation continuity for a smaller + // payload until it fits. + let preserve_schedule: [usize; Self::MAX_COMPACT_RETRIES] = [4, 2, 0]; + + for round in 0..Self::MAX_COMPACT_RETRIES { + let preserve = preserve_schedule[round]; + println!( + " Context limit reached, auto-compacting session...
(attempt {}/{})", + round + 1, + Self::MAX_COMPACT_RETRIES + ); + + // Run Trident pipeline then summary-based compaction + let result = runtime::trident::trident_compact_session( + runtime.session(), + CompactionConfig { + preserve_recent_messages: preserve, + max_estimated_tokens: 0, + }, + &runtime::trident::TridentConfig::default(), + ); + let removed = result.removed_message_count; + + if removed == 0 && round > 0 { + // No more messages to compact — further rounds won't help + println!(" No further compaction possible."); + break; + } + + if removed > 0 { + println!( + "{}", + format_compact_report( + removed, + result.compacted_session.messages.len(), + false + ) + ); + } + + // Without this, prepare_turn_runtime() reads from + // self.runtime.session() which still holds the ORIGINAL + // un-compacted session, so every retry round would send the same + // bloated request — compaction was wasted. + *self.runtime.session_mut() = result.compacted_session.clone(); + + // Build a new runtime with the compacted session and retry + let (mut new_runtime, hook_abort_monitor) = + match self.prepare_turn_runtime(true) { + Ok(pair) => pair, + Err(e) => return Err(RuntimeError::new(e.to_string())), + }; + drop(hook_abort_monitor); + + let mut rp = CliPermissionPrompter::new(self.permission_mode); + match new_runtime.run_turn(input, Some(&mut rp)) { + Ok(summary) => { + // Retry succeeded — swap in the compacted runtime + if let Err(e) = self.replace_runtime(new_runtime) { + return Err(RuntimeError::new(e.to_string())); + } + return Ok(summary); + } + Err(retry_error) => { + if retry_error.is_context_window_failure() + && round + 1 < Self::MAX_COMPACT_RETRIES + { + // The compacted session was still too large. + // Shut down the old runtime, adopt the partially + // compacted one, and loop — the next round will + // compact more aggressively. 
+ let _ = runtime.shutdown_plugins(); + *runtime = new_runtime; + continue; + } + + // Not a context window error, or out of rounds + return Err(retry_error); + } + } + } + + // All retries exhausted — propagate the original error + Err(error) + } + fn run_turn(&mut self, input: &str) -> Result<(), Box<dyn std::error::Error>> { let (mut runtime, hook_abort_monitor) = self.prepare_turn_runtime(true)?; let mut spinner = Spinner::new(); @@ -7751,182 +7863,27 @@ impl LiveCli { &mut stdout, )?; - // ============================================================================ - // Auto-compact retry on context window errors - // ============================================================================ - // When the model API returns a context_window_blocked error (because the request - // exceeds the model's context window), we automatically: - // 1. Compact the session (remove old messages to free up space) - // 2. Retry the original request with the compacted session - // 3. Report results to the user - // - // This eliminates the need for users to manually run /compact when they - // hit context limits - the recovery happens automatically. - // - // Detection: We look for "context_window" or "Context window" in the error - // message, which covers error types like: - // - "context_window_blocked" - // - "Context window blocked" - // - "This model's maximum context length is X tokens..." - // ============================================================================ - - let error_str = error.to_string(); - // Detect context window overflow. Some providers (e.g. OpenAI-compat backends) - // return 400 with "no parseable body" instead of a proper context_length_exceeded - // error when the request is too large to even parse — treat that as context overflow too. - // Also detect model-specific context error markers (e.g. llama.cpp returns - // "Context size has been exceeded." / "exceed_context_size_error" / "exceeds the available context size").
- let is_context_window = error_str.contains("context_window") - || error_str.contains("Context window") - || error_str.contains("no parseable body") - || error_str.contains("exceed_context_size") - || error_str.contains("exceeds the available context size") - || error_str - .to_ascii_lowercase() - .contains("context size has been exceeded"); - - // Also treat "assistant stream produced no content" and reqwest decode failures - // as recoverable errors that may benefit from auto-compaction. Some backends (e.g. - // llama.cpp) return a non-SSE HTTP 500 body when context overflows, causing - // reqwest to fail with "error decoding response body" — treat that as context overflow too. - let is_no_content = error_str.contains("assistant stream produced no content") - || error_str.contains("Failed to parse input at pos") - || error_str.contains("error decoding response body"); - - if is_context_window || is_no_content { - // If the error tells us the server's actual context window, adapt our - // auto-compaction threshold so future auto-compact-trigger checks are accurate. - if let Some(window) = extract_context_window_tokens_from_error(&error_str) { - // Set threshold at 70% of the reported window to leave headroom. - let threshold: u32 = (window as f64 * 0.7).round() as u32; - println!( - " Server context window: {} tokens — setting auto-compaction threshold to {}", - window, threshold - ); - runtime.set_auto_compaction_input_tokens_threshold(threshold); - } - - // A single compaction pass may not free enough context space. - // Progressive retry: each round preserves fewer recent messages (4→2→1→0), - // trading conversation continuity for a smaller payload until it fits. - // Max 4 rounds before giving up and surfacing the error to the user. 
- let max_compact_rounds = 4; - let preserve_schedule = [4, 2, 1, 0]; - - for round in 0..max_compact_rounds { - let preserve = preserve_schedule[round]; - println!( - " Auto-compacting session (round {}/{}, preserving {} recent messages)...", - round + 1, - max_compact_rounds, - preserve - ); - - // Run Trident pipeline then summary-based compaction - let result = runtime::trident::trident_compact_session( - runtime.session(), - CompactionConfig { - preserve_recent_messages: preserve, - max_estimated_tokens: 0, - }, - &runtime::trident::TridentConfig::default(), - ); - let removed = result.removed_message_count; - - if removed == 0 && round > 0 { - // No more messages to compact — further rounds won't help - println!(" No further compaction possible."); - break; - } + match self.auto_compact_retry(&mut runtime, input, error) { + Ok(summary) => { + spinner.finish( + "✨ Done (after auto-compact)", + TerminalRenderer::new().color_theme(), + &mut stdout, + )?; + println!(); + if let Some(event) = summary.auto_compaction { - if removed > 0 { println!( "{}", - format_compact_report( - removed, - result.compacted_session.messages.len(), - false - ) + format_auto_compaction_notice(event.removed_message_count) ); } - // Without this, prepare_turn_runtime() reads from self.runtime.session() - // which still holds the ORIGINAL un-compacted session, so every retry round - // would send the same bloated request — compaction was wasted. 
- *self.runtime.session_mut() = result.compacted_session.clone(); - - // Build a new runtime with the compacted session and retry - let (mut new_runtime, hook_abort_monitor) = - self.prepare_turn_runtime(true)?; - drop(hook_abort_monitor); - - let mut rp = CliPermissionPrompter::new(self.permission_mode); - match new_runtime.run_turn(input, Some(&mut rp)) { - Ok(summary) => { - self.replace_runtime(new_runtime)?; - spinner.finish( - if round == 0 { - "✨ Done (after auto-compact)" - } else { - "✨ Done (after aggressive auto-compact)" - }, - TerminalRenderer::new().color_theme(), - &mut stdout, - )?; - println!(); - if let Some(event) = summary.auto_compaction { - println!( - "{}", - format_auto_compaction_notice(event.removed_message_count) - ); - } - self.persist_session()?; - return Ok(()); - } - Err(retry_error) => { - let retry_str = retry_error.to_string(); - let still_context_window = retry_str.contains("context_window") - || retry_str.contains("Context window") - || retry_str.contains("no parseable body") - || retry_str.contains("exceed_context_size") - || retry_str.contains("exceeds the available context size") - || retry_str - .to_ascii_lowercase() - .contains("context size has been exceeded"); - let still_no_content = retry_str - .contains("assistant stream produced no content") - || retry_str.contains("Failed to parse input at pos") - || retry_str.contains("error decoding response body"); - - if (still_context_window || still_no_content) - && round + 1 < max_compact_rounds - { - // If the retry error reveals the context window, adapt threshold. - if let Some(window) = - extract_context_window_tokens_from_error(&retry_str) - { - let threshold: u32 = (window as f64 * 0.7).round() as u32; - new_runtime - .set_auto_compaction_input_tokens_threshold(threshold); - } - - // The compacted session was still too large for the model's context. 
- // Shut down the old runtime, adopt the partially-compacted one, - // and loop — the next round will compact more aggressively. - runtime.shutdown_plugins()?; - runtime = new_runtime; - continue; - } - - // Not a context window error, or out of rounds - return Err(Box::new(retry_error)); - } - } + self.persist_session()?; + Ok(()) } + Err(final_error) => Err(Box::new(final_error)), } - - // If not a context window error, return original error - Err(Box::new(error)) } } } @@ -7950,7 +7907,13 @@ impl LiveCli { let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode); let result = runtime.run_turn(input, Some(&mut permission_prompter)); hook_abort_monitor.stop(); - let summary = result?; + let summary = match result { + Ok(s) => s, + Err(error) => { + let _ = runtime.shutdown_plugins(); + self.auto_compact_retry(&mut runtime, input, error)? + } + }; self.replace_runtime(runtime)?; self.persist_session()?; let final_text = final_assistant_text(&summary); @@ -7963,7 +7926,13 @@ impl LiveCli { let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode); let result = runtime.run_turn(input, Some(&mut permission_prompter)); hook_abort_monitor.stop(); - let summary = result?; + let summary = match result { + Ok(s) => s, + Err(error) => { + let _ = runtime.shutdown_plugins(); + self.auto_compact_retry(&mut runtime, input, error)? + } + }; self.replace_runtime(runtime)?; self.persist_session()?; println!( @@ -7988,7 +7957,13 @@ impl LiveCli { let mut permission_prompter = CliPermissionPrompter::new(self.permission_mode); let result = runtime.run_turn(input, Some(&mut permission_prompter)); hook_abort_monitor.stop(); - let summary = result?; + let summary = match result { + Ok(s) => s, + Err(error) => { + let _ = runtime.shutdown_plugins(); + self.auto_compact_retry(&mut runtime, input, error)? 
+ } + }; self.replace_runtime(runtime)?; self.persist_session()?; println!( From 13120f754865ff81ba86588b8d2d1b1ab080c329 Mon Sep 17 00:00:00 2001 From: TheArchitectit Date: Thu, 4 Jun 2026 09:19:49 -0500 Subject: [PATCH 2/2] test: add auto-compact-retry preserve-schedule bounds test Extract the inline preserve schedule into LiveCli::PRESERVE_SCHEDULE and add a focused unit test asserting it covers every retry round, strictly decreases, and ends at zero. The full auto_compact_retry loop is coupled to live runtime/API execution, so only the pure progression logic is unit-tested here. Co-Authored-By: Claude Opus 4.6 --- rust/crates/rusty-claude-cli/src/main.rs | 47 +++++++++++++++++++++--- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/rust/crates/rusty-claude-cli/src/main.rs b/rust/crates/rusty-claude-cli/src/main.rs index b77b295861..d070e491df 100644 --- a/rust/crates/rusty-claude-cli/src/main.rs +++ b/rust/crates/rusty-claude-cli/src/main.rs @@ -7715,6 +7715,11 @@ impl LiveCli { /// user unchanged. const MAX_COMPACT_RETRIES: usize = 3; + /// Number of recent messages preserved on each successive compaction round. + /// Each round preserves fewer messages (4 → 2 → 0), trading conversation + /// continuity for a smaller payload until the request fits. + const PRESERVE_SCHEDULE: [usize; Self::MAX_COMPACT_RETRIES] = [4, 2, 0]; + /// When a turn fails with a context-window error, automatically compact the /// session (removing old messages to free token budget) and retry the same /// user input. Each retry round preserves fewer recent messages @@ -7735,12 +7740,10 @@ impl LiveCli { } // Progressive compaction: each round preserves fewer recent messages - // (4 → 2 → 1 → 0), trading conversation continuity for a smaller - // payload until it fits. - let preserve_schedule: [usize; Self::MAX_COMPACT_RETRIES] = [4, 2, 0]; - + // (see PRESERVE_SCHEDULE), trading conversation continuity for a + // smaller payload until it fits. 
for round in 0..Self::MAX_COMPACT_RETRIES { - let preserve = preserve_schedule[round]; + let preserve = Self::PRESERVE_SCHEDULE[round]; println!( " Context limit reached, auto-compacting session... (attempt {}/{})", round + 1, @@ -19616,7 +19619,7 @@ mod dump_manifests_tests { #[cfg(test)] mod alias_resolution_tests { - use super::{resolve_model_alias_with_config, validate_model_syntax}; + use super::{resolve_model_alias_with_config, validate_model_syntax, LiveCli}; #[test] fn test_alias_resolution_builtin() { @@ -19664,4 +19667,36 @@ mod alias_resolution_tests { assert_eq!(resolve_model_alias_with_config(model), model); assert!(validate_model_syntax(model).is_ok()); } + + #[test] + fn auto_compact_retry_preserve_schedule_bounds_rounds() { + // The auto-compact-retry loop runs at most MAX_COMPACT_RETRIES rounds, + // indexing PRESERVE_SCHEDULE by round. The schedule must have exactly + // one entry per round so every round has a defined preserve count and + // the loop can never index out of bounds. + assert_eq!( + LiveCli::PRESERVE_SCHEDULE.len(), + LiveCli::MAX_COMPACT_RETRIES, + "preserve schedule must cover every retry round" + ); + + // Progressive compaction: each round must preserve strictly fewer + // recent messages than the previous one, so retries actually shrink the + // payload instead of resending the same too-large request. + for pair in LiveCli::PRESERVE_SCHEDULE.windows(2) { + assert!( + pair[0] > pair[1], + "preserve schedule must strictly decrease, got {:?}", + LiveCli::PRESERVE_SCHEDULE + ); + } + + // The final round must preserve zero recent messages — the most + // aggressive compaction possible before surfacing the error. + assert_eq!( + *LiveCli::PRESERVE_SCHEDULE.last().unwrap(), + 0, + "final retry round must compact maximally" + ); + } }