From d1b9e4598cb8a7291116268740c7c1f15ac91d77 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 24 Apr 2026 11:12:04 +0900 Subject: [PATCH 1/4] fix: extend invalid-model hint for qwen prefix to mention local Ollama path When users type bare qwen model names (e.g., qwen3.5:9b), the validator already suggested qwen/ prefix for DashScope. Now it also documents the local Ollama path: use openai/ prefix with OPENAI_BASE_URL and OPENAI_API_KEY=ollama. This addresses the confusion observed when debugging Ollama integration after the upstream merge, where users might follow the DashScope hint but actually want to run models locally via Ollama. --- rust/crates/rusty-claude-cli/src/main.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rust/crates/rusty-claude-cli/src/main.rs b/rust/crates/rusty-claude-cli/src/main.rs index 38974eb5f3..75765382fb 100644 --- a/rust/crates/rusty-claude-cli/src/main.rs +++ b/rust/crates/rusty-claude-cli/src/main.rs @@ -2943,6 +2943,11 @@ fn validate_model_syntax(model: &str) -> Result<(), String> { err_msg.push_str("\nDid you mean `qwen/"); err_msg.push_str(trimmed); err_msg.push_str("`? (Requires DASHSCOPE_API_KEY env var)"); + err_msg.push_str("\nOr for a local Ollama server: use `openai/"); + err_msg.push_str(trimmed); + err_msg.push_str( + "` with OPENAI_BASE_URL=http://127.0.0.1:11434/v1 and OPENAI_API_KEY=ollama", + ); } else if trimmed.starts_with("grok") { err_msg.push_str("\nDid you mean `xai/"); err_msg.push_str(trimmed); From 5d67e76788294f676d566c8f90243cbe83e0d649 Mon Sep 17 00:00:00 2001 From: "Heo, Sung" Date: Sat, 6 Jun 2026 08:09:18 +0900 Subject: [PATCH 2/4] fix(api): surface Ollama `reasoning` field for Qwen3 thinking models Ollama's Qwen3 thinking models stream the chain-of-thought in a `reasoning` field with an empty `content`. The OpenAI-compatible provider only recognised `reasoning_content`/`thinking`, so reasoning-only responses surfaced no text at all. 
Add `reasoning` to both the streaming (`ChunkDelta`) and non-streaming (`ChatMessage`) shapes and fall back to it when extracting thinking, preserving the existing thinking/text content-block separation. Covered by two new tests for the streaming and non-streaming paths. Co-Authored-By: Claude Opus 4.8 --- .../crates/api/src/providers/openai_compat.rs | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/rust/crates/api/src/providers/openai_compat.rs b/rust/crates/api/src/providers/openai_compat.rs index 7f82d00ceb..9788742447 100644 --- a/rust/crates/api/src/providers/openai_compat.rs +++ b/rust/crates/api/src/providers/openai_compat.rs @@ -562,6 +562,7 @@ impl StreamState { .delta .reasoning_content .filter(|value| !value.is_empty()) + .or(choice.delta.reasoning.filter(|value| !value.is_empty())) .or(choice .delta .thinking @@ -816,6 +817,9 @@ struct ChatMessage { content: Option<String>, #[serde(default)] reasoning_content: Option<String>, + /// Ollama's Qwen3 thinking models return chain-of-thought here with `content=""`. + #[serde(default)] + reasoning: Option<String>, #[serde(default)] tool_calls: Vec, } @@ -890,6 +894,9 @@ struct ChunkDelta { /// Some providers (GLM, DeepSeek) emit reasoning in `reasoning_content` #[serde(default)] reasoning_content: Option<String>, + /// Ollama's Qwen3 thinking models stream tokens here with `content=""`.
+ #[serde(default)] + reasoning: Option<String>, #[serde(default)] thinking: Option<String>, #[serde(default, deserialize_with = "deserialize_null_as_empty_vec")] @@ -1500,6 +1507,7 @@ fn normalize_response( .message .reasoning_content .filter(|value| !value.is_empty()) + .or_else(|| choice.message.reasoning.filter(|value| !value.is_empty())) { content.push(OutputContentBlock::Thinking { thinking, @@ -1982,6 +1990,7 @@ mod tests { role: "assistant".to_string(), content: Some("final answer".to_string()), reasoning_content: Some("hidden thought".to_string()), + reasoning: None, tool_calls: Vec::new(), }, finish_reason: Some("stop".to_string()), @@ -2007,6 +2016,112 @@ mod tests { ); } + #[test] + fn non_streaming_response_with_ollama_reasoning_field_emits_thinking_block() { + // Given an Ollama-style response that carries the chain-of-thought in + // `reasoning` (not `reasoning_content`) — Qwen3 thinking models emit an + // empty `content` and put the trace in `reasoning`. + let response = super::ChatCompletionResponse { + id: "chatcmpl_ollama".to_string(), + model: "qwen3.5:9b".to_string(), + choices: vec![super::ChatChoice { + message: super::ChatMessage { + role: "assistant".to_string(), + content: Some("43".to_string()), + reasoning_content: None, + reasoning: Some("17 + 26 = 43".to_string()), + tool_calls: Vec::new(), + }, + finish_reason: Some("stop".to_string()), + }], + usage: None, + }; + + // When normalizing the provider response. + let normalized = normalize_response("qwen3.5:9b", response).expect("normalized"); + + // Then the `reasoning` text surfaces as a Thinking block before the text.
+ assert_eq!( + normalized.content, + vec![ + OutputContentBlock::Thinking { + thinking: "17 + 26 = 43".to_string(), + signature: None, + }, + OutputContentBlock::Text { + text: "43".to_string(), + }, + ] + ); + } + + #[test] + fn streaming_chunks_with_ollama_reasoning_field_emit_thinking_before_text() { + // Given Ollama Qwen3 streaming chunks carrying the chain-of-thought in + // the `reasoning` field with an empty `content`. + let mut state = StreamState::new("qwen3.5:9b".to_string()); + let mut events = state + .ingest_chunk(super::ChatCompletionChunk { + id: "chatcmpl_ollama_stream".to_string(), + model: Some("qwen3.5:9b".to_string()), + choices: vec![super::ChunkChoice { + delta: super::ChunkDelta { + content: None, + reasoning_content: None, + reasoning: Some("thinking".to_string()), + thinking: None, + tool_calls: Vec::new(), + }, + finish_reason: None, + }], + usage: None, + }) + .expect("reasoning chunk"); + events.extend( + state + .ingest_chunk(super::ChatCompletionChunk { + id: "chatcmpl_ollama_stream".to_string(), + model: None, + choices: vec![super::ChunkChoice { + delta: super::ChunkDelta { + content: Some("43".to_string()), + reasoning_content: None, + reasoning: None, + thinking: None, + tool_calls: Vec::new(), + }, + finish_reason: Some("stop".to_string()), + }], + usage: None, + }) + .expect("text chunk"), + ); + events.extend(state.finish().expect("finish")); + + // Then a Thinking block (index 0) is emitted before the Text block (index 1). + assert!(matches!( + events[1], + StreamEvent::ContentBlockStart(ContentBlockStartEvent { + index: 0, + content_block: OutputContentBlock::Thinking { .. }, + }) + )); + assert!(events.iter().any(|event| matches!( + event, + StreamEvent::ContentBlockDelta(ContentBlockDeltaEvent { + index: 0, + delta: ContentBlockDelta::ThinkingDelta { .. 
}, + }) + ))); + assert!(events.iter().any(|event| matches!( + event, + StreamEvent::ContentBlockStart(ContentBlockStartEvent { + index: 1, + content_block: OutputContentBlock::Text { .. }, + }) + ))); + } + #[test] fn streaming_chunks_with_reasoning_content_emit_thinking_block_events_before_text() { // Given streaming chunks with reasoning_content followed by text. @@ -2019,6 +2134,7 @@ mod tests { delta: super::ChunkDelta { content: None, reasoning_content: Some("think".to_string()), + reasoning: None, thinking: None, tool_calls: Vec::new(), }, @@ -2036,6 +2152,7 @@ mod tests { delta: super::ChunkDelta { content: Some(" answer".to_string()), reasoning_content: None, + reasoning: None, thinking: None, tool_calls: Vec::new(), }, From 65e6e9795b644d54cdb30f9506110ce647bde9dd Mon Sep 17 00:00:00 2001 From: "Heo, Sung" Date: Sat, 6 Jun 2026 08:09:18 +0900 Subject: [PATCH 3/4] docs(readme): add WSL + Ollama local-model setup guide Document running claw against a local Ollama server over its OpenAI-compatible endpoint: build, env vars (OPENAI_BASE_URL / OPENAI_API_KEY), the exact colon-tag model-name requirement (`qwen3.5:9b`, not `qwen3.5-9b`), how to list installed tags, a WSL connectivity note, and reasoning-model behaviour. Co-Authored-By: Claude Opus 4.8 --- rust/README.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/rust/README.md b/rust/README.md index edcd4fefc1..9d6d95d2bc 100644 --- a/rust/README.md +++ b/rust/README.md @@ -40,6 +40,48 @@ Or provide an OAuth bearer token directly: export ANTHROPIC_AUTH_TOKEN="anthropic-oauth-or-proxy-bearer-token" ``` +### Local models via Ollama (WSL) + +Claw works with any OpenAI-compatible endpoint, including a local +[Ollama](https://ollama.com) server. This is handy for offline use or for +driving the CLI from WSL against models served on the Windows host. + +```bash +cd rust/ + +# 1. Build the release binary +cargo build --release + +# 2. 
Point claw at the local Ollama OpenAI-compatible endpoint +export OPENAI_BASE_URL="http://127.0.0.1:11434/v1" +export OPENAI_API_KEY="ollama" # any non-empty value works; Ollama ignores it + +# 3. Run. Prefix the model with `openai/` so prefix routing selects the +# OpenAI-compatible provider, and use the exact Ollama tag. +./target/release/claw --model openai/qwen3.5:9b +``` + +**The model name must match the Ollama tag exactly.** Ollama tags use a colon +(`name:tag`), e.g. `qwen3.5:9b` — not a hyphen (`qwen3.5-9b`), which the server +rejects with `404 ... model not found`. Routing to the OpenAI-compatible +provider only kicks in when `OPENAI_BASE_URL` is set and the model name contains +a `:` or `.`, so always pass the full tag. List the installed tags with: + +```bash +ollama list # native view +curl -s http://127.0.0.1:11434/v1/models # OpenAI-compatible view +``` + +> **WSL note:** when Ollama runs on the Windows host, WSL 2 reaches it at +> `127.0.0.1:11434` only with mirrored networking (`networkingMode=mirrored` in `.wslconfig`). If the connection is refused, start Ollama on +> the host (`ollama serve`) or point `OPENAI_BASE_URL` at the host IP instead of +> `127.0.0.1`; in that case Ollama must listen on that interface (e.g. `OLLAMA_HOST=0.0.0.0`). + +Reasoning ("thinking") models such as Qwen3 are supported. Ollama streams the +chain-of-thought in a `reasoning` field with an empty `content`; claw surfaces it +as a separate thinking block so both the reasoning trace and the final answer are +rendered. + ## Mock parity harness The workspace now includes a deterministic Anthropic-compatible mock service and a clean-environment CLI harness for end-to-end parity checks. From 067c91d31b3cbdc96657be1d828d9021e4b7a1ec Mon Sep 17 00:00:00 2001 From: "Heo, Sung" Date: Sun, 7 Jun 2026 20:14:33 +0900 Subject: [PATCH 4/4] docs: clarify Ollama provider routing (prefix vs. heuristic) The local-Ollama section described provider routing as only triggering when OPENAI_BASE_URL is set and the model name contains a `:` or `.`, but that heuristic only applies to prefix-less model names.
The example uses the explicit `openai/` prefix, which selects the OpenAI-compatible provider directly (and is stripped before the tag reaches Ollama). Split the explanation into the two routing paths so the colon tag is correctly described as an Ollama requirement, not the routing trigger. Co-Authored-By: Claude Opus 4.8 --- rust/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rust/README.md b/rust/README.md index 9d6d95d2bc..6010670d94 100644 --- a/rust/README.md +++ b/rust/README.md @@ -63,9 +63,12 @@ export OPENAI_API_KEY="ollama" # any non-empty value works; Ollama ignores it **The model name must match the Ollama tag exactly.** Ollama tags use a colon (`name:tag`), e.g. `qwen3.5:9b` — not a hyphen (`qwen3.5-9b`), which the server -rejects with `404 ... model not found`. Routing to the OpenAI-compatible -provider only kicks in when `OPENAI_BASE_URL` is set and the model name contains -a `:` or `.`, so always pass the full tag. List the installed tags with: +rejects with `404 ... model not found`. In the command above the `openai/` +prefix is what selects the OpenAI-compatible provider (the prefix is stripped +before the tag is sent to Ollama). You can also drop the prefix and pass the +bare tag — when `OPENAI_BASE_URL` is set, claw routes a prefix-less model to the +OpenAI-compatible endpoint as long as the name contains a `:` or `.`. Either +way, always pass the full colon tag. List the installed tags with: ```bash ollama list # native view