From d1b9e4598cb8a7291116268740c7c1f15ac91d77 Mon Sep 17 00:00:00 2001
From: root <root@user.localdomain>
Date: Fri, 24 Apr 2026 11:12:04 +0900
Subject: [PATCH 1/4] fix: extend invalid-model hint for qwen prefix to mention
 local Ollama path

When users type bare qwen model names (e.g., qwen3.5:9b), the validator
already suggested qwen/ prefix for DashScope. Now it also documents the
local Ollama path: use openai/ prefix with OPENAI_BASE_URL and
OPENAI_API_KEY=ollama.

This addresses the confusion observed when debugging Ollama integration
after the upstream merge, where users might follow the DashScope hint
but actually want to run models locally via Ollama.
---
 rust/crates/rusty-claude-cli/src/main.rs | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/rust/crates/rusty-claude-cli/src/main.rs b/rust/crates/rusty-claude-cli/src/main.rs
index 38974eb5f3..75765382fb 100644
--- a/rust/crates/rusty-claude-cli/src/main.rs
+++ b/rust/crates/rusty-claude-cli/src/main.rs
@@ -2943,6 +2943,11 @@ fn validate_model_syntax(model: &str) -> Result<(), String> {
             err_msg.push_str("\nDid you mean `qwen/");
             err_msg.push_str(trimmed);
             err_msg.push_str("`? (Requires DASHSCOPE_API_KEY env var)");
+            err_msg.push_str("\nOr for a local Ollama server: use `openai/");
+            err_msg.push_str(trimmed);
+            err_msg.push_str(
+                "` with OPENAI_BASE_URL=http://127.0.0.1:11434/v1 and OPENAI_API_KEY=ollama",
+            );
         } else if trimmed.starts_with("grok") {
             err_msg.push_str("\nDid you mean `xai/");
             err_msg.push_str(trimmed);

From 5d67e76788294f676d566c8f90243cbe83e0d649 Mon Sep 17 00:00:00 2001
From: "Heo, Sung" <heosung@gmail.com>
Date: Sat, 6 Jun 2026 08:09:18 +0900
Subject: [PATCH 2/4] fix(api): surface Ollama `reasoning` field for Qwen3
 thinking models

Ollama's Qwen3 thinking models stream the chain-of-thought in a `reasoning`
field with an empty `content`. The OpenAI-compatible provider only recognised
`reasoning_content`/`thinking`, so reasoning-only responses surfaced no text
at all.

Add `reasoning` to both the streaming (`ChunkDelta`) and non-streaming
(`ChatMessage`) shapes and fall back to it when extracting thinking, preserving
the existing thinking/text content-block separation. Covered by two new tests
for the streaming and non-streaming paths.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../crates/api/src/providers/openai_compat.rs | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/rust/crates/api/src/providers/openai_compat.rs b/rust/crates/api/src/providers/openai_compat.rs
index 7f82d00ceb..9788742447 100644
--- a/rust/crates/api/src/providers/openai_compat.rs
+++ b/rust/crates/api/src/providers/openai_compat.rs
@@ -562,6 +562,7 @@ impl StreamState {
                 .delta
                 .reasoning_content
                 .filter(|value| !value.is_empty())
+                .or(choice.delta.reasoning.filter(|value| !value.is_empty()))
                 .or(choice
                     .delta
                     .thinking
@@ -816,6 +817,9 @@ struct ChatMessage {
     content: Option<String>,
     #[serde(default)]
     reasoning_content: Option<String>,
+    /// Ollama's Qwen3 thinking models stream chain-of-thought here with `content=""`.
+    #[serde(default)]
+    reasoning: Option<String>,
     #[serde(default)]
     tool_calls: Vec<ResponseToolCall>,
 }
@@ -890,6 +894,9 @@ struct ChunkDelta {
     /// Some providers (GLM, DeepSeek) emit reasoning in `reasoning_content`
     #[serde(default)]
     reasoning_content: Option<String>,
+    /// Ollama's Qwen3 thinking models stream tokens here with `content=""`.
+    #[serde(default)]
+    reasoning: Option<String>,
     #[serde(default)]
     thinking: Option<ThinkingDelta>,
     #[serde(default, deserialize_with = "deserialize_null_as_empty_vec")]
@@ -1500,6 +1507,7 @@ fn normalize_response(
         .message
         .reasoning_content
         .filter(|value| !value.is_empty())
+        .or_else(|| choice.message.reasoning.filter(|value| !value.is_empty()))
     {
         content.push(OutputContentBlock::Thinking {
             thinking,
@@ -1982,6 +1990,7 @@ mod tests {
                     role: "assistant".to_string(),
                     content: Some("final answer".to_string()),
                     reasoning_content: Some("hidden thought".to_string()),
+                    reasoning: None,
                     tool_calls: Vec::new(),
                 },
                 finish_reason: Some("stop".to_string()),
@@ -2007,6 +2016,112 @@ mod tests {
         );
     }
 
+    #[test]
+    fn non_streaming_response_with_ollama_reasoning_field_emits_thinking_block() {
+        // Given an Ollama-style response that carries the chain-of-thought in
+        // `reasoning` (not `reasoning_content`); this fixture also sets a
+        // non-empty `content` ("43") so both output blocks can be asserted.
+        let response = super::ChatCompletionResponse {
+            id: "chatcmpl_ollama".to_string(),
+            model: "qwen3.5:9b".to_string(),
+            choices: vec![super::ChatChoice {
+                message: super::ChatMessage {
+                    role: "assistant".to_string(),
+                    content: Some("43".to_string()),
+                    reasoning_content: None,
+                    reasoning: Some("17 + 26 = 43".to_string()),
+                    tool_calls: Vec::new(),
+                },
+                finish_reason: Some("stop".to_string()),
+            }],
+            usage: None,
+        };
+
+        // When normalizing the provider response.
+        let normalized = normalize_response("qwen3.5:9b", response).expect("normalized");
+
+        // Then the `reasoning` text surfaces as a Thinking block before the text.
+        assert_eq!(
+            normalized.content,
+            vec![
+                OutputContentBlock::Thinking {
+                    thinking: "17 + 26 = 43".to_string(),
+                    signature: None,
+                },
+                OutputContentBlock::Text {
+                    text: "43".to_string(),
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn streaming_chunks_with_ollama_reasoning_field_emit_thinking_before_text() {
+        // Given an Ollama Qwen3 `reasoning`-only chunk (no `content`) and then a
+        // text chunk, when both are ingested in order and the stream is finished.
+        let mut state = StreamState::new("qwen3.5:9b".to_string());
+        let mut events = state
+            .ingest_chunk(super::ChatCompletionChunk {
+                id: "chatcmpl_ollama_stream".to_string(),
+                model: Some("qwen3.5:9b".to_string()),
+                choices: vec![super::ChunkChoice {
+                    delta: super::ChunkDelta {
+                        content: None,
+                        reasoning_content: None,
+                        reasoning: Some("thinking".to_string()),
+                        thinking: None,
+                        tool_calls: Vec::new(),
+                    },
+                    finish_reason: None,
+                }],
+                usage: None,
+            })
+            .expect("reasoning chunk");
+        events.extend(
+            state
+                .ingest_chunk(super::ChatCompletionChunk {
+                    id: "chatcmpl_ollama_stream".to_string(),
+                    model: None,
+                    choices: vec![super::ChunkChoice {
+                        delta: super::ChunkDelta {
+                            content: Some("43".to_string()),
+                            reasoning_content: None,
+                            reasoning: None,
+                            thinking: None,
+                            tool_calls: Vec::new(),
+                        },
+                        finish_reason: Some("stop".to_string()),
+                    }],
+                    usage: None,
+                })
+                .expect("text chunk"),
+        );
+        events.extend(state.finish().expect("finish"));
+
+        // Then the Thinking block starts at `events[1]` (index 0) and Text starts at index 1.
+        assert!(matches!(
+            events[1],
+            StreamEvent::ContentBlockStart(ContentBlockStartEvent {
+                index: 0,
+                content_block: OutputContentBlock::Thinking { .. },
+            })
+        ));
+        assert!(events.iter().any(|event| matches!(
+            event,
+            StreamEvent::ContentBlockDelta(ContentBlockDeltaEvent {
+                index: 0,
+                delta: ContentBlockDelta::ThinkingDelta { .. },
+            })
+        )));
+        assert!(events.iter().any(|event| matches!(
+            event,
+            StreamEvent::ContentBlockStart(ContentBlockStartEvent {
+                index: 1,
+                content_block: OutputContentBlock::Text { .. },
+            })
+        )));
+    }
+
     #[test]
     fn streaming_chunks_with_reasoning_content_emit_thinking_block_events_before_text() {
         // Given streaming chunks with reasoning_content followed by text.
@@ -2019,6 +2134,7 @@ mod tests {
                     delta: super::ChunkDelta {
                         content: None,
                         reasoning_content: Some("think".to_string()),
+                        reasoning: None,
                         thinking: None,
                         tool_calls: Vec::new(),
                     },
@@ -2036,6 +2152,7 @@ mod tests {
                         delta: super::ChunkDelta {
                             content: Some(" answer".to_string()),
                             reasoning_content: None,
+                            reasoning: None,
                             thinking: None,
                             tool_calls: Vec::new(),
                         },

From 65e6e9795b644d54cdb30f9506110ce647bde9dd Mon Sep 17 00:00:00 2001
From: "Heo, Sung" <heosung@gmail.com>
Date: Sat, 6 Jun 2026 08:09:18 +0900
Subject: [PATCH 3/4] docs(readme): add WSL + Ollama local-model setup guide

Document running claw against a local Ollama server over its OpenAI-compatible
endpoint: build, env vars (OPENAI_BASE_URL / OPENAI_API_KEY), the exact
colon-tag model-name requirement (`qwen3.5:9b`, not `qwen3.5-9b`), how to list
installed tags, a WSL connectivity note, and reasoning-model behaviour.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 rust/README.md | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/rust/README.md b/rust/README.md
index edcd4fefc1..9d6d95d2bc 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -40,6 +40,48 @@ Or provide an OAuth bearer token directly:
 export ANTHROPIC_AUTH_TOKEN="anthropic-oauth-or-proxy-bearer-token"
 ```
 
+### Local models via Ollama (WSL)
+
+Claw works with any OpenAI-compatible endpoint, including a local
+[Ollama](https://ollama.com) server. This is handy for offline use or for
+driving the CLI from WSL against models served on the Windows host.
+
+```bash
+cd rust/
+
+# 1. Build the release binary
+cargo build --release
+
+# 2. Point claw at the local Ollama OpenAI-compatible endpoint
+export OPENAI_BASE_URL="http://127.0.0.1:11434/v1"
+export OPENAI_API_KEY="ollama"   # any non-empty value works; Ollama ignores it
+
+# 3. Run. Prefix the model with `openai/` so prefix routing selects the
+#    OpenAI-compatible provider, and use the exact Ollama tag.
+./target/release/claw --model openai/qwen3.5:9b
+```
+
+**The model name must match the Ollama tag exactly.** Ollama tags use a colon
+(`name:tag`), e.g. `qwen3.5:9b` — not a hyphen (`qwen3.5-9b`), which the server
+rejects with `404 ... model not found`. Routing to the OpenAI-compatible
+provider only kicks in when `OPENAI_BASE_URL` is set and the model name contains
+a `:` or `.`, so always pass the full tag. List the installed tags with:
+
+```bash
+ollama list                                 # native view
+curl -s http://127.0.0.1:11434/v1/models    # OpenAI-compatible view
+```
+
+> **WSL note:** when Ollama runs on the Windows host, WSL 2 reaches
+> `127.0.0.1:11434` directly only in mirrored networking mode. If the connection
+> is refused, start Ollama on the host (`ollama serve`) or point
+> `OPENAI_BASE_URL` at the host IP instead of `127.0.0.1`.
+
+Reasoning ("thinking") models such as Qwen3 are supported. Ollama streams the
+chain-of-thought in a `reasoning` field with an empty `content`; claw surfaces it
+as a separate thinking block so both the reasoning trace and the final answer are
+rendered.
+
 ## Mock parity harness
 
 The workspace now includes a deterministic Anthropic-compatible mock service and a clean-environment CLI harness for end-to-end parity checks.

From 067c91d31b3cbdc96657be1d828d9021e4b7a1ec Mon Sep 17 00:00:00 2001
From: "Heo, Sung" <heosung@gmail.com>
Date: Sun, 7 Jun 2026 20:14:33 +0900
Subject: [PATCH 4/4] docs: clarify Ollama provider routing (prefix vs.
 heuristic)

The local-Ollama section described provider routing as only triggering
when OPENAI_BASE_URL is set and the model name contains a `:` or `.`,
but that heuristic only applies to prefix-less model names. The example
uses the explicit `openai/` prefix, which selects the OpenAI-compatible
provider directly (and is stripped before the tag reaches Ollama).

Split the explanation into the two routing paths so the colon tag is
correctly described as an Ollama requirement, not the routing trigger.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 rust/README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rust/README.md b/rust/README.md
index 9d6d95d2bc..6010670d94 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -63,9 +63,12 @@ export OPENAI_API_KEY="ollama"   # any non-empty value works; Ollama ignores it
 
 **The model name must match the Ollama tag exactly.** Ollama tags use a colon
 (`name:tag`), e.g. `qwen3.5:9b` — not a hyphen (`qwen3.5-9b`), which the server
-rejects with `404 ... model not found`. Routing to the OpenAI-compatible
-provider only kicks in when `OPENAI_BASE_URL` is set and the model name contains
-a `:` or `.`, so always pass the full tag. List the installed tags with:
+rejects with `404 ... model not found`. In the command above the `openai/`
+prefix is what selects the OpenAI-compatible provider (the prefix is stripped
+before the tag is sent to Ollama). You can also drop the prefix and pass the
+bare tag — when `OPENAI_BASE_URL` is set, claw routes a prefix-less model to the
+OpenAI-compatible endpoint as long as the name contains a `:` or `.`. Either
+way, always pass the full colon tag. List the installed tags with:
 
 ```bash
 ollama list                                 # native view