diff --git a/rust/README.md b/rust/README.md index edcd4fefc1..6010670d94 100644 --- a/rust/README.md +++ b/rust/README.md @@ -40,6 +40,51 @@ Or provide an OAuth bearer token directly: export ANTHROPIC_AUTH_TOKEN="anthropic-oauth-or-proxy-bearer-token" ``` +### Local models via Ollama (WSL) + +Claw works with any OpenAI-compatible endpoint, including a local +[Ollama](https://ollama.com) server. This is handy for offline use or for +driving the CLI from WSL against models served on the Windows host. + +```bash +cd rust/ + +# 1. Build the release binary +cargo build --release + +# 2. Point claw at the local Ollama OpenAI-compatible endpoint +export OPENAI_BASE_URL="http://127.0.0.1:11434/v1" +export OPENAI_API_KEY="ollama" # any non-empty value works; Ollama ignores it + +# 3. Run. Prefix the model with `openai/` so prefix routing selects the +# OpenAI-compatible provider, and use the exact Ollama tag. +./target/release/claw --model openai/qwen3.5:9b +``` + +**The model name must match the Ollama tag exactly.** Ollama tags use a colon +(`name:tag`), e.g. `qwen3.5:9b` — not a hyphen (`qwen3.5-9b`), which the server +rejects with `404 ... model not found`. In the command above the `openai/` +prefix is what selects the OpenAI-compatible provider (the prefix is stripped +before the tag is sent to Ollama). You can also drop the prefix and pass the +bare tag — when `OPENAI_BASE_URL` is set, claw routes a prefix-less model to the +OpenAI-compatible endpoint as long as the name contains a `:` or `.`. Either +way, always pass the full colon tag. List the installed tags with: + +```bash +ollama list # native view +curl -s http://127.0.0.1:11434/v1/models # OpenAI-compatible view +``` + +> **WSL note:** when Ollama runs on the Windows host, WSL 2 reaches it at +> `127.0.0.1:11434` only in mirrored networking mode. If the connection is +> refused, start Ollama on the host (`ollama serve`) or point `OPENAI_BASE_URL` +> at the host IP instead of `127.0.0.1`. 
+ +Reasoning ("thinking") models such as Qwen3 are supported. Ollama streams the +chain-of-thought in a `reasoning` field with an empty `content`; claw surfaces it +as a separate thinking block so both the reasoning trace and the final answer are +rendered. + ## Mock parity harness The workspace now includes a deterministic Anthropic-compatible mock service and a clean-environment CLI harness for end-to-end parity checks. diff --git a/rust/crates/api/src/providers/openai_compat.rs b/rust/crates/api/src/providers/openai_compat.rs index 7f82d00ceb..9788742447 100644 --- a/rust/crates/api/src/providers/openai_compat.rs +++ b/rust/crates/api/src/providers/openai_compat.rs @@ -562,6 +562,7 @@ impl StreamState { .delta .reasoning_content .filter(|value| !value.is_empty()) + .or(choice.delta.reasoning.filter(|value| !value.is_empty())) .or(choice .delta .thinking @@ -816,6 +817,9 @@ struct ChatMessage { content: Option, #[serde(default)] reasoning_content: Option, + /// Ollama's Qwen3 thinking models stream chain-of-thought here with `content=""`. + #[serde(default)] + reasoning: Option, #[serde(default)] tool_calls: Vec, } @@ -890,6 +894,9 @@ struct ChunkDelta { /// Some providers (GLM, DeepSeek) emit reasoning in `reasoning_content` #[serde(default)] reasoning_content: Option, + /// Ollama's Qwen3 thinking models stream tokens here with `content=""`. 
+ #[serde(default)] + reasoning: Option, #[serde(default)] thinking: Option, #[serde(default, deserialize_with = "deserialize_null_as_empty_vec")] @@ -1500,6 +1507,7 @@ fn normalize_response( .message .reasoning_content .filter(|value| !value.is_empty()) + .or_else(|| choice.message.reasoning.filter(|value| !value.is_empty())) { content.push(OutputContentBlock::Thinking { thinking, @@ -1982,6 +1990,7 @@ mod tests { role: "assistant".to_string(), content: Some("final answer".to_string()), reasoning_content: Some("hidden thought".to_string()), + reasoning: None, tool_calls: Vec::new(), }, finish_reason: Some("stop".to_string()), @@ -2007,6 +2016,112 @@ mod tests { ); } + #[test] + fn non_streaming_response_with_ollama_reasoning_field_emits_thinking_block() { + // Given an Ollama-style response that carries the chain-of-thought in + // `reasoning` (not `reasoning_content`) — here the trace sits in + // `reasoning` while `content` holds the final answer. + let response = super::ChatCompletionResponse { + id: "chatcmpl_ollama".to_string(), + model: "qwen3.5:9b".to_string(), + choices: vec![super::ChatChoice { + message: super::ChatMessage { + role: "assistant".to_string(), + content: Some("43".to_string()), + reasoning_content: None, + reasoning: Some("17 + 26 = 43".to_string()), + tool_calls: Vec::new(), + }, + finish_reason: Some("stop".to_string()), + }], + usage: None, + }; + + // When normalizing the provider response. + let normalized = normalize_response("qwen3.5:9b", response).expect("normalized"); + + // Then the `reasoning` text surfaces as a Thinking block before the text. 
+ assert_eq!( + normalized.content, + vec![ + OutputContentBlock::Thinking { + thinking: "17 + 26 = 43".to_string(), + signature: None, + }, + OutputContentBlock::Text { + text: "43".to_string(), + }, + ] + ); + } + + #[test] + fn streaming_chunks_with_ollama_reasoning_field_emit_thinking_before_text() { + // Given Ollama Qwen3 streaming chunks carrying the chain-of-thought in + // the `reasoning` field with an empty `content`. + let mut state = StreamState::new("qwen3.5:9b".to_string()); + let mut events = state + .ingest_chunk(super::ChatCompletionChunk { + id: "chatcmpl_ollama_stream".to_string(), + model: Some("qwen3.5:9b".to_string()), + choices: vec![super::ChunkChoice { + delta: super::ChunkDelta { + content: None, + reasoning_content: None, + reasoning: Some("thinking".to_string()), + thinking: None, + tool_calls: Vec::new(), + }, + finish_reason: None, + }], + usage: None, + }) + .expect("reasoning chunk"); + events.extend( + state + .ingest_chunk(super::ChatCompletionChunk { + id: "chatcmpl_ollama_stream".to_string(), + model: None, + choices: vec![super::ChunkChoice { + delta: super::ChunkDelta { + content: Some("43".to_string()), + reasoning_content: None, + reasoning: None, + thinking: None, + tool_calls: Vec::new(), + }, + finish_reason: Some("stop".to_string()), + }], + usage: None, + }) + .expect("text chunk"), + ); + events.extend(state.finish().expect("finish")); + + // Then a Thinking block (index 0) is emitted before the Text block (index 1). + assert!(matches!( + events[1], + StreamEvent::ContentBlockStart(ContentBlockStartEvent { + index: 0, + content_block: OutputContentBlock::Thinking { .. }, + }) + )); + assert!(events.iter().any(|event| matches!( + event, + StreamEvent::ContentBlockDelta(ContentBlockDeltaEvent { + index: 0, + delta: ContentBlockDelta::ThinkingDelta { .. 
}, + }) + ))); + assert!(events.iter().any(|event| matches!( + event, + StreamEvent::ContentBlockStart(ContentBlockStartEvent { + index: 1, + content_block: OutputContentBlock::Text { .. }, + }) + ))); + } + #[test] fn streaming_chunks_with_reasoning_content_emit_thinking_block_events_before_text() { // Given streaming chunks with reasoning_content followed by text. @@ -2019,6 +2134,7 @@ mod tests { delta: super::ChunkDelta { content: None, reasoning_content: Some("think".to_string()), + reasoning: None, thinking: None, tool_calls: Vec::new(), }, @@ -2036,6 +2152,7 @@ mod tests { delta: super::ChunkDelta { content: Some(" answer".to_string()), reasoning_content: None, + reasoning: None, thinking: None, tool_calls: Vec::new(), }, diff --git a/rust/crates/rusty-claude-cli/src/main.rs b/rust/crates/rusty-claude-cli/src/main.rs index 38974eb5f3..75765382fb 100644 --- a/rust/crates/rusty-claude-cli/src/main.rs +++ b/rust/crates/rusty-claude-cli/src/main.rs @@ -2943,6 +2943,11 @@ fn validate_model_syntax(model: &str) -> Result<(), String> { err_msg.push_str("\nDid you mean `qwen/"); err_msg.push_str(trimmed); err_msg.push_str("`? (Requires DASHSCOPE_API_KEY env var)"); + err_msg.push_str("\nOr for a local Ollama server: use `openai/"); + err_msg.push_str(trimmed); + err_msg.push_str( + "` with OPENAI_BASE_URL=http://127.0.0.1:11434/v1 and OPENAI_API_KEY=ollama", + ); } else if trimmed.starts_with("grok") { err_msg.push_str("\nDid you mean `xai/"); err_msg.push_str(trimmed);