From e1b12f8120be9056e0b16a0a006e795908a2b972 Mon Sep 17 00:00:00 2001 From: Vinicius Carvalho Date: Sun, 28 Jun 2026 22:08:07 -0300 Subject: [PATCH 1/3] feat: fall back to OpenCode when Anthropic is unavailable --- CONFIGURATION.md | 67 +++-- MODELS.md | 6 +- README.md | 14 +- cmd/routatic-proxy/main.go | 99 +++++--- cmd/routatic-proxy/main_test.go | 24 ++ configs/config.example.json | 80 +++--- internal/config/config.go | 7 + internal/config/loader.go | 11 + internal/config/loader_test.go | 22 ++ internal/config/model_registry.go | 38 +-- internal/handlers/anthropic_first.go | 287 ++++++++++++++++++++++ internal/handlers/anthropic_first_test.go | 228 +++++++++++++++++ internal/handlers/messages.go | 9 + internal/handlers/messages_test.go | 81 ++++++ internal/provider/opencode_zen.go | 2 + internal/router/fallback.go | 29 ++- internal/router/fallback_test.go | 43 ++++ internal/server/server.go | 2 +- 18 files changed, 946 insertions(+), 103 deletions(-) create mode 100644 cmd/routatic-proxy/main_test.go create mode 100644 internal/handlers/anthropic_first.go create mode 100644 internal/handlers/anthropic_first_test.go diff --git a/CONFIGURATION.md b/CONFIGURATION.md index f58fe97..6f96c33 100644 --- a/CONFIGURATION.md +++ b/CONFIGURATION.md @@ -16,42 +16,50 @@ For migration, `~/.config/oc-go-cc/config.json` is loaded when the new config fi "host": "127.0.0.1", "port": 3456, "hot_reload": false, + "anthropic_first": { + "enabled": false, + "base_url": "https://api.anthropic.com" + }, "models": { "default": { "provider": "opencode-go", - "model_id": "kimi-k2.6", + "model_id": "deepseek-v4-pro", "temperature": 0.7, - "max_tokens": 4096 + "max_tokens": 8192, + "reasoning_effort": "max", + "thinking": { "type": "enabled" } }, "background": { "provider": "opencode-go", - "model_id": "qwen3.5-plus", + "model_id": "deepseek-v4-flash", "temperature": 0.5, "max_tokens": 2048 }, "think": { "provider": "opencode-go", - "model_id": "glm-5.1", + "model_id": "glm-5.2", "temperature": 0.7, "max_tokens": 8192 }, "complex": { "provider": "opencode-go", - "model_id": "glm-5.1", + "model_id": "deepseek-v4-pro", "temperature": 0.7, - "max_tokens": 4096 + "max_tokens": 8192, + "reasoning_effort": "max", + "thinking": { "type": "enabled" } }, "long_context": { "provider": "opencode-go", - "model_id": "minimax-m2.7", + "model_id": "minimax-m3", "temperature": 0.7, "max_tokens": 16384, "context_threshold": 80000 }, "fast": { "provider": "opencode-go", - "model_id": "qwen3.6-plus", + "model_id": "deepseek-v4-flash", "temperature": 0.7, "max_tokens": 4096 } @@ -59,13 +67,16 @@ For migration, `~/.config/oc-go-cc/config.json` is loaded when the new config fi "fallbacks": { "default": [ - { "provider": "opencode-go", "model_id": "glm-5" }, - { "provider": "opencode-go", "model_id": "qwen3.6-plus" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], - "think": [{ "provider": "opencode-go", "model_id": "glm-5" }], - "complex": [{ "provider": "opencode-go", "model_id": "glm-5" }], - "long_context": [{ "provider": "opencode-go", "model_id": "minimax-m2.5" }], - "fast": [{ "provider": "opencode-go", "model_id": "qwen3.5-plus" }] + "think": [{ "provider": "opencode-go", "model_id": "qwen3.7-plus" }], + "complex": [{ "provider": "opencode-go", "model_id": "qwen3.7-plus" }], + "long_context": [{ "provider": "opencode-go", "model_id": "qwen3.7-plus" }], + "fast": [{ "provider": "opencode-go", "model_id": "qwen3.7-plus" }] }, "model_overrides": { @@ -115,6 +126,34 @@ For migration, `~/.config/oc-go-cc/config.json` is loaded when the new config fi } ``` +## Anthropic-First Failover + +Enable this mode to keep Anthropic as Claude Code's primary API and use the configured OpenCode model chain only while Anthropic is unavailable: + +```json +{ + "anthropic_first": { + "enabled": true, + "base_url": "https://api.anthropic.com" + } +} +``` + +Configure Claude Code with only the proxy address: + +```bash +export ANTHROPIC_BASE_URL=http://127.0.0.1:3456 +unset ANTHROPIC_AUTH_TOKEN ANTHROPIC_API_KEY +``` + +Leaving the credential variables unset preserves the saved Claude Pro, Max, Team, or Enterprise login. The proxy forwards the raw request, OAuth credential, `anthropic-version`, and complete `anthropic-beta` capability header to Anthropic. + +Fallback occurs for HTTP 408, 429, 5xx, and transport failures before a response starts. HTTP 400, 401, 403, 404, and other request errors are returned unchanged. After a failure, the proxy honors `Retry-After`; otherwise it uses exponential backoff from 30 seconds to 15 minutes. One real user request probes recovery while concurrent requests continue through OpenCode. No synthetic health requests are sent. + +Once response bytes have started, a failed stream cannot be restarted on another model without duplicating content. `/v1/messages/count_tokens` remains local and does not affect availability state. + +When OpenCode Go returns `GoUsageLimitError`, remaining Go models are skipped for that request and the chain advances to Zen. The default chain uses Qwen3.7 Plus, Qwen3.7 Max, then the currently working Zen-free Nemotron 3 Ultra, MiMo V2.5, and DeepSeek V4 Flash models. Free Zen endpoints are time-limited and may retain data under [OpenCode's documented privacy terms](https://opencode.ai/docs/zen/#privacy). + ## Providers routatic-proxy supports three providers for upstream API calls: diff --git a/MODELS.md b/MODELS.md index e68b601..ff2c4ad 100644 --- a/MODELS.md +++ b/MODELS.md @@ -93,7 +93,7 @@ All OpenCode Go models are also available on Zen. Zen additionally offers: - **Claude Models (Anthropic endpoint):** claude-fable-5, claude-opus-4-8, claude-opus-4-7, claude-opus-4-6, claude-opus-4-5, claude-opus-4-1, claude-sonnet-4-6, claude-sonnet-4-5, claude-sonnet-4, claude-haiku-4-5, claude-3-5-haiku - **GPT Models (Responses endpoint):** gpt-5.5, gpt-5.5-pro, gpt-5.4, gpt-5.4-pro, gpt-5.4-mini, gpt-5.4-nano, gpt-5.3-codex, gpt-5.3-codex-spark, gpt-5.2, gpt-5.2-codex, gpt-5.1, gpt-5.1-codex, gpt-5.1-codex-max, gpt-5.1-codex-mini, gpt-5, gpt-5-codex, gpt-5-nano - **Gemini Models (Gemini endpoint):** gemini-3.5-flash, gemini-3.1-pro, gemini-3-flash -- **Free Tier (chat completions):** deepseek-v4-pro, deepseek-v4-flash-free, grok-build-0.1, big-pickle, mimo-v2.5-free, north-mini-code-free, nemotron-3-ultra-free +- **Free Tier (chat completions):** deepseek-v4-flash-free, big-pickle, mimo-v2.5-free, north-mini-code-free, nemotron-3-ultra-free #### Deprecated Zen Models @@ -117,7 +117,7 @@ The following models are deprecated and will be removed: | Claude Haiku 3.5 | Feb 16, 2026 | Claude Haiku 4.5 | | Qwen3 Coder 480B | Feb 6, 2026 | Qwen3.7 Plus/Max | -DeepSeek V4 Pro and Flash are OpenAI-compatible on both Go and Zen providers. On Zen, DeepSeek V4 Pro is available as a free-tier model. routatic-proxy transforms Claude Code's Anthropic request into OpenAI Chat Completions format, including tools, tool results, thinking history, `reasoning_effort`, and `thinking`. +DeepSeek V4 Pro and Flash are OpenAI-compatible on both Go and Zen providers. DeepSeek V4 Flash Free is the free Zen variant. routatic-proxy transforms Claude Code's Anthropic request into OpenAI Chat Completions format, including tools, tool results, thinking history, `reasoning_effort`, and `thinking`. For Claude Code and OpenCode-style agent workflows, DeepSeek V4 supports max thinking mode with: @@ -568,7 +568,7 @@ Critical review → GLM-5.1 (rarely) 2. **Reserve GLM-5.1 for critical tasks only** — 880 req/$12 drains budget fast 3. **Use Qwen3.5 Plus for simple operations** — 10,200 req/$12 is unbeatable 4. **MiniMax M2.5 for long context** — 6,300 req/$12 with 1M context is amazing value -5. **Use Zen free-tier models** for non-critical tasks — deepseek-v4-pro, grok-build-0.1, big-pickle, and others cost $0 +5. **Use Zen free-tier models** for non-critical tasks — Nemotron 3 Ultra Free, MiMo V2.5 Free, DeepSeek V4 Flash Free, Big Pickle, and others cost $0 while their promotions remain active 6. **Monitor your usage** in the [OpenCode console](https://opencode.ai/auth) ## See Also diff --git a/README.md b/README.md index 917d274..b00709d 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ OpenCode Go gives you access to powerful open coding models for **$5/month** (th - **Model Routing** — Automatically routes to different models based on context (default, thinking, long context, background) - **Streaming Scenario Routing** — Configurable routing for streaming requests; enables proper scenario selection for Claude Code multi-agent and review workflows (see [CONFIGURATION.md](CONFIGURATION.md#streaming-scenario-routing)) - **Fallback Chains** — If a model fails, automatically tries the next one in your configured chain +- **Anthropic-First Failover** — Keep Claude on Anthropic and use OpenCode only during rate limits or outages - **Circuit Breaker** — Tracks model health and skips failing models to avoid latency spikes - **Real-time Streaming** — Full SSE streaming with live format transformation - **Tool Calling** — Proper Anthropic tool_use/tool_result <-> OpenAI/Gemini function calling translation @@ -74,7 +75,7 @@ Zen provides pay-as-you-go access to additional models: - **Claude Models**: Claude Fable 5, Claude Opus 4.8/4.6/4.5/4.1, Claude Sonnet 4 - **Gemini Models**: Gemini 3.5 Flash, Gemini 3.1 Pro, Gemini 3 Flash - **GPT Models**: GPT 5.5, GPT 5.4, GPT 5.3 Codex, and more -- **Free Tier**: DeepSeek V4 Pro, Grok Build 0.1, Big Pickle, and others +- **Free Tier**: Nemotron 3 Ultra Free, MiMo V2.5 Free, DeepSeek V4 Flash Free, and others See [MODELS.md](MODELS.md#opencodes-zen) for the full Zen model list. @@ -144,11 +145,22 @@ make docker-stop ### 4. Configure Claude Code +For the default OpenCode-only mode: + ```bash export ANTHROPIC_BASE_URL=http://127.0.0.1:3456 export ANTHROPIC_AUTH_TOKEN=unused ``` +For Anthropic-first mode, enable `anthropic_first` in the proxy config and set only the base URL. Do not set an API key or auth token: Claude Code will keep using its saved Claude subscription login. + +```bash +export ANTHROPIC_BASE_URL=http://127.0.0.1:3456 +unset ANTHROPIC_AUTH_TOKEN ANTHROPIC_API_KEY +``` + +Anthropic-first mode falls back on HTTP 408, 429, 5xx, and connection failures. It honors `Retry-After` and uses one real request to detect recovery, so it does not spend tokens on health checks. See [CONFIGURATION.md](CONFIGURATION.md#anthropic-first-failover). + ### 5. Run Claude Code ```bash diff --git a/cmd/routatic-proxy/main.go b/cmd/routatic-proxy/main.go index 02b47f5..0763f55 100644 --- a/cmd/routatic-proxy/main.go +++ b/cmd/routatic-proxy/main.go @@ -173,11 +173,20 @@ func serveCmd() *cobra.Command { fmt.Printf("Starting %s v%s\n", appName, version) fmt.Printf("Listening on %s:%d\n", cfg.Host, cfg.Port) - fmt.Printf("Forwarding to: %s\n", cfg.OpenCodeGo.BaseURL) + if cfg.AnthropicFirst.Enabled { + fmt.Printf("Forwarding to Anthropic first: %s\n", cfg.AnthropicFirst.BaseURL) + fmt.Printf("OpenCode fallback: %s\n", cfg.OpenCodeGo.BaseURL) + } else { + fmt.Printf("Forwarding to: %s\n", cfg.OpenCodeGo.BaseURL) + } fmt.Println() fmt.Println("Configure Claude Code with:") fmt.Printf(" export ANTHROPIC_BASE_URL=http://%s:%d\n", cfg.Host, cfg.Port) - fmt.Println(" export ANTHROPIC_AUTH_TOKEN=unused") + if cfg.AnthropicFirst.Enabled { + fmt.Println(" unset ANTHROPIC_AUTH_TOKEN ANTHROPIC_API_KEY") + } else { + fmt.Println(" export ANTHROPIC_AUTH_TOKEN=unused") + } fmt.Println() return srv.Start() @@ -336,7 +345,7 @@ func checkCmd() *cobra.Command { env[key] = value } } - conflicts += checkClaudeEnv("environment", env, expectedURL) + conflicts += checkClaudeEnv("environment", env, expectedURL, cfg.AnthropicFirst.Enabled) home, err := os.UserHomeDir() if err != nil { @@ -361,7 +370,7 @@ func checkCmd() *cobra.Command { fmt.Printf("%s: %v\n", path, err) continue } - conflicts += checkClaudeEnv(path, settings.Env, expectedURL) + conflicts += checkClaudeEnv(path, settings.Env, expectedURL, cfg.AnthropicFirst.Enabled) } } @@ -379,7 +388,7 @@ func checkCmd() *cobra.Command { // checkClaudeEnv checks a single environment map for conflicting Claude Code settings. // Returns the number of conflicts found. -func checkClaudeEnv(source string, env map[string]string, expectedURL string) int { +func checkClaudeEnv(source string, env map[string]string, expectedURL string, anthropicFirst bool) int { conflicts := 0 if value, ok := env["ANTHROPIC_BASE_URL"]; ok { normalized := strings.TrimRight(value, "/") @@ -393,11 +402,14 @@ func checkClaudeEnv(source string, env map[string]string, expectedURL string) in conflicts++ } if value, ok := env["ANTHROPIC_AUTH_TOKEN"]; ok { - if value != "unused" { + if anthropicFirst { + fmt.Printf("%s: ANTHROPIC_AUTH_TOKEN is set; unset it to keep the saved Claude subscription login active\n", source) + conflicts++ + } else if value != "unused" { fmt.Printf("%s: ANTHROPIC_AUTH_TOKEN is %q, expected \"unused\"\n", source, value) conflicts++ } - } else { + } else if !anthropicFirst { fmt.Printf("%s: ANTHROPIC_AUTH_TOKEN is not set (recommended: \"unused\")\n", source) } return conflicts @@ -433,7 +445,6 @@ func modelsCmd() *cobra.Command { fmt.Println() fmt.Println("Available OpenCode Zen models (free tier):") fmt.Println() - fmt.Println(" deepseek-v4-pro OpenAI-compatible") fmt.Println(" deepseek-v4-flash-free OpenAI-compatible") fmt.Println(" grok-build-0.1 OpenAI-compatible") fmt.Println(" big-pickle OpenAI-compatible") @@ -564,42 +575,50 @@ func getDefaultConfig() string { "port": 3456, "hot_reload": false, "enable_streaming_scenario_routing": false, - "respect_requested_model": true, + "respect_requested_model": false, + "anthropic_first": { + "enabled": false, + "base_url": "https://api.anthropic.com" + }, "models": { "background": { "provider": "opencode-go", - "model_id": "qwen3.5-plus", + "model_id": "deepseek-v4-flash", "temperature": 0.5, "max_tokens": 2048 }, "default": { "provider": "opencode-go", - "model_id": "kimi-k2.6", + "model_id": "deepseek-v4-pro", "temperature": 0.7, - "max_tokens": 4096 + "max_tokens": 8192, + "reasoning_effort": "max", + "thinking": { "type": "enabled" } }, "long_context": { "provider": "opencode-go", - "model_id": "minimax-m2.5", + "model_id": "minimax-m3", "temperature": 0.7, "max_tokens": 16384, "context_threshold": 80000 }, "think": { "provider": "opencode-go", - "model_id": "glm-5.1", + "model_id": "glm-5.2", "temperature": 0.7, "max_tokens": 8192 }, "complex": { "provider": "opencode-go", - "model_id": "glm-5.1", + "model_id": "deepseek-v4-pro", "temperature": 0.7, - "max_tokens": 4096 + "max_tokens": 8192, + "reasoning_effort": "max", + "thinking": { "type": "enabled" } }, "fast": { "provider": "opencode-go", - "model_id": "qwen3.6-plus", + "model_id": "deepseek-v4-flash", "temperature": 0.7, "max_tokens": 4096 }, @@ -630,28 +649,46 @@ func getDefaultConfig() string { }, "fallbacks": { "background": [ - { "provider": "opencode-go", "model_id": "qwen3.6-plus" }, - { "provider": "opencode-go", "model_id": "minimax-m2.5" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "default": [ - { "provider": "opencode-go", "model_id": "mimo-v2.5-pro" }, - { "provider": "opencode-go", "model_id": "qwen3.6-plus" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "long_context": [ - { "provider": "opencode-go", "model_id": "minimax-m2.7" }, - { "provider": "opencode-go", "model_id": "kimi-k2.6" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "think": [ - { "provider": "opencode-go", "model_id": "kimi-k2.6" }, - { "provider": "opencode-go", "model_id": "mimo-v2.5-pro" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "complex": [ - { "provider": "opencode-go", "model_id": "glm-5.1" }, - { "provider": "opencode-go", "model_id": "kimi-k2.6" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "fast": [ - { "provider": "opencode-go", "model_id": "qwen3.5-plus" }, - { "provider": "opencode-go", "model_id": "minimax-m2.5" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "glm-5.2": [ { "provider": "opencode-go", "model_id": "glm-5.1" }, @@ -775,7 +812,7 @@ func getDefaultConfig() string { "opencode_go": { "base_url": "https://opencode.ai/zen/go/v1/chat/completions", "anthropic_base_url": "https://opencode.ai/zen/go/v1/messages", - "api_key": "${ROUTATIC_PROXY_OPENCODE_GO_API_KEY}", + "api_key": "", "api_keys": [], "timeout_ms": 300000 }, @@ -784,7 +821,7 @@ func getDefaultConfig() string { "anthropic_base_url": "https://opencode.ai/zen/v1/messages", "responses_base_url": "https://opencode.ai/zen/v1/responses", "gemini_base_url": "https://opencode.ai/zen/v1/models", - "api_key": "${ROUTATIC_PROXY_OPENCODE_ZEN_API_KEY}", + "api_key": "", "api_keys": [], "timeout_ms": 300000 }, diff --git a/cmd/routatic-proxy/main_test.go b/cmd/routatic-proxy/main_test.go new file mode 100644 index 0000000..c7a8b1f --- /dev/null +++ b/cmd/routatic-proxy/main_test.go @@ -0,0 +1,24 @@ +package main + +import ( + "os" + "path/filepath" + "testing" + + "github.com/routatic/proxy/internal/config" +) + +func TestDefaultConfigValidWithGlobalAPIKey(t *testing.T) { + t.Setenv("ROUTATIC_PROXY_API_KEY", "test-key") + path := filepath.Join(t.TempDir(), "config.json") + if err := os.WriteFile(path, []byte(getDefaultConfig()), 0600); err != nil { + t.Fatal(err) + } + cfg, err := config.LoadFromPath(path) + if err != nil { + t.Fatalf("generated default config is invalid: %v", err) + } + if cfg.AnthropicFirst.BaseURL != "https://api.anthropic.com" { + t.Fatalf("AnthropicFirst=%+v", cfg.AnthropicFirst) + } +} diff --git a/configs/config.example.json b/configs/config.example.json index 9324ceb..421a36d 100644 --- a/configs/config.example.json +++ b/configs/config.example.json @@ -5,29 +5,32 @@ "hot_reload": false, "enable_streaming_scenario_routing": false, "respect_requested_model": false, + "anthropic_first": { + "enabled": false, + "base_url": "https://api.anthropic.com" + }, "models": { "background": { "provider": "opencode-go", - "model_id": "qwen3.5-plus", + "model_id": "deepseek-v4-flash", "temperature": 0.5, - "max_tokens": 2048, - "vision": true + "max_tokens": 2048 }, "default": { "provider": "opencode-go", - "model_id": "kimi-k2.6", + "model_id": "deepseek-v4-pro", "temperature": 0.7, - "max_tokens": 4096, - "vision": true + "max_tokens": 8192, + "reasoning_effort": "max", + "thinking": { "type": "enabled" } }, "long_context": { "provider": "opencode-go", - "model_id": "minimax-m2.5", + "model_id": "minimax-m3", "temperature": 0.7, "max_tokens": 16384, - "context_threshold": 80000, - "vision": true + "context_threshold": 80000 }, "deepseek-v4-pro": { "provider": "opencode-go", @@ -51,17 +54,17 @@ }, "think": { "provider": "opencode-go", - "model_id": "glm-5.1", + "model_id": "glm-5.2", "temperature": 0.7, - "max_tokens": 8192, - "vision": true + "max_tokens": 8192 }, "complex": { "provider": "opencode-go", - "model_id": "glm-5.1", + "model_id": "deepseek-v4-pro", "temperature": 0.7, - "max_tokens": 4096, - "vision": true + "max_tokens": 8192, + "reasoning_effort": "max", + "thinking": { "type": "enabled" } }, "glm-5.2": { "provider": "opencode-go", @@ -93,37 +96,54 @@ }, "fast": { "provider": "opencode-go", - "model_id": "qwen3.6-plus", + "model_id": "deepseek-v4-flash", "temperature": 0.7, - "max_tokens": 4096, - "vision": true + "max_tokens": 4096 } }, "fallbacks": { "background": [ - { "provider": "opencode-go", "model_id": "qwen3.6-plus" }, - { "provider": "opencode-go", "model_id": "minimax-m2.5" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "default": [ - { "provider": "opencode-go", "model_id": "mimo-v2.5-pro" }, - { "provider": "opencode-go", "model_id": "qwen3.6-plus" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "long_context": [ - { "provider": "opencode-go", "model_id": "minimax-m2.7" }, - { "provider": "opencode-go", "model_id": "kimi-k2.6" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "think": [ - { "provider": "opencode-go", "model_id": "kimi-k2.6" }, - { "provider": "opencode-go", "model_id": "mimo-v2.5-pro" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "complex": [ - { "provider": "opencode-go", "model_id": "glm-5" }, - { "provider": "opencode-go", "model_id": "kimi-k2.6" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "fast": [ - { "provider": "opencode-go", "model_id": "qwen3.5-plus" }, - { "provider": "opencode-go", "model_id": "minimax-m2.5" } + { "provider": "opencode-go", "model_id": "qwen3.7-plus" }, + { "provider": "opencode-go", "model_id": "qwen3.7-max" }, + { "provider": "opencode-zen", "model_id": "nemotron-3-ultra-free" }, + { "provider": "opencode-zen", "model_id": "mimo-v2.5-free" }, + { "provider": "opencode-zen", "model_id": "deepseek-v4-flash-free" } ], "glm-5.2": [ { "provider": "opencode-go", "model_id": "glm-5.1" }, diff --git a/internal/config/config.go b/internal/config/config.go index a10f47a..7d78b02 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -18,10 +18,17 @@ type Config struct { AWSBedrock AWSBedrockConfig `json:"aws_bedrock"` OpenCodeGo OpenCodeGoConfig `json:"opencode_go"` OpenCodeZen OpenCodeZenConfig `json:"opencode_zen"` + AnthropicFirst AnthropicFirstConfig `json:"anthropic_first"` Logging LoggingConfig `json:"logging"` Debug DebugConfig `json:"debug"` } +// AnthropicFirstConfig controls direct Anthropic passthrough with OpenCode fallback. +type AnthropicFirstConfig struct { + Enabled bool `json:"enabled"` + BaseURL string `json:"base_url"` +} + // DebugConfig holds debug-related configuration. type DebugConfig struct { CaptureEnabled bool `json:"capture_enabled"` diff --git a/internal/config/loader.go b/internal/config/loader.go index 0ad3cd1..6b1abed 100644 --- a/internal/config/loader.go +++ b/internal/config/loader.go @@ -3,6 +3,7 @@ package config import ( "encoding/json" "fmt" + "net/url" "os" "path/filepath" "regexp" @@ -19,6 +20,7 @@ const ( defaultAnthropicBaseURL = "https://opencode.ai/zen/go/v1/messages" defaultTimeoutMs = 300000 defaultLogLevel = "info" + defaultAnthropicAPIURL = "https://api.anthropic.com" defaultZenBaseURL = "https://opencode.ai/zen/v1/chat/completions" defaultZenAnthropicBaseURL = "https://opencode.ai/zen/v1/messages" @@ -230,6 +232,9 @@ func applyDefaults(cfg *Config) { if cfg.Port == 0 { cfg.Port = defaultPort } + if cfg.AnthropicFirst.BaseURL == "" { + cfg.AnthropicFirst.BaseURL = defaultAnthropicAPIURL + } if cfg.OpenCodeGo.BaseURL == "" { cfg.OpenCodeGo.BaseURL = defaultBaseURL } @@ -284,6 +289,12 @@ func validate(cfg *Config) error { if cfg.APIKey == "" && len(cfg.APIKeys) == 0 { return fmt.Errorf("api_key or api_keys is required (set via config file or ROUTATIC_PROXY_API_KEY env var; OC_GO_CC_API_KEY is still supported)") } + if cfg.AnthropicFirst.Enabled { + u, err := url.Parse(cfg.AnthropicFirst.BaseURL) + if err != nil || u.Host == "" || (u.Scheme != "http" && u.Scheme != "https") { + return fmt.Errorf("anthropic_first.base_url must be an absolute http or https URL") + } + } if err := validateAPIKeys(cfg.APIKeys); err != nil { return err diff --git a/internal/config/loader_test.go b/internal/config/loader_test.go index 2083874..c5acc97 100644 --- a/internal/config/loader_test.go +++ b/internal/config/loader_test.go @@ -64,6 +64,28 @@ func TestLoadJSON(t *testing.T) { } } +func TestAnthropicFirstDefaultsAndValidation(t *testing.T) { + dir := t.TempDir() + cfgPath := filepath.Join(dir, "config.json") + if err := os.WriteFile(cfgPath, []byte(`{"api_key":"test-key","anthropic_first":{"enabled":true}}`), 0600); err != nil { + t.Fatal(err) + } + cfg, err := LoadFromPath(cfgPath) + if err != nil { + t.Fatal(err) + } + if !cfg.AnthropicFirst.Enabled || cfg.AnthropicFirst.BaseURL != "https://api.anthropic.com" { + t.Fatalf("AnthropicFirst=%+v", cfg.AnthropicFirst) + } + + if err := os.WriteFile(cfgPath, []byte(`{"api_key":"test-key","anthropic_first":{"enabled":true,"base_url":"not-a-url"}}`), 0600); err != nil { + t.Fatal(err) + } + if _, err := LoadFromPath(cfgPath); err == nil { + t.Fatal("expected invalid anthropic_first.base_url to fail validation") + } +} + func TestLoadJSON_WithModelOverrides(t *testing.T) { dir := t.TempDir() cfgPath := filepath.Join(dir, "config.json") diff --git a/internal/config/model_registry.go b/internal/config/model_registry.go index 5bbefed..3d07e2b 100644 --- a/internal/config/model_registry.go +++ b/internal/config/model_registry.go @@ -10,24 +10,26 @@ type ModelMetadata struct { } var modelMetadata = map[string]ModelMetadata{ - "deepseek-v4-pro": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, - "deepseek-v4-flash": {ContextWindow: 1000000, MaxOutputTokens: 4096, Vision: false, SupportsTools: true}, - "glm-5.2": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, - "glm-5.1": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, - "glm-5": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, - "kimi-k2.7-code": {ContextWindow: 256000, MaxOutputTokens: 32768, Vision: true, SupportsTools: true}, - "kimi-k2.6": {ContextWindow: 256000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, - "kimi-k2.5": {ContextWindow: 256000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, - "mimo-v2-omni": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, - "mimo-v2.5-pro": {ContextWindow: 1000000, MaxOutputTokens: 16384, Vision: false, SupportsTools: true}, - "mimo-v2.5": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, - "minimax-m3": {ContextWindow: 1000000, MaxOutputTokens: 128000, Vision: false, SupportsTools: true}, - "minimax-m2.7": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, - "minimax-m2.5": {ContextWindow: 200000, MaxOutputTokens: 4096, Vision: false, SupportsTools: true}, - "qwen3.7-max": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, - "qwen3.7-plus": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, - "qwen3.6-plus": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, - "qwen3.5-plus": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "deepseek-v4-pro": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "deepseek-v4-flash": {ContextWindow: 1000000, MaxOutputTokens: 4096, Vision: false, SupportsTools: true}, + "deepseek-v4-flash-free": {ContextWindow: 1000000, MaxOutputTokens: 4096, Vision: false, SupportsTools: true}, + "glm-5.2": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "glm-5.1": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "glm-5": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "kimi-k2.7-code": {ContextWindow: 256000, MaxOutputTokens: 32768, Vision: true, SupportsTools: true}, + "kimi-k2.6": {ContextWindow: 256000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "kimi-k2.5": {ContextWindow: 256000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "mimo-v2-omni": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "mimo-v2.5-pro": {ContextWindow: 1000000, MaxOutputTokens: 16384, Vision: false, SupportsTools: true}, + "mimo-v2.5": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "minimax-m3": {ContextWindow: 1000000, MaxOutputTokens: 128000, Vision: false, SupportsTools: true}, + "minimax-m2.7": {ContextWindow: 200000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "minimax-m2.5": {ContextWindow: 200000, MaxOutputTokens: 4096, Vision: false, SupportsTools: true}, + "qwen3.7-max": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "qwen3.7-plus": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "qwen3.6-plus": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, + "mimo-v2.5-free": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: false, SupportsTools: true}, + "qwen3.5-plus": {ContextWindow: 1000000, MaxOutputTokens: 8192, Vision: true, SupportsTools: true}, } func ResolveModelConfig(model ModelConfig) ModelConfig { diff --git a/internal/handlers/anthropic_first.go b/internal/handlers/anthropic_first.go new file mode 100644 index 0000000..c774702 --- /dev/null +++ b/internal/handlers/anthropic_first.go @@ -0,0 +1,287 @@ +package handlers + +import ( + "bytes" + "encoding/json" + "io" + "log/slog" + "net" + "net/http" + "net/url" + "strconv" + "strings" + "sync" + "time" + + "github.com/routatic/proxy/internal/config" +) + +const ( + availabilityBackoffStart = 30 * time.Second + availabilityBackoffMax = 15 * time.Minute + maxAnthropicBodySize = 100 << 20 +) + +// AnthropicFirstHandler sends inference to Anthropic and uses the existing +// OpenCode handler only while Anthropic is unavailable. +type AnthropicFirstHandler struct { + atomic *config.AtomicConfig + fallback http.Handler + client *http.Client + logger *slog.Logger + gate availabilityGate +} + +type availabilityGate struct { + mu sync.Mutex + baseURL string + unavailable bool + probing bool + failures int + nextProbe time.Time +} + +type availabilityAttempt struct { + baseURL string + probe bool +} + +func (g *availabilityGate) allow(now time.Time, baseURL string) (availabilityAttempt, bool) { + g.mu.Lock() + defer g.mu.Unlock() + + if g.baseURL != baseURL { + g.resetLocked(baseURL) + } + if !g.unavailable { + return availabilityAttempt{baseURL: baseURL}, true + } + if now.Before(g.nextProbe) || g.probing { + return availabilityAttempt{}, false + } + g.probing = true + return availabilityAttempt{baseURL: baseURL, probe: true}, true +} + +func (g *availabilityGate) failed(now time.Time, attempt availabilityAttempt, retryAfter string) time.Duration { + g.mu.Lock() + defer g.mu.Unlock() + if attempt.baseURL != g.baseURL { + return 0 + } + + g.unavailable = true + if attempt.probe { + g.probing = false + } + g.failures++ + delay, ok := parseRetryAfter(retryAfter, now) + if !ok { + delay = availabilityBackoffStart + for i := 1; i < g.failures && delay < availabilityBackoffMax; i++ { + delay *= 2 + } + if delay > availabilityBackoffMax { + delay = availabilityBackoffMax + } + } + g.nextProbe = now.Add(delay) + return delay +} + +func (g *availabilityGate) available(attempt availabilityAttempt) { + if !attempt.probe { + return + } + g.mu.Lock() + defer g.mu.Unlock() + if attempt.baseURL != g.baseURL { + return + } + g.resetLocked(g.baseURL) +} + +func (g *availabilityGate) reset(baseURL string) { + g.mu.Lock() + defer g.mu.Unlock() + g.resetLocked(baseURL) +} + +func (g *availabilityGate) resetLocked(baseURL string) { + g.baseURL = baseURL + g.unavailable = false + g.probing = false + g.failures = 0 + g.nextProbe = time.Time{} +} + +func parseRetryAfter(value string, now time.Time) (time.Duration, bool) { + if seconds, err := strconv.Atoi(strings.TrimSpace(value)); err == nil && seconds >= 0 { + return time.Duration(seconds) * time.Second, true + } + when, err := http.ParseTime(value) + if err != nil { + return 0, false + } + if when.Before(now) { + return 0, true + } + return when.Sub(now), true +} + +// NewAnthropicFirstHandler creates the opt-in Anthropic passthrough layer. +func NewAnthropicFirstHandler(atomic *config.AtomicConfig, fallback http.Handler) *AnthropicFirstHandler { + transport := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{Timeout: 10 * time.Second, KeepAlive: 30 * time.Second}).DialContext, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: time.Second, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 20, + IdleConnTimeout: 90 * time.Second, + } + return &AnthropicFirstHandler{ + atomic: atomic, + fallback: fallback, + client: &http.Client{ + Transport: transport, + CheckRedirect: func(_ *http.Request, _ []*http.Request) error { + return http.ErrUseLastResponse + }, + }, + logger: slog.Default(), + } +} + +func (h *AnthropicFirstHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + cfg := h.atomic.Get().AnthropicFirst + if !cfg.Enabled || r.Method != http.MethodPost { + if !cfg.Enabled { + h.gate.reset(cfg.BaseURL) + } + h.fallback.ServeHTTP(w, r) + return + } + + body, err := io.ReadAll(io.LimitReader(r.Body, maxAnthropicBodySize+1)) + if err != nil { + h.writeError(w, http.StatusBadRequest, "failed to read request body") + return + } + if len(body) > maxAnthropicBodySize { + h.writeError(w, http.StatusRequestEntityTooLarge, "request body too large") + return + } + + attempt, allowed := h.gate.allow(time.Now(), cfg.BaseURL) + if !allowed { + h.serveFallback(w, r, body) + return + } + + upstreamReq, err := newAnthropicRequest(r, cfg.BaseURL, body) + if err != nil { + h.writeError(w, http.StatusInternalServerError, "invalid Anthropic base URL") + return + } + resp, err := h.client.Do(upstreamReq) + if err != nil { + if r.Context().Err() != nil { + return + } + delay := h.gate.failed(time.Now(), attempt, "") + h.logger.Warn("Anthropic unavailable, using OpenCode", "error", err, "retry_in", delay) + h.serveFallback(w, r, body) + return + } + defer func() { _ = resp.Body.Close() }() + + if isAnthropicAvailabilityFailure(resp.StatusCode) { + delay := h.gate.failed(time.Now(), attempt, resp.Header.Get("Retry-After")) + h.logger.Warn("Anthropic unavailable, using OpenCode", "status", resp.StatusCode, "retry_in", delay) + _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64<<10)) + h.serveFallback(w, r, body) + return + } + + h.gate.available(attempt) + h.logger.Debug("Anthropic request succeeded", "status", resp.StatusCode) + copyHeader(w.Header(), resp.Header) + w.WriteHeader(resp.StatusCode) + copyStreamingResponse(w, resp.Body) +} + +func newAnthropicRequest(in *http.Request, baseURL string, body []byte) (*http.Request, error) { + base, err := url.Parse(baseURL) + if err != nil { + return nil, err + } + base.Path = strings.TrimRight(base.Path, "/") + in.URL.Path + base.RawQuery = in.URL.RawQuery + out, err := http.NewRequestWithContext(in.Context(), in.Method, base.String(), bytes.NewReader(body)) + if err != nil { + return nil, err + } + out.Header = in.Header.Clone() + removeHopHeaders(out.Header) + out.ContentLength = int64(len(body)) + return out, nil +} + +func isAnthropicAvailabilityFailure(status int) bool { + return status == http.StatusRequestTimeout || status == http.StatusTooManyRequests || status >= 500 +} + +func (h *AnthropicFirstHandler) serveFallback(w http.ResponseWriter, r *http.Request, body []byte) { + r.Body = io.NopCloser(bytes.NewReader(body)) + r.ContentLength = int64(len(body)) + h.fallback.ServeHTTP(w, r) +} + +func copyHeader(dst, src http.Header) { + removeHopHeaders(src) + for key, values := range src { + dst[key] = append([]string(nil), values...) + } +} + +func removeHopHeaders(header http.Header) { + for _, value := range header.Values("Connection") { + for _, key := range strings.Split(value, ",") { + header.Del(strings.TrimSpace(key)) + } + } + for _, key := range []string{ + "Connection", "Proxy-Connection", "Keep-Alive", "Proxy-Authenticate", + "Proxy-Authorization", "Te", "Trailer", "Transfer-Encoding", "Upgrade", + } { + header.Del(key) + } +} + +func copyStreamingResponse(w http.ResponseWriter, body io.Reader) { + buf := make([]byte, 32<<10) + for { + n, err := body.Read(buf) + if n > 0 { + if _, writeErr := w.Write(buf[:n]); writeErr != nil { + return + } + if flusher, ok := w.(http.Flusher); ok { + flusher.Flush() + } + } + if err != nil { + return + } + } +} + +func (h *AnthropicFirstHandler) writeError(w http.ResponseWriter, status int, message string) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(map[string]any{ + "type": "error", + "error": map[string]string{"type": "invalid_request_error", "message": message}, + }) +} diff --git a/internal/handlers/anthropic_first_test.go b/internal/handlers/anthropic_first_test.go new file mode 100644 index 0000000..751ac26 --- /dev/null +++ b/internal/handlers/anthropic_first_test.go @@ -0,0 +1,228 @@ +package handlers + +import ( + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/routatic/proxy/internal/config" +) + +func newAnthropicFirstTestHandler(baseURL string, enabled bool, fallback http.Handler) *AnthropicFirstHandler { + cfg := &config.Config{AnthropicFirst: config.AnthropicFirstConfig{Enabled: enabled, BaseURL: baseURL}} + return NewAnthropicFirstHandler(config.NewAtomicConfig(cfg, "/tmp/test-config.json"), fallback) +} + +func TestAnthropicFirstHealthyPassthrough(t *testing.T) { + var gotBody, gotBeta, gotAuth, gotPath string + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + gotBody = string(body) + gotBeta = r.Header.Get("anthropic-beta") + gotAuth = r.Header.Get("Authorization") + gotPath = r.URL.RequestURI() + w.Header().Set("Content-Type", "text/event-stream") + w.Header().Set("X-Upstream", "anthropic") + w.WriteHeader(http.StatusCreated) + _, _ = io.WriteString(w, "event: message_stop\ndata: {}\n\n") + })) + defer upstream.Close() + + var fallbackCalls atomic.Int32 + h := newAnthropicFirstTestHandler(upstream.URL, true, http.HandlerFunc(func(http.ResponseWriter, *http.Request) { + fallbackCalls.Add(1) + })) + req := httptest.NewRequest(http.MethodPost, "/v1/messages?beta=true", strings.NewReader(`{"model":"claude-sonnet-4-6"}`)) + req.Header.Set("Authorization", "Bearer oauth-token") + req.Header.Set("anthropic-beta", "oauth-2025-04-20,context-management-2025-06-27") + rec := httptest.NewRecorder() + + h.ServeHTTP(rec, req) + + if rec.Code != http.StatusCreated || rec.Header().Get("X-Upstream") != "anthropic" { + t.Fatalf("response = %d %v, want Anthropic 201", rec.Code, rec.Header()) + } + if gotBody != `{"model":"claude-sonnet-4-6"}` || gotAuth != "Bearer oauth-token" { + t.Fatalf("body/auth not forwarded: body=%q auth=%q", gotBody, gotAuth) + } + if gotBeta == "" || gotPath != "/v1/messages?beta=true" { + t.Fatalf("beta/path not forwarded: beta=%q path=%q", gotBeta, gotPath) + } + if fallbackCalls.Load() != 0 { + t.Fatal("fallback called for healthy Anthropic response") + } +} + +func TestAnthropicFirstFallsBackOnAvailabilityFailures(t *testing.T) { + for _, status := range []int{http.StatusRequestTimeout, http.StatusTooManyRequests, 500, 503, 529} { + t.Run(http.StatusText(status), func(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Retry-After", "120") + http.Error(w, "unavailable", status) + })) + defer upstream.Close() + + var gotBody string + h := newAnthropicFirstTestHandler(upstream.URL, true, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + gotBody = string(body) + w.Header().Set("X-Upstream", "opencode") + w.WriteHeader(http.StatusOK) + })) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("request-body"))) + + if rec.Code != http.StatusOK || rec.Header().Get("X-Upstream") != "opencode" || gotBody != "request-body" { + t.Fatalf("fallback response=%d header=%q body=%q", rec.Code, rec.Header().Get("X-Upstream"), gotBody) + } + }) + } +} + +func TestAnthropicFirstPreservesClientErrors(t *testing.T) { + for _, status := range []int{400, 401, 403, 404, 422} { + t.Run(http.StatusText(status), func(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("X-Error", "original") + w.WriteHeader(status) + _, _ = io.WriteString(w, `{"type":"error","error":{"message":"original"}}`) + })) + defer upstream.Close() + + var fallbackCalls atomic.Int32 + h := newAnthropicFirstTestHandler(upstream.URL, true, http.HandlerFunc(func(http.ResponseWriter, *http.Request) { + fallbackCalls.Add(1) + })) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))) + + if rec.Code != status || rec.Header().Get("X-Error") != "original" || !strings.Contains(rec.Body.String(), "original") { + t.Fatalf("response was not preserved: status=%d headers=%v body=%q", rec.Code, rec.Header(), rec.Body.String()) + } + if fallbackCalls.Load() != 0 { + t.Fatal("fallback called for client error") + } + }) + } +} + +func TestAnthropicFirstDisabledUsesFallback(t *testing.T) { + var calls atomic.Int32 + h := newAnthropicFirstTestHandler("https://api.anthropic.com", false, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + calls.Add(1) + w.WriteHeader(http.StatusNoContent) + })) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))) + if calls.Load() != 1 || rec.Code != http.StatusNoContent { + t.Fatalf("disabled mode calls=%d status=%d", calls.Load(), rec.Code) + } +} + +func TestAnthropicFirstTransportFailureFallsBack(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {})) + baseURL := upstream.URL + upstream.Close() + + var calls atomic.Int32 + h := newAnthropicFirstTestHandler(baseURL, true, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + calls.Add(1) + w.WriteHeader(http.StatusNoContent) + })) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))) + if calls.Load() != 1 || rec.Code != http.StatusNoContent { + t.Fatalf("transport fallback calls=%d status=%d", calls.Load(), rec.Code) + } +} + +func TestAnthropicFirstDoesNotFollowRedirectWithCredentials(t *testing.T) { + var redirectedCalls atomic.Int32 + destination := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) { + redirectedCalls.Add(1) + })) + defer destination.Close() + source := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Location", destination.URL) + w.WriteHeader(http.StatusTemporaryRedirect) + })) + defer source.Close() + + h := newAnthropicFirstTestHandler(source.URL, true, http.HandlerFunc(func(http.ResponseWriter, *http.Request) { + t.Fatal("fallback called for redirect") + })) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/v1/messages", strings.NewReader("{}"))) + if rec.Code != http.StatusTemporaryRedirect || redirectedCalls.Load() != 0 { + t.Fatalf("status=%d redirectedCalls=%d", rec.Code, redirectedCalls.Load()) + } +} + +func TestAvailabilityGateBackoffAndSingleProbe(t *testing.T) { + var gate availabilityGate + now := time.Date(2026, 6, 28, 12, 0, 0, 0, time.UTC) + attempt, ok := gate.allow(now, "https://api.anthropic.com") + if !ok || attempt.probe { + t.Fatal("healthy gate should allow a normal request") + } + if delay := gate.failed(now, attempt, ""); delay != 30*time.Second { + t.Fatalf("first backoff=%v, want 30s", delay) + } + if _, ok := gate.allow(now.Add(29*time.Second), "https://api.anthropic.com"); ok { + t.Fatal("request allowed before backoff elapsed") + } + probe, ok := gate.allow(now.Add(30*time.Second), "https://api.anthropic.com") + if !ok || !probe.probe { + t.Fatal("expected half-open probe") + } + if _, ok := gate.allow(now.Add(30*time.Second), "https://api.anthropic.com"); ok { + t.Fatal("concurrent half-open probe allowed") + } + if delay := gate.failed(now.Add(30*time.Second), probe, "120"); delay != 2*time.Minute { + t.Fatalf("Retry-After delay=%v, want 2m", delay) + } + probe, ok = gate.allow(now.Add(150*time.Second), "https://api.anthropic.com") + if !ok || !probe.probe { + t.Fatal("expected second half-open probe") + } + gate.available(probe) + if attempt, ok = gate.allow(now.Add(150*time.Second), "https://api.anthropic.com"); !ok || attempt.probe { + t.Fatal("successful probe did not close gate") + } +} + +func TestAvailabilityGateConcurrentProbe(t *testing.T) { + var gate availabilityGate + now := time.Now() + attempt, _ := gate.allow(now, "upstream") + gate.failed(now, attempt, "0") + + var allowed atomic.Int32 + var wg sync.WaitGroup + for range 20 { + wg.Add(1) + go func() { + defer wg.Done() + if _, ok := gate.allow(now, "upstream"); ok { + allowed.Add(1) + } + }() + } + wg.Wait() + if allowed.Load() != 1 { + t.Fatalf("allowed %d probes, want 1", allowed.Load()) + } +} + +func TestParseRetryAfterHTTPDate(t *testing.T) { + now := time.Date(2026, 6, 28, 12, 0, 0, 0, time.UTC) + delay, ok := parseRetryAfter(now.Add(90*time.Second).Format(http.TimeFormat), now) + if !ok || delay != 90*time.Second { + t.Fatalf("delay=%v ok=%v, want 90s", delay, ok) + } +} diff --git a/internal/handlers/messages.go b/internal/handlers/messages.go index 745550f..4dd2be3 100644 --- a/internal/handlers/messages.go +++ b/internal/handlers/messages.go @@ -502,6 +502,7 @@ func (h *MessagesHandler) handleStreaming( defer heartbeatCancel() streamStart := time.Now() + blockedProviders := make(map[string]bool) for _, model := range modelChain { select { @@ -510,6 +511,11 @@ func (h *MessagesHandler) handleStreaming( return default: } + providerName := client.Provider(model) + if blockedProviders[providerName] { + h.logger.Info("provider usage limit reached, skipping streaming model", "provider", providerName, "model", model.ModelID) + continue + } h.logger.Info("attempting streaming model", "model", model.ModelID, "provider", model.Provider) @@ -593,6 +599,9 @@ func (h *MessagesHandler) handleStreaming( h.logger.Debug("client disconnected during upstream request") return } + if router.IsUsageLimitError(err) { + blockedProviders[providerName] = true + } h.logger.Warn("streaming request failed via provider", "model", model.ModelID, "provider", model.Provider, "error", err) continue } diff --git a/internal/handlers/messages_test.go b/internal/handlers/messages_test.go index 34331dc..5a113bc 100644 --- a/internal/handlers/messages_test.go +++ b/internal/handlers/messages_test.go @@ -356,6 +356,87 @@ func equalStrings(a, b []string) bool { return true } +type usageLimitStreamProvider struct { + name string + calls *int + err error + body string +} + +func (p *usageLimitStreamProvider) Name() string { return p.name } +func (p *usageLimitStreamProvider) Capabilities() core.ProviderCapabilities { + return core.ProviderCapabilities{SupportsStreaming: true, SupportsTools: true} +} +func (p *usageLimitStreamProvider) ModelCapabilities(string) (core.ProviderCapabilities, bool) { + return p.Capabilities(), true +} +func (p *usageLimitStreamProvider) WireFormat(string) core.WireFormat { + return core.WireFormatAnthropic +} +func (p *usageLimitStreamProvider) Execute(context.Context, *core.NormalizedRequest, config.ModelConfig) (*core.ExecuteResult, error) { + return nil, p.err +} +func (p *usageLimitStreamProvider) Stream(context.Context, *core.NormalizedRequest, config.ModelConfig) (io.ReadCloser, error) { + *p.calls++ + if p.err != nil { + return nil, p.err + } + return io.NopCloser(strings.NewReader(p.body)), nil +} +func (p *usageLimitStreamProvider) RoundTripName(model config.ModelConfig) string { + return model.ModelID +} +func (p *usageLimitStreamProvider) StreamIdleTimeout(config.ModelConfig) time.Duration { + return time.Minute +} + +func TestHandleStreaming_UsageLimitSkipsRemainingProviderModels(t *testing.T) { + cfg := &config.Config{ + OpenCodeGo: config.OpenCodeGoConfig{TimeoutMs: 5000, StreamTimeoutMs: 5000}, + OpenCodeZen: config.OpenCodeZenConfig{TimeoutMs: 5000, StreamTimeoutMs: 5000}, + } + atomicCfg := config.NewAtomicConfig(cfg, "/tmp/test-config.json") + registry := core.NewProviderRegistry() + goCalls, zenCalls := 0, 0 + _ = registry.Register(&usageLimitStreamProvider{ + name: "opencode-go", calls: &goCalls, + err: &client.APIError{StatusCode: 429, Body: `{"type":"GoUsageLimitError"}`}, + }) + _ = registry.Register(&usageLimitStreamProvider{ + name: "opencode-zen", calls: &zenCalls, + body: "event: message_start\ndata: {}\n\nevent: message_stop\ndata: {}\n\n", + }) + h := &MessagesHandler{ + client: client.NewOpenCodeClient(atomicCfg, nil), + providerRegistry: registry, + streamProxy: NewStreamProxy(), + logger: slog.Default(), + metrics: metrics.New(), + } + stream := true + req := httptest.NewRequest(http.MethodPost, "/v1/messages", nil) + rec := httptest.NewRecorder() + h.handleStreaming( + rec, + req, + &types.MessageRequest{Stream: &stream}, + &core.NormalizedRequest{Stream: true}, + []config.ModelConfig{ + {Provider: "opencode-go", ModelID: "deepseek-v4-pro"}, + {Provider: "opencode-go", ModelID: "qwen3.7-plus"}, + {Provider: "opencode-zen", ModelID: "nemotron-3-ultra-free"}, + }, + nil, + router.ScenarioDefault, + ) + if goCalls != 1 || zenCalls != 1 { + t.Fatalf("goCalls=%d zenCalls=%d; want 1 each", goCalls, zenCalls) + } + if !strings.Contains(rec.Body.String(), "message_stop") { + t.Fatalf("Zen stream not returned: %q", rec.Body.String()) + } +} + func TestSanitizeAnthropicBody_RemovesToolTypeField(t *testing.T) { rawBody := json.RawMessage(`{ "model": "minimax-m3", diff --git a/internal/provider/opencode_zen.go b/internal/provider/opencode_zen.go index 7e60b11..d4c0eef 100644 --- a/internal/provider/opencode_zen.go +++ b/internal/provider/opencode_zen.go @@ -60,6 +60,8 @@ func (p *OpenCodeZenProvider) ModelCapabilities(modelID string) (core.ProviderCa caps.MaxContextLength = 1_000_000 case strings.HasPrefix(modelID, "deepseek-"): caps.MaxContextLength = 1_000_000 + case strings.HasPrefix(modelID, "qwen"): + caps.MaxContextLength = 1_000_000 } return caps, true } diff --git a/internal/router/fallback.go b/internal/router/fallback.go index 4beda55..34f95f4 100644 --- a/internal/router/fallback.go +++ b/internal/router/fallback.go @@ -175,6 +175,8 @@ func (h *FallbackHandler) ExecuteWithFallback( executor func(context.Context, config.ModelConfig) ([]byte, error), ) (*FallbackResult, []byte, error) { totalModels := len(models) + blockedProviders := make(map[string]bool) + var usageLimitErr error for i, model := range models { if err := ctx.Err(); err != nil { @@ -184,6 +186,12 @@ func (h *FallbackHandler) ExecuteWithFallback( return nil, nil, err } + provider := client.Provider(model) + if blockedProviders[provider] { + h.logger.Info("provider usage limit reached, skipping model", "provider", provider, "model", model.ModelID) + continue + } + cb := h.getCircuitBreaker(model.ModelID) // Skip models with open circuit breakers @@ -225,15 +233,17 @@ func (h *FallbackHandler) ExecuteWithFallback( return nil, nil, errCtx } - // Usage limit errors should be passed directly to the client instead - // of triggering a fallback, as fallback attempts will also encounter - // the same usage limit error within a short period. + // A provider-wide usage limit makes its remaining models pointless. + // Skip them, but continue if the chain includes another provider. if IsUsageLimitError(err) { - h.logger.Warn("usage limit error encountered, passing directly to client", + usageLimitErr = err + blockedProviders[provider] = true + h.logger.Warn("provider usage limit reached, trying another provider", + "provider", provider, "model", model.ModelID, "error", err, ) - return nil, nil, err + continue } if IsRetryableError(err) { @@ -253,6 +263,15 @@ func (h *FallbackHandler) ExecuteWithFallback( } } + if usageLimitErr != nil { + return &FallbackResult{ + ModelID: models[0].ModelID, + Success: false, + Attempted: totalModels, + TotalModels: totalModels, + }, nil, usageLimitErr + } + return &FallbackResult{ ModelID: models[0].ModelID, Success: false, diff --git a/internal/router/fallback_test.go b/internal/router/fallback_test.go index e8117d9..94e7f8f 100644 --- a/internal/router/fallback_test.go +++ b/internal/router/fallback_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "log/slog" + "strings" "testing" "time" @@ -260,6 +261,48 @@ func TestExecuteWithFallback_PerModelTimeoutFallback(t *testing.T) { } } +func TestExecuteWithFallback_UsageLimitSkipsProvider(t *testing.T) { + h := NewFallbackHandler(nil, 3, time.Minute) + models := []config.ModelConfig{ + {Provider: "opencode-go", ModelID: "deepseek-v4-pro"}, + {Provider: "opencode-go", ModelID: "qwen3.7-plus"}, + {Provider: "opencode-zen", ModelID: "nemotron-3-ultra-free"}, + } + var attempted []string + result, body, err := h.ExecuteWithFallback(context.Background(), models, func(_ context.Context, model config.ModelConfig) ([]byte, error) { + attempted = append(attempted, model.ModelID) + if model.Provider == "opencode-go" { + return nil, &client.APIError{StatusCode: 429, Body: `{"type":"GoUsageLimitError"}`} + } + return []byte("zen-success"), nil + }) + if err != nil { + t.Fatal(err) + } + if result.ModelID != "nemotron-3-ultra-free" || string(body) != "zen-success" { + t.Fatalf("result=%+v body=%q", result, body) + } + if strings.Join(attempted, ",") != "deepseek-v4-pro,nemotron-3-ultra-free" { + t.Fatalf("attempted=%v; remaining Go models should be skipped", attempted) + } +} + +func TestExecuteWithFallback_UsageLimitWithoutAlternateProviderIsPreserved(t *testing.T) { + h := NewFallbackHandler(nil, 3, time.Minute) + models := []config.ModelConfig{ + {Provider: "opencode-go", ModelID: "a"}, + {Provider: "opencode-go", ModelID: "b"}, + } + calls := 0 + _, _, err := h.ExecuteWithFallback(context.Background(), models, func(_ context.Context, _ config.ModelConfig) ([]byte, error) { + calls++ + return nil, &client.APIError{StatusCode: 429, Body: `{"type":"GoUsageLimitError"}`} + }) + if !IsUsageLimitError(err) || calls != 1 { + t.Fatalf("err=%v calls=%d", err, calls) + } +} + func TestExecuteWithFallback_RealPerModelTimeout(t *testing.T) { logger := slog.Default() handler := NewFallbackHandler(logger, 3, 30*time.Second) diff --git a/internal/server/server.go b/internal/server/server.go index b0a6688..09e9a01 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -90,7 +90,7 @@ func NewServer(atomic *config.AtomicConfig, captureLogger *debug.CaptureLogger) mux := http.NewServeMux() // API routes. - mux.HandleFunc("/v1/messages", messagesHandler.HandleMessages) + mux.Handle("/v1/messages", handlers.NewAnthropicFirstHandler(atomic, http.HandlerFunc(messagesHandler.HandleMessages))) mux.HandleFunc("/v1/messages/count_tokens", healthHandler.HandleCountTokens) mux.HandleFunc("/health", healthHandler.HandleHealth) mux.HandleFunc("/statusline", healthHandler.HandleStatusline) From ef6bfefbd1c5b6cfe610d11640557305a10b625a Mon Sep 17 00:00:00 2001 From: samuel tuyizere Date: Tue, 30 Jun 2026 10:21:17 +0200 Subject: [PATCH 2/3] fix: improve availability gate handling and response streaming --- internal/handlers/anthropic_first.go | 34 ++++++++++++++++++----- internal/handlers/anthropic_first_test.go | 6 +++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/internal/handlers/anthropic_first.go b/internal/handlers/anthropic_first.go index c774702..01e0b8f 100644 --- a/internal/handlers/anthropic_first.go +++ b/internal/handlers/anthropic_first.go @@ -2,6 +2,7 @@ package handlers import ( "bytes" + "context" "encoding/json" "io" "log/slog" @@ -117,14 +118,18 @@ func (g *availabilityGate) resetLocked(baseURL string) { func parseRetryAfter(value string, now time.Time) (time.Duration, bool) { if seconds, err := strconv.Atoi(strings.TrimSpace(value)); err == nil && seconds >= 0 { - return time.Duration(seconds) * time.Second, true + delay := time.Duration(seconds) * time.Second + if delay < time.Second { + delay = time.Second + } + return delay, true } when, err := http.ParseTime(value) if err != nil { return 0, false } if when.Before(now) { - return 0, true + return time.Second, true } return when.Sub(now), true } @@ -208,7 +213,7 @@ func (h *AnthropicFirstHandler) ServeHTTP(w http.ResponseWriter, r *http.Request h.logger.Debug("Anthropic request succeeded", "status", resp.StatusCode) copyHeader(w.Header(), resp.Header) w.WriteHeader(resp.StatusCode) - copyStreamingResponse(w, resp.Body) + copyStreamingResponse(r.Context(), w, resp.Body) } func newAnthropicRequest(in *http.Request, baseURL string, body []byte) (*http.Request, error) { @@ -239,8 +244,9 @@ func (h *AnthropicFirstHandler) serveFallback(w http.ResponseWriter, r *http.Req } func copyHeader(dst, src http.Header) { - removeHopHeaders(src) - for key, values := range src { + dstClone := src.Clone() + removeHopHeaders(dstClone) + for key, values := range dstClone { dst[key] = append([]string(nil), values...) } } @@ -259,9 +265,23 @@ func removeHopHeaders(header http.Header) { } } -func copyStreamingResponse(w http.ResponseWriter, body io.Reader) { - buf := make([]byte, 32<<10) +var streamBufPool = sync.Pool{ + New: func() any { + b := make([]byte, 32<<10) + return &b + }, +} + +func copyStreamingResponse(ctx context.Context, w http.ResponseWriter, body io.Reader) { + bufPtr := streamBufPool.Get().(*[]byte) + defer streamBufPool.Put(bufPtr) + buf := *bufPtr for { + select { + case <-ctx.Done(): + return + default: + } n, err := body.Read(buf) if n > 0 { if _, writeErr := w.Write(buf[:n]); writeErr != nil { diff --git a/internal/handlers/anthropic_first_test.go b/internal/handlers/anthropic_first_test.go index 751ac26..7e839b9 100644 --- a/internal/handlers/anthropic_first_test.go +++ b/internal/handlers/anthropic_first_test.go @@ -202,13 +202,17 @@ func TestAvailabilityGateConcurrentProbe(t *testing.T) { attempt, _ := gate.allow(now, "upstream") gate.failed(now, attempt, "0") + // After failed("0"), the gate is unavailable with nextProbe = now + 1s (minimum floor). + // Wait until the probe window opens. + probeTime := now.Add(1100 * time.Millisecond) + var allowed atomic.Int32 var wg sync.WaitGroup for range 20 { wg.Add(1) go func() { defer wg.Done() - if _, ok := gate.allow(now, "upstream"); ok { + if _, ok := gate.allow(probeTime, "upstream"); ok { allowed.Add(1) } }() From b6a7f5cdbb13e854c78722ce10dca372d80851e4 Mon Sep 17 00:00:00 2001 From: samuel tuyizere Date: Tue, 30 Jun 2026 11:01:10 +0200 Subject: [PATCH 3/3] fix: simplify retry delay logic in availability gate and adjust test probe timing --- internal/handlers/anthropic_first.go | 8 ++------ internal/handlers/anthropic_first_test.go | 5 ++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/internal/handlers/anthropic_first.go b/internal/handlers/anthropic_first.go index 01e0b8f..6957c87 100644 --- a/internal/handlers/anthropic_first.go +++ b/internal/handlers/anthropic_first.go @@ -118,18 +118,14 @@ func (g *availabilityGate) resetLocked(baseURL string) { func parseRetryAfter(value string, now time.Time) (time.Duration, bool) { if seconds, err := strconv.Atoi(strings.TrimSpace(value)); err == nil && seconds >= 0 { - delay := time.Duration(seconds) * time.Second - if delay < time.Second { - delay = time.Second - } - return delay, true + return time.Duration(seconds) * time.Second, true } when, err := http.ParseTime(value) if err != nil { return 0, false } if when.Before(now) { - return time.Second, true + return 0, true } return when.Sub(now), true } diff --git a/internal/handlers/anthropic_first_test.go b/internal/handlers/anthropic_first_test.go index 7e839b9..b9caa1f 100644 --- a/internal/handlers/anthropic_first_test.go +++ b/internal/handlers/anthropic_first_test.go @@ -202,9 +202,8 @@ func TestAvailabilityGateConcurrentProbe(t *testing.T) { attempt, _ := gate.allow(now, "upstream") gate.failed(now, attempt, "0") - // After failed("0"), the gate is unavailable with nextProbe = now + 1s (minimum floor). - // Wait until the probe window opens. - probeTime := now.Add(1100 * time.Millisecond) + // Retry-After: 0 means the probe window opens immediately. + probeTime := now.Add(50 * time.Millisecond) var allowed atomic.Int32 var wg sync.WaitGroup