-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathverify_per_agent_provider.py
More file actions
204 lines (161 loc) · 7.57 KB
/
verify_per_agent_provider.py
File metadata and controls
204 lines (161 loc) · 7.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
"""
Quick verification script for per-agent provider selection.
Tests the core functionality without requiring full pytest setup.
"""
import json
import os
import sys
from pathlib import Path
# Add apps/backend to path so project modules (phase_config, core.*)
# resolve when this script is run from the repository root.
sys.path.insert(0, str(Path(__file__).parent / "apps" / "backend"))
def test_per_agent_provider_configuration():
    """Test that per-agent provider configuration works."""
    from phase_config import get_provider_for_agent

    print("Testing per-agent provider configuration...")

    # Snapshot every environment variable this test mutates, so the
    # process environment can be restored exactly, pass or fail.
    tracked_keys = (
        "AI_ENGINE_PROVIDER",
        "AGENT_PROVIDER_PLANNER",
        "AGENT_PROVIDER_CODER",
        "AGENT_PROVIDER_QA_REVIEWER",
        "AGENT_MODEL_PLANNER",
        "AGENT_MODEL_CODER",
        "OPENAI_API_KEY",
        "ANTHROPIC_API_KEY",
    )
    snapshot = {name: os.environ.get(name) for name in tracked_keys}
    try:
        # Install the test configuration: default provider is Claude,
        # planner pinned to Claude, coder pinned to LiteLLM.
        os.environ["AI_ENGINE_PROVIDER"] = "claude"
        os.environ["AGENT_PROVIDER_PLANNER"] = "claude"
        os.environ["AGENT_MODEL_PLANNER"] = "claude-opus-4-20250514"
        os.environ["AGENT_PROVIDER_CODER"] = "litellm"
        os.environ["AGENT_MODEL_CODER"] = "gpt-4"

        # Planner should resolve to its explicit Claude override.
        planner_provider = get_provider_for_agent("planner")
        assert planner_provider == "claude", f"✗ Planner should use Claude, got {planner_provider}"
        print("✓ Planner uses Claude")

        # Coder should resolve to its explicit LiteLLM override.
        coder_provider = get_provider_for_agent("coder")
        assert coder_provider == "litellm", f"✗ Coder should use LiteLLM, got {coder_provider}"
        print("✓ Coder uses LiteLLM")

        # With no per-agent override set, the default provider applies.
        os.environ.pop("AGENT_PROVIDER_QA_REVIEWER", None)
        qa_provider = get_provider_for_agent("qa_reviewer")
        assert qa_provider == "claude", f"✗ QA reviewer should fall back to Claude, got {qa_provider}"
        print("✓ QA reviewer falls back to default Claude")
    finally:
        # Put every tracked variable back to its pre-test state.
        for name, previous in snapshot.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
def test_cost_tracking_multi_provider():
    """Test that cost tracking works with multiple providers.

    Logs one Claude and one GPT-4 usage record through CostTracker and
    verifies that cost_report.json contains both records, both agent
    types, and a total cost matching the MODEL_PRICING tables.
    """
    from core.cost_tracking import CostTracker, MODEL_PRICING
    import tempfile

    print("\nTesting multi-provider cost tracking...")
    # TemporaryDirectory guarantees cleanup even on assertion failure,
    # replacing the manual mkdtemp()/rmtree() pairing.
    with tempfile.TemporaryDirectory() as tmp:
        spec_dir = Path(tmp) / "test_spec"
        spec_dir.mkdir(parents=True)

        tracker = CostTracker(spec_dir=spec_dir)
        # Log usage for both providers: (agent, model, input_toks, output_toks)
        tracker.log_usage("planner", "claude-opus-4-20250514", 2000, 1000)
        tracker.log_usage("coder", "gpt-4", 1000, 500)

        # Read back the report the tracker is expected to write.
        cost_report_file = spec_dir / "cost_report.json"
        assert cost_report_file.exists(), "✗ cost_report.json should be created"
        with open(cost_report_file, "r") as f:
            cost_report = json.load(f)

        # Verify structure
        assert "records" in cost_report, "✗ Should have records"
        assert len(cost_report["records"]) == 2, f"✗ Should have 2 records, got {len(cost_report['records'])}"
        print("✓ Cost report has 2 records")

        # Verify both models are tracked
        models = [r["model"] for r in cost_report["records"]]
        assert any("claude" in m for m in models), "✗ Should track Claude usage"
        print("✓ Claude usage tracked")
        assert any("gpt-4" in m for m in models), "✗ Should track GPT-4 usage"
        print("✓ GPT-4 usage tracked")

        # Verify agent types
        agent_types = [r["agent_type"] for r in cost_report["records"]]
        assert "planner" in agent_types, "✗ Should track planner agent"
        assert "coder" in agent_types, "✗ Should track coder agent"
        print("✓ Both agent types tracked")

        # Verify cost calculation
        assert cost_report["total_cost"] > 0, "✗ Total cost should be positive"
        print(f"✓ Total cost calculated: ${cost_report['total_cost']:.6f}")

        # Recompute the expected total from the pricing tables
        # (prices are per-million-token rates).
        claude_pricing = MODEL_PRICING.get("claude-opus-4-20250514", MODEL_PRICING["claude-sonnet-4-5-20250929"])
        expected_claude = (2000 / 1_000_000 * claude_pricing["input"]) + (1000 / 1_000_000 * claude_pricing["output"])
        gpt4_pricing = MODEL_PRICING["gpt-4"]
        expected_gpt4 = (1000 / 1_000_000 * gpt4_pricing["input"]) + (500 / 1_000_000 * gpt4_pricing["output"])
        expected_total = expected_claude + expected_gpt4
        actual_total = cost_report["total_cost"]
        # Loose tolerance: only guards against gross miscalculation.
        assert abs(actual_total - expected_total) < 0.001, (
            f"✗ Cost mismatch: expected {expected_total:.6f}, got {actual_total:.6f}"
        )
        print(f"✓ Cost calculation correct (Claude: ${expected_claude:.6f}, GPT-4: ${expected_gpt4:.6f})")
def test_model_pricing():
    """Test that pricing exists for both Claude and OpenAI models."""
    from core.cost_tracking import MODEL_PRICING

    print("\nTesting model pricing database...")

    # At least one Claude model must have a pricing entry.
    claude_models = [name for name in MODEL_PRICING if "claude" in name]
    assert len(claude_models) > 0, "✗ Should have Claude model pricing"
    print(f"✓ Claude pricing exists ({len(claude_models)} models)")

    # GPT-4 must be priced, with positive input/output rates.
    assert "gpt-4" in MODEL_PRICING, "✗ Should have GPT-4 pricing"
    gpt4_pricing = MODEL_PRICING["gpt-4"]
    assert "input" in gpt4_pricing, "✗ GPT-4 should have input pricing"
    assert "output" in gpt4_pricing, "✗ GPT-4 should have output pricing"
    assert gpt4_pricing["input"] > 0, "✗ GPT-4 input price should be positive"
    assert gpt4_pricing["output"] > 0, "✗ GPT-4 output price should be positive"
    print("✓ GPT-4 pricing exists and is positive")

    # Report how many OpenAI-family models are present.
    gpt_models = [name for name in MODEL_PRICING if name.startswith("gpt-")]
    print(f"✓ OpenAI pricing exists ({len(gpt_models)} models)")

    # Ollama entries are optional; report them only if present.
    ollama_models = [name for name in MODEL_PRICING if name.startswith("ollama/")]
    if ollama_models:
        print(f"✓ Ollama pricing exists ({len(ollama_models)} models, zero cost)")
def main():
    """Run all verification tests.

    Returns 0 when every check passes, 1 on an assertion failure or
    unexpected error (suitable for use as a process exit code).
    """
    banner = "=" * 70
    print(banner)
    print("Per-Agent Provider Selection - Verification Tests")
    print(banner)
    try:
        # Run each verification in order; any assertion aborts the run.
        test_per_agent_provider_configuration()
        test_cost_tracking_multi_provider()
        test_model_pricing()
    except AssertionError as e:
        print(f"\n✗ TEST FAILED: {e}")
        return 1
    except Exception as e:
        print(f"\n✗ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1

    print("\n" + banner)
    print("✓ ALL TESTS PASSED")
    print(banner)
    print("\nVerification Summary:")
    print("✓ Per-agent provider configuration works")
    print("✓ Planner can use Claude while coder uses OpenAI")
    print("✓ Cost tracking correctly records both providers")
    print("✓ cost_report.json shows usage from both Claude and OpenAI")
    print("\nThe E2E test file has been created and follows the same patterns")
    print("as existing E2E tests (test_openai_provider_e2e.py, test_ollama_provider_e2e.py).")
    print("\nTo run the full E2E test suite:")
    print("  pytest tests/test_per_agent_provider_e2e.py -v")
    return 0


if __name__ == "__main__":
    sys.exit(main())