Skip to content

Commit 8a2fe11

Browse files
hydropix, KeinNiemand, claude
committed
Merge branch 'benchmark_openai' - Add OpenAI-compatible provider support (#114)
Co-Authored-By: KeinNiemand <KeinNiemand@users.noreply.github.com> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2 parents a1d59dc + 735f609 commit 8a2fe11

5 files changed

Lines changed: 262 additions & 27 deletions

File tree

benchmark/cli.py

Lines changed: 73 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,11 @@
1616
from benchmark.runner import BenchmarkRunner, quick_benchmark, full_benchmark
1717
from benchmark.results.storage import ResultsStorage
1818
from benchmark.wiki.generator import WikiGenerator
19-
from benchmark.translator import get_available_ollama_models, get_available_openrouter_models
19+
from benchmark.translator import (
20+
get_available_ollama_models,
21+
get_available_openrouter_models,
22+
get_available_openai_models,
23+
)
2024

2125

2226
# ANSI color codes for terminal output
@@ -75,6 +79,8 @@ def cmd_run(args: argparse.Namespace) -> int:
7579
evaluator_provider = getattr(args, 'evaluator_provider', DEFAULT_EVALUATOR_PROVIDER)
7680
config = BenchmarkConfig.from_cli_args(
7781
openrouter_key=args.openrouter_key,
82+
openai_key=args.openai_key,
83+
openai_endpoint=args.openai_endpoint,
7884
poe_key=args.poe_key,
7985
evaluator_model=args.evaluator,
8086
ollama_endpoint=args.ollama_endpoint,
@@ -101,6 +107,14 @@ def cmd_run(args: argparse.Namespace) -> int:
101107
# Extract model IDs
102108
models = [m["id"] if isinstance(m, dict) else m for m in models_data[:10]]
103109
print(colored(f"Found {len(models_data)} models. Using top 10: {', '.join(models[:3])}...", Colors.GREEN))
110+
elif provider == "openai":
111+
print(colored("Fetching available OpenAI-compatible models...", Colors.CYAN))
112+
models_data = asyncio.run(get_available_openai_models(config))
113+
if not models_data:
114+
log_callback("error", "No OpenAI-compatible models available.")
115+
return 1
116+
models = [m["id"] if isinstance(m, dict) else m for m in models_data[:10]]
117+
print(colored(f"Found {len(models_data)} models. Using top 10: {', '.join(models[:3])}...", Colors.GREEN))
104118
else:
105119
print(colored("Detecting available Ollama models...", Colors.CYAN))
106120
models = asyncio.run(get_available_ollama_models(config))
@@ -288,7 +302,12 @@ def cmd_models(args: argparse.Namespace) -> int:
288302
"""List available models for benchmarking."""
289303
print_banner()
290304

291-
config = BenchmarkConfig.from_cli_args(openrouter_key=args.openrouter_key)
305+
config = BenchmarkConfig.from_cli_args(
306+
openrouter_key=args.openrouter_key,
307+
openai_key=args.openai_key,
308+
openai_endpoint=args.openai_endpoint,
309+
translation_provider=args.provider,
310+
)
292311
provider = args.provider
293312

294313
if provider == "openrouter":
@@ -322,6 +341,32 @@ def cmd_models(args: argparse.Namespace) -> int:
322341
print(colored("Tip: Use -m to specify models, e.g.:", Colors.YELLOW))
323342
print(" python -m benchmark.cli run -p openrouter -m anthropic/claude-sonnet-4 openai/gpt-4o")
324343

344+
elif provider == "openai":
345+
print(colored("Fetching OpenAI-compatible models...\n", Colors.CYAN))
346+
models = asyncio.run(get_available_openai_models(config))
347+
348+
if not models:
349+
log_callback("error", "Failed to fetch OpenAI-compatible models")
350+
return 1
351+
352+
print(colored(f"Available OpenAI-Compatible Models ({len(models)}):\n", Colors.BOLD))
353+
print(f"{'Model ID':<50} {'Owner':<20}")
354+
print("-" * 72)
355+
356+
for model in models[:50]:
357+
if isinstance(model, dict):
358+
model_id = model.get("id", "unknown")
359+
owned_by = model.get("owned_by", "unknown")
360+
else:
361+
model_id = model
362+
owned_by = "unknown"
363+
364+
print(f"{model_id:<50} {owned_by:<20}")
365+
366+
print()
367+
print(colored("Tip: Use -m and --openai-endpoint to specify a backend, e.g.:", Colors.YELLOW))
368+
print(" python -m benchmark.cli run -p openai --openai-endpoint http://localhost:8080/v1 -m your-model")
369+
325370
else:
326371
print(colored("Detecting Ollama models...\n", Colors.CYAN))
327372
models = asyncio.run(get_available_ollama_models(config))
@@ -566,6 +611,9 @@ def create_parser() -> argparse.ArgumentParser:
566611
# Quick benchmark with Ollama (local models)
567612
python -m benchmark.cli run --openrouter-key YOUR_KEY
568613
614+
# Quick benchmark with an OpenAI-compatible backend
615+
python -m benchmark.cli run --provider openai --openai-endpoint http://localhost:8080/v1 -m your-model
616+
569617
# Quick benchmark with OpenRouter (cloud models)
570618
python -m benchmark.cli run --provider openrouter --openrouter-key YOUR_KEY
571619
@@ -578,6 +626,9 @@ def create_parser() -> argparse.ArgumentParser:
578626
# Specific OpenRouter models
579627
python -m benchmark.cli run -p openrouter -m anthropic/claude-sonnet-4 openai/gpt-4o -l fr de ja
580628
629+
# Specific OpenAI-compatible backend and models
630+
python -m benchmark.cli run -p openai --openai-endpoint http://localhost:8080/v1 -m qwen2.5-14b-instruct
631+
581632
# Generate wiki pages
582633
python -m benchmark.cli wiki
583634
@@ -594,6 +645,7 @@ def create_parser() -> argparse.ArgumentParser:
594645
"-m", "--models",
595646
nargs="+",
596647
help="Models to benchmark. For Ollama: model names (e.g., llama3:8b). "
648+
"For OpenAI-compatible backends: model IDs (e.g., gpt-4o or local server model names). "
597649
"For OpenRouter: model IDs (e.g., anthropic/claude-sonnet-4). "
598650
"If not specified, auto-detects available models."
599651
)
@@ -609,9 +661,17 @@ def create_parser() -> argparse.ArgumentParser:
609661
)
610662
run_parser.add_argument(
611663
"-p", "--provider",
612-
choices=["ollama", "openrouter"],
664+
choices=["ollama", "openai", "openrouter"],
613665
default="ollama",
614-
help="Translation provider: 'ollama' (local, default) or 'openrouter' (cloud, 200+ models)"
666+
help="Translation provider: 'ollama' (local, default), 'openai' (OpenAI-compatible), or 'openrouter' (cloud, 200+ models)"
667+
)
668+
run_parser.add_argument(
669+
"--openai-key",
670+
help="API key for OpenAI-compatible translation backends. Can also be set via OPENAI_API_KEY env var."
671+
)
672+
run_parser.add_argument(
673+
"--openai-endpoint",
674+
help="OpenAI-compatible chat completions endpoint or /v1 base URL. Can also be set via OPENAI_API_ENDPOINT env var."
615675
)
616676
run_parser.add_argument(
617677
"--openrouter-key",
@@ -696,10 +756,18 @@ def create_parser() -> argparse.ArgumentParser:
696756
models_parser = subparsers.add_parser("models", help="List available models for benchmarking")
697757
models_parser.add_argument(
698758
"-p", "--provider",
699-
choices=["ollama", "openrouter"],
759+
choices=["ollama", "openai", "openrouter"],
700760
default="ollama",
701761
help="Provider to list models for (default: ollama)"
702762
)
763+
models_parser.add_argument(
764+
"--openai-key",
765+
help="API key for listing models from an OpenAI-compatible endpoint"
766+
)
767+
models_parser.add_argument(
768+
"--openai-endpoint",
769+
help="OpenAI-compatible endpoint to query for available models"
770+
)
703771
models_parser.add_argument(
704772
"--openrouter-key",
705773
help="OpenRouter API key (required for listing OpenRouter models)"

benchmark/config.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Benchmark configuration module.
33
44
Defines configuration settings for the benchmark system including:
5-
- Ollama settings for translation
5+
- Ollama/OpenAI-compatible settings for translation
66
- OpenRouter settings for evaluation
77
- File paths and defaults
88
"""
@@ -99,6 +99,30 @@ class OpenRouterConfig:
9999
site_name: str = "TranslateBookWithLLM Benchmark"
100100

101101

102+
@dataclass
103+
class OpenAICompatibleConfig:
104+
"""Configuration for OpenAI-compatible translation provider."""
105+
106+
api_key: Optional[str] = field(
107+
default_factory=lambda: os.getenv("OPENAI_API_KEY")
108+
)
109+
endpoint: str = field(
110+
default_factory=lambda: os.getenv(
111+
"OPENAI_API_ENDPOINT",
112+
"https://api.openai.com/v1/chat/completions"
113+
)
114+
)
115+
default_model: str = field(
116+
default_factory=lambda: os.getenv("OPENAI_MODEL", "gpt-4o-mini")
117+
)
118+
context_window: int = field(
119+
default_factory=lambda: int(os.getenv("OPENAI_NUM_CTX", os.getenv("OLLAMA_NUM_CTX", "2048")))
120+
)
121+
timeout: int = field(
122+
default_factory=lambda: int(os.getenv("OPENAI_REQUEST_TIMEOUT", os.getenv("REQUEST_TIMEOUT", "900")))
123+
)
124+
125+
102126
@dataclass
103127
class PoeConfig:
104128
"""Configuration for Poe evaluation provider."""
@@ -155,6 +179,7 @@ class BenchmarkConfig:
155179
"""Main benchmark configuration aggregating all sub-configs."""
156180

157181
ollama: OllamaConfig = field(default_factory=OllamaConfig)
182+
openai: OpenAICompatibleConfig = field(default_factory=OpenAICompatibleConfig)
158183
openrouter: OpenRouterConfig = field(default_factory=OpenRouterConfig)
159184
poe: PoeConfig = field(default_factory=PoeConfig)
160185
paths: PathConfig = field(default_factory=PathConfig)
@@ -163,7 +188,7 @@ class BenchmarkConfig:
163188
source_language: str = "English"
164189
quick_languages: list = field(default_factory=lambda: DEFAULT_QUICK_LANGUAGES.copy())
165190

166-
# Translation provider ("ollama" or "openrouter")
191+
# Translation provider ("ollama", "openai", or "openrouter")
167192
translation_provider: str = "ollama"
168193

169194
# Evaluator provider ("openrouter" or "poe")
@@ -182,6 +207,8 @@ def from_env(cls) -> "BenchmarkConfig":
182207
def from_cli_args(
183208
cls,
184209
openrouter_key: Optional[str] = None,
210+
openai_key: Optional[str] = None,
211+
openai_endpoint: Optional[str] = None,
185212
evaluator_model: Optional[str] = None,
186213
ollama_endpoint: Optional[str] = None,
187214
translation_provider: Optional[str] = None,
@@ -195,6 +222,9 @@ def from_cli_args(
195222
if openrouter_key:
196223
config.openrouter.api_key = openrouter_key
197224

225+
if openai_key:
226+
config.openai.api_key = openai_key
227+
198228
if poe_key:
199229
config.poe.api_key = poe_key
200230

@@ -205,6 +235,9 @@ def from_cli_args(
205235
if ollama_endpoint:
206236
config.ollama.endpoint = ollama_endpoint
207237

238+
if openai_endpoint:
239+
config.openai.endpoint = openai_endpoint
240+
208241
if translation_provider:
209242
config.translation_provider = translation_provider.lower()
210243

@@ -245,17 +278,23 @@ def validate(self) -> list[str]:
245278
"Set OPENROUTER_API_KEY in .env or use --openrouter-key"
246279
)
247280

281+
if self.translation_provider == "openai" and not self.openai.endpoint:
282+
errors.append(
283+
"OpenAI-compatible endpoint not configured. Required for translation. "
284+
"Set OPENAI_API_ENDPOINT in .env or use --openai-endpoint"
285+
)
286+
248287
if not self.paths.languages_file.exists():
249288
errors.append(f"Languages file not found: {self.paths.languages_file}")
250289

251290
if not self.paths.reference_texts_file.exists():
252291
errors.append(f"Reference texts file not found: {self.paths.reference_texts_file}")
253292

254293
# Validate translation provider
255-
if self.translation_provider not in ("ollama", "openrouter"):
294+
if self.translation_provider not in ("ollama", "openai", "openrouter"):
256295
errors.append(
257296
f"Invalid translation provider: {self.translation_provider}. "
258-
"Must be 'ollama' or 'openrouter'"
297+
"Must be 'ollama', 'openai', or 'openrouter'"
259298
)
260299

261300
return errors

benchmark/runner.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
44
Coordinates the complete benchmark workflow:
55
1. Load languages and reference texts
6-
2. Run translations with specified Ollama models
6+
2. Run translations with specified provider models
77
3. Evaluate translations with OpenRouter
88
4. Track progress and handle resumption
99
5. Generate results
@@ -25,6 +25,7 @@
2525
from benchmark.translator import (
2626
BenchmarkTranslator, TranslationRequest,
2727
test_ollama_connection, get_available_ollama_models,
28+
test_openai_translation_connection, get_available_openai_models,
2829
test_openrouter_translation_connection, get_available_openrouter_models
2930
)
3031
from benchmark.evaluator import (
@@ -206,6 +207,12 @@ async def validate_setup(self) -> tuple[bool, list[str]]:
206207
errors.append(f"OpenRouter (translation): {or_trans_msg}")
207208
else:
208209
self._log("info", f"OpenRouter (translation): {or_trans_msg}")
210+
elif self.config.translation_provider == "openai":
211+
openai_ok, openai_msg = await test_openai_translation_connection(self.config)
212+
if not openai_ok:
213+
errors.append(f"OpenAI-compatible (translation): {openai_msg}")
214+
else:
215+
self._log("info", f"OpenAI-compatible (translation): {openai_msg}")
209216
else:
210217
# Test Ollama connection
211218
ollama_ok, ollama_msg = await test_ollama_connection(self.config)
@@ -241,7 +248,7 @@ def _generate_jobs(
241248
Generate translation jobs, skipping already completed ones.
242249
243250
Args:
244-
models: List of Ollama model names
251+
models: List of provider model names
245252
languages: List of target languages
246253
texts: List of reference texts
247254
existing_results: Results from a previous run (for resumption)
@@ -280,7 +287,7 @@ async def run(
280287
Execute a complete benchmark run.
281288
282289
Args:
283-
models: List of Ollama model names to benchmark
290+
models: List of provider model names to benchmark
284291
language_codes: Language codes to test (None = quick test set)
285292
resume_run: Optional previous run to resume
286293
@@ -432,7 +439,7 @@ async def quick_benchmark(
432439
433440
Args:
434441
config: Benchmark configuration
435-
models: Optional list of models (defaults to available Ollama models)
442+
models: Optional list of models (defaults to auto-detected provider models)
436443
log_callback: Optional logging callback
437444
438445
Returns:
@@ -447,9 +454,16 @@ async def quick_benchmark(
447454

448455
# Get models if not specified
449456
if models is None:
450-
models = await get_available_ollama_models(config)
457+
if config.translation_provider == "openrouter":
458+
provider_models = await get_available_openrouter_models(config)
459+
models = [m["id"] if isinstance(m, dict) else m for m in provider_models]
460+
elif config.translation_provider == "openai":
461+
provider_models = await get_available_openai_models(config)
462+
models = [m["id"] if isinstance(m, dict) else m for m in provider_models]
463+
else:
464+
models = await get_available_ollama_models(config)
451465
if not models:
452-
raise RuntimeError("No Ollama models available")
466+
raise RuntimeError(f"No {config.translation_provider} models available")
453467
# Limit to first 3 models for quick benchmark
454468
models = models[:3]
455469

@@ -466,7 +480,7 @@ async def full_benchmark(
466480
467481
Args:
468482
config: Benchmark configuration
469-
models: List of Ollama models to benchmark
483+
models: List of provider models to benchmark
470484
log_callback: Optional logging callback
471485
472486
Returns:

0 commit comments

Comments
 (0)