@@ -532,9 +532,10 @@ int main(int argc, char** argv) {
532532 "quant-server-unified — OpenAI-compatible server (quant.h unified build)\n\n"
533533 "Usage: %s <model.gguf> [options]\n\n"
534534 "Options:\n"
535- " -p <port> Listen port (default: 8080)\n"
536- " -j <threads> Threads per inference (default: 4)\n"
537- " --help Show this help\n\n"
535+ " -p <port> Listen port (default: 8080)\n"
536+ " -j <threads> Threads per inference (default: 4)\n"
537+ " --template T Chat template: chatml (default), phi3, gemma\n"
538+ " --help Show this help\n\n"
538539 "Example:\n"
539540 " %s model.gguf -p 8080 -j 8\n"
540541 " curl http://localhost:8080/v1/chat/completions \\\n"
@@ -584,18 +585,37 @@ int main(int argc, char** argv) {
584585 return 1 ;
585586 }
586587
587- /* Detect model architecture for chat template selection.
588- * Check model filename for architecture hints. */
589- int template_type = TMPL_CHATML ; /* default */
588+ /* Detect chat template from filename or --template flag.
589+ * Supports: chatml (default), phi3, gemma.
590+ * #86: auto-detection covers Phi-3/3.5/4, Gemma 2/3/4. */
591+ int template_type = TMPL_CHATML ;
590592 const char * bn = strrchr (model_path , '/' );
591593 bn = bn ? bn + 1 : model_path ;
592- if (strstr (bn , "hi-3" ) || strstr (bn , "hi3" ) || strstr (bn , "Hi-3" ) || strstr (bn , "Hi3" ) ||
593- strstr (bn , "phi-3" ) || strstr (bn , "phi3" ) || strstr (bn , "Phi-3" ) || strstr (bn , "Phi3" )) {
594- template_type = TMPL_PHI3 ;
595- fprintf (stderr , "Detected Phi-3 model — using Phi-3 chat template\n" );
596- } else if (strstr (bn , "gemma" ) || strstr (bn , "Gemma" )) {
597- template_type = TMPL_GEMMA ;
598- fprintf (stderr , "Detected Gemma model — using Gemma chat template\n" );
594+
595+ /* Check --template CLI override first */
596+ for (int i = 2 ; i < argc ; i ++ ) {
597+ if (strcmp (argv [i ], "--template" ) == 0 && i + 1 < argc ) {
598+ const char * t = argv [++ i ];
599+ if (strcmp (t , "phi3" ) == 0 ) template_type = TMPL_PHI3 ;
600+ else if (strcmp (t , "gemma" ) == 0 ) template_type = TMPL_GEMMA ;
601+ else if (strcmp (t , "chatml" ) == 0 ) template_type = TMPL_CHATML ;
602+ fprintf (stderr , "Chat template: %s (--template override)\n" , t );
603+ }
604+ }
605+
606+ /* Auto-detect from filename if no override */
607+ if (template_type == TMPL_CHATML ) {
608+ /* Phi family: Phi-3, Phi-3.5, Phi-4 all use <|user|>...<|end|> */
609+ if (strstr (bn , "phi-3" ) || strstr (bn , "phi3" ) || strstr (bn , "Phi-3" ) || strstr (bn , "Phi3" ) ||
610+ strstr (bn , "phi-4" ) || strstr (bn , "phi4" ) || strstr (bn , "Phi-4" ) || strstr (bn , "Phi4" )) {
611+ template_type = TMPL_PHI3 ;
612+ fprintf (stderr , "Detected Phi model — using Phi chat template\n" );
613+ }
614+ /* Gemma family */
615+ else if (strstr (bn , "gemma" ) || strstr (bn , "Gemma" )) {
616+ template_type = TMPL_GEMMA ;
617+ fprintf (stderr , "Detected Gemma model — using Gemma chat template\n" );
618+ }
599619 }
600620 int has_fused_qkv = (template_type == TMPL_PHI3 ) ? 1 : 0 ;
601621