@@ -67,6 +67,14 @@ static tq_type parse_kv_type(const char* s) {
6767 return TQ_TYPE_UNIFORM_4B ;
6868}
6969
/* Version string reported by the --version flag. */
#define QUANT_VERSION "0.2.0"

/**
 * Print the version banner to stdout.
 *
 * Emits three lines: the program name followed by QUANT_VERSION,
 * a one-line description, and the project URL.
 */
static void print_version(void) {
    /* Adjacent string literals are merged by the compiler, so the whole
     * banner goes out through a single formatted write. */
    printf("quant.cpp v%s\n"
           "Embeddable LLM inference in pure C\n"
           "https://github.com/quantumaikr/quant.cpp\n",
           QUANT_VERSION);
}
77+
7078static void print_usage (const char * prog ) {
7179 fprintf (stderr , "quant — Minimal C inference engine. Zero dependencies.\n" );
7280 fprintf (stderr , "Usage: %s <model.safetensors> [options]\n\n" , prog );
@@ -97,6 +105,8 @@ static void print_usage(const char* prog) {
97105 fprintf (stderr , " --ctx <N> Override max context length (default: 4096)\n" );
98106 fprintf (stderr , " --delta, -D Enable delta KV compression (store key deltas)\n" );
99107 fprintf (stderr , " --k-window <N> Age-based K: recent N tokens FP32, rest quantized\n" );
108+ fprintf (stderr , " --version Print version and exit\n" );
109+ fprintf (stderr , " --json JSON output for --ppl (machine-parseable)\n" );
100110}
101111
102112int main (int argc , char * * argv ) {
@@ -130,6 +140,7 @@ int main(int argc, char** argv) {
130140 int delta_kv = 0 ; /* 1 = delta KV compression (store key deltas) */
131141 int delta_iframe_int = 0 ; /* I-frame interval for delta KV (0 = auto = 64) */
132142 int k_highres_window = 0 ; /* age-based: recent N keys at FP32, rest at 2-bit */
143+ int json_output = 0 ; /* 1 = JSON output for --ppl */
133144
134145 for (int i = 1 ; i < argc ; i ++ ) {
135146 if (argv [i ][0 ] != '-' ) {
@@ -214,6 +225,11 @@ int main(int argc, char** argv) {
214225 delta_iframe_int = atoi (argv [++ i ]);
215226 } else if (strcmp (argv [i ], "--k-window" ) == 0 && i + 1 < argc ) {
216227 k_highres_window = atoi (argv [++ i ]);
228+ } else if (strcmp (argv [i ], "--version" ) == 0 ) {
229+ print_version ();
230+ return 0 ;
231+ } else if (strcmp (argv [i ], "--json" ) == 0 ) {
232+ json_output = 1 ;
217233 } else if (strcmp (argv [i ], "-h" ) == 0 || strcmp (argv [i ], "--help" ) == 0 ) {
218234 print_usage (argv [0 ]);
219235 return 0 ;
@@ -452,6 +468,25 @@ int main(int argc, char** argv) {
452468 /* Machine-parseable */
453469 fprintf (stderr , "PPL_CSV:%d,%.6f,%.4f\n" , n_eval , avg_nll , perplexity );
454470
471+ /* JSON output (--json flag) */
472+ if (json_output ) {
473+ const char * kv_name = kv_type < TQ_TYPE_COUNT ? tq_type_name (kv_type ) : "fp32" ;
474+ const char * v_name = value_quant_bits == 4 ? "q4" : (value_quant_bits == 2 ? "q2" : "fp16" );
475+ printf ("{\n" );
476+ printf (" \"model\": \"%s\",\n" , model_path );
477+ printf (" \"benchmark\": \"%s\",\n" , ppl_file );
478+ printf (" \"tokens\": %d,\n" , n_tokens );
479+ printf (" \"tokens_evaluated\": %d,\n" , n_eval );
480+ printf (" \"kv_type\": \"%s\",\n" , kv_name );
481+ printf (" \"v_quant\": \"%s\",\n" , v_name );
482+ printf (" \"delta_kv\": %s,\n" , delta_kv ? "true" : "false" );
483+ printf (" \"perplexity\": %.4f,\n" , perplexity );
484+ printf (" \"avg_nll\": %.6f,\n" , avg_nll );
485+ printf (" \"elapsed_s\": %.2f,\n" , ppl_elapsed );
486+ printf (" \"tok_per_s\": %.1f\n" , (double )n_eval / ppl_elapsed );
487+ printf ("}\n" );
488+ }
489+
455490 tq_free_state (state );
456491 free (tokens );
457492 tq_free_tokenizer (tok );
0 commit comments