Skip to content

Commit 673be25

Browse files
committed
update specs for rl training checkpoint save/resume
1 parent 152ef85 commit 673be25

1 file changed

Lines changed: 111 additions & 3 deletions

File tree

openapi.yaml

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7492,6 +7492,64 @@ paths:
74927492
schema:
74937493
description: Operation ID
74947494
type: string
7495+
# Submit an asynchronous "save training checkpoint" operation for a session.
# The 200 response carries an RL.TrainingCheckpointOperation; any other
# outcome is reported through the shared ErrorData schema.
# NOTE(review): nesting is reconstructed from a flattened diff view;
# `parameters` is assumed to sit at operation level - confirm.
/rl/training-sessions/{session_id}/operations/training-checkpoint:
  post:
    summary: Save training checkpoint
    description: >-
      Submits an operation that will asynchronously save the full training
      state (adapter + optimizer + step) to object storage.
    operationId: createTrainingCheckpoint
    tags:
      - RL
    responses:
      '200':
        description: Save training checkpoint operation details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
      # Catch-all for every non-200 outcome.
      default:
        description: An unexpected error response.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
    parameters:
      # Session whose training state is being saved.
      - name: session_id
        in: path
        required: true
        schema:
          description: Training session ID
          type: string
7521+
# Look up a previously submitted "save training checkpoint" operation by ID.
# Returns the same RL.TrainingCheckpointOperation shape as the POST that
# created it, so the status/output/error fields can be inspected.
# NOTE(review): nesting is reconstructed from a flattened diff view;
# `parameters` is assumed to sit at operation level - confirm.
/rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}:
  get:
    summary: Get save training checkpoint operation
    description: >-
      Retrieves the current status and result of a save training checkpoint
      operation.
    operationId: getTrainingCheckpointOperation
    tags:
      - RL
    responses:
      '200':
        description: Save training checkpoint operation details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
      # Catch-all for every non-200 outcome.
      default:
        description: An unexpected error response.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
    parameters:
      # Session that owns the operation.
      - name: session_id
        in: path
        required: true
        schema:
          description: Training session ID
          type: string
      # Identifier of the operation to look up.
      - name: operation_id
        in: path
        required: true
        schema:
          description: Operation ID
          type: string
74957553
/rl/checkpoints/{id}/download:
74967554
get:
74977555
summary: Download checkpoint
@@ -7976,10 +8034,10 @@ components:
79768034
description: Base model to use for the training session
79778035
type: string
79788036
example: meta-llama/Meta-Llama-3-8B-Instruct
7979-
checkpoint_id:
7980-
description: Checkpoint ID to use for the training session
8037+
resume_from_checkpoint_id:
8038+
description: Checkpoint ID to resume from
79818039
type: string
7982-
example: checkpoint-123
8040+
example: 123e4567-e89b-12d3-a456-426614174000
79838041
lora_config:
79848042
$ref: '#/components/schemas/RL.LoraConfig'
79858043
RL.TrainingSessionStatus:
@@ -8015,6 +8073,16 @@ components:
80158073
type: object
80168074
$ref: '#/components/schemas/RL.InferenceCheckpoint'
80178075
description: List of saved inference checkpoints for this session
8076+
# Training checkpoints saved for this session, one RL.TrainingCheckpoint
# entry per checkpoint.
# NOTE(review): `type: object` next to `$ref` mirrors the neighbouring
# inference-checkpoints property; some tooling ignores keys beside `$ref` -
# confirm against the spec version in use.
# NOTE(review): `description` is assumed to belong to the array (not to
# `items`); indentation was lost in the diff view - confirm.
training_checkpoints:
  description: List of saved training checkpoints for this session
  type: array
  items:
    type: object
    $ref: '#/components/schemas/RL.TrainingCheckpoint'
8082+
# ID of the checkpoint this session was resumed from.
# Presumably unset for sessions started from scratch - confirm.
resume_from_checkpoint_id:
  description: Checkpoint ID this session was resumed from
  type: string
  example: '123e4567-e89b-12d3-a456-426614174000'
80188086
step:
80198087
description: Current training step
80208088
type: string
@@ -8151,6 +8219,46 @@ components:
81518219
format: date-time
81528220
example: "2026-01-02T00:00:00Z"
81538221
description: Timestamp when the model was registered
8222+
# A saved training checkpoint: its ID, the training step at which it was
# taken, and its creation time.
RL.TrainingCheckpoint:
  type: object
  description: Saved training checkpoint
  properties:
    id:
      type: string
      example: 123e4567-e89b-12d3-a456-426614174000
      description: Unique identifier for the checkpoint
    step:
      # uint64 values are carried as strings here, so the example must be a
      # string too; an unquoted 42 would be parsed as an integer and would
      # not match the declared type.
      type: string
      format: uint64
      example: "42"
      description: Training step at time of save
    created_at:
      type: string
      format: date-time
      example: "2026-01-02T00:00:00Z"
      description: Timestamp when the checkpoint was created
8240+
# Result payload of a save-training-checkpoint operation; referenced as the
# `output` of RL.TrainingCheckpointOperation.
RL.TrainingCheckpointResult:
  type: object
  properties:
    # Presumably the value to supply as `resume_from_checkpoint_id` when
    # starting a new session - confirm.
    checkpoint_id:
      description: ID of the saved training checkpoint (use for resume via Start)
      type: string
      example: '550e8400-e29b-41d4-a716-446655440000'
8247+
# Envelope for a save-training-checkpoint operation: its ID, its status, and
# the `output` (RL.TrainingCheckpointResult) and `error`
# (RL.TrainingOperationError) fields.
RL.TrainingCheckpointOperation:
  type: object
  properties:
    id:
      description: Operation ID
      type: string
      example: '550e8400-e29b-41d4-a716-446655440000'
    # NOTE(review): under OpenAPI 3.0.x, keys placed beside `$ref` are
    # ignored by tooling, so `example` and `description` may not be
    # rendered - confirm the spec version or wrap the `$ref` in `allOf`.
    status:
      $ref: '#/components/schemas/RL.TrainingOperationStatus'
      description: Operation status
      example: TRAINING_OPERATION_STATUS_PENDING
    output:
      $ref: '#/components/schemas/RL.TrainingCheckpointResult'
    error:
      $ref: '#/components/schemas/RL.TrainingOperationError'
81548262
RL.CheckpointVariant:
81558263
type: string
81568264
enum:

0 commit comments

Comments
 (0)