Skip to content

Commit e72f5c9

Browse files
authored
Merge pull request #227 from togethercomputer/sbassam/update-rl-specs
MOSH-1979: Sync RL API OpenAPI spec with latest shaping protos to incorporate training checkpoint saving/resuming
2 parents 08c075d + 4642360 commit e72f5c9

1 file changed

Lines changed: 111 additions & 3 deletions

File tree

openapi.yaml

Lines changed: 111 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7493,6 +7493,64 @@ paths:
74937493
schema:
74947494
description: Operation ID
74957495
type: string
7496+
# Save-training-checkpoint endpoint. Per the description below, the POST
# submits an asynchronous save of the full training state; the "200" body is
# an RL.TrainingCheckpointOperation.
# NOTE(review): indentation is lost in this diff view, so whether `parameters`
# is operation-level or path-level cannot be confirmed from here.
/rl/training-sessions/{session_id}/operations/training-checkpoint:
7497+
post:
7498+
summary: Save training checkpoint
7499+
description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step).
7500+
operationId: createTrainingCheckpoint
7501+
tags: [RL]
7502+
responses:
7503+
# Success: the operation record for the submitted save.
"200":
7504+
description: Save training checkpoint operation details
7505+
content:
7506+
application/json:
7507+
schema:
7508+
$ref: '#/components/schemas/RL.TrainingCheckpointOperation'
7509+
# Catch-all for any other status; body follows the ErrorData schema.
default:
7510+
description: An unexpected error response.
7511+
content:
7512+
application/json:
7513+
schema:
7514+
$ref: '#/components/schemas/ErrorData'
7515+
parameters:
7516+
# Required path parameter bound to {session_id} in the URL template.
- name: session_id
7517+
in: path
7518+
required: true
7519+
schema:
7520+
description: Training session ID
7521+
type: string
7522+
# Polling endpoint for a save-training-checkpoint operation. Per the
# description below, the GET retrieves the operation's current status and
# result; the "200" body is an RL.TrainingCheckpointOperation.
# NOTE(review): indentation is lost in this diff view, so whether `parameters`
# is operation-level or path-level cannot be confirmed from here.
/rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}:
7523+
get:
7524+
summary: Get save training checkpoint operation
7525+
description: Retrieves the current status and result of a save training checkpoint operation.
7526+
operationId: getTrainingCheckpointOperation
7527+
tags: [RL]
7528+
responses:
7529+
# Success: the operation record.
"200":
7530+
description: Save training checkpoint operation details
7531+
content:
7532+
application/json:
7533+
schema:
7534+
$ref: '#/components/schemas/RL.TrainingCheckpointOperation'
7535+
# Catch-all for any other status; body follows the ErrorData schema.
default:
7536+
description: An unexpected error response.
7537+
content:
7538+
application/json:
7539+
schema:
7540+
$ref: '#/components/schemas/ErrorData'
7541+
parameters:
7542+
# Required path parameter bound to {session_id} in the URL template.
- name: session_id
7543+
in: path
7544+
required: true
7545+
schema:
7546+
description: Training session ID
7547+
type: string
7548+
# Required path parameter bound to {operation_id} in the URL template.
- name: operation_id
7549+
in: path
7550+
required: true
7551+
schema:
7552+
description: Operation ID
7553+
type: string
74967554
/rl/checkpoints/{id}/download:
74977555
get:
74987556
summary: Download checkpoint
@@ -7977,10 +8035,10 @@ components:
79778035
description: Base model to use for the training session
79788036
type: string
79798037
example: meta-llama/Meta-Llama-3-8B-Instruct
7980-
checkpoint_id:
7981-
description: Checkpoint ID to use for the training session
8038+
resume_from_checkpoint_id:
8039+
description: Checkpoint ID to resume from
79828040
type: string
7983-
example: checkpoint-123
8041+
example: 123e4567-e89b-12d3-a456-426614174000
79848042
lora_config:
79858043
$ref: '#/components/schemas/RL.LoraConfig'
79868044
RL.TrainingSessionStatus:
@@ -8016,6 +8074,16 @@ components:
80168074
type: object
80178075
$ref: '#/components/schemas/RL.InferenceCheckpoint'
80188076
description: List of saved inference checkpoints for this session
8077+
# Training checkpoints saved for this session; each array item refers to the
# RL.TrainingCheckpoint schema.
training_checkpoints:
  description: List of saved training checkpoints for this session
  type: array
  items:
    $ref: "#/components/schemas/RL.TrainingCheckpoint"
    type: object
8083+
# ID of the checkpoint this session was resumed from.
resume_from_checkpoint_id:
  description: Checkpoint ID this session was resumed from
  type: string
  example: "123e4567-e89b-12d3-a456-426614174000"
80198087
step:
80208088
description: Current training step
80218089
type: string
@@ -8152,6 +8220,46 @@ components:
81528220
format: date-time
81538221
example: "2026-01-02T00:00:00Z"
81548222
description: Timestamp when the model was registered
8223+
# A saved training checkpoint: its identifier, the training step at which it
# was saved, and its creation timestamp.
RL.TrainingCheckpoint:
  type: object
  description: Saved training checkpoint
  properties:
    id:
      type: string
      # Quoted so the ID can never be retyped by a YAML parser.
      example: "123e4567-e89b-12d3-a456-426614174000"
      description: Unique identifier for the checkpoint
    step:
      type: string
      format: uint64
      # The value is carried as a string (type: string, format: uint64), so the
      # example must be a string too; a bare 42 parses as an integer and does
      # not validate against this schema.
      example: "42"
      description: Training step at time of save
    created_at:
      type: string
      format: date-time
      example: "2026-01-02T00:00:00Z"
      description: Timestamp when the checkpoint was created
8241+
# Result payload of a save-training-checkpoint operation; referenced as the
# `output` property of RL.TrainingCheckpointOperation.
RL.TrainingCheckpointResult:
  type: object
  properties:
    checkpoint_id:
      description: ID of the saved training checkpoint (use for resume via Start)
      type: string
      example: "550e8400-e29b-41d4-a716-446655440000"
8248+
# Operation record returned by the training-checkpoint endpoints: operation
# ID, status, and the `output` / `error` payloads.
RL.TrainingCheckpointOperation:
  type: object
  properties:
    id:
      description: Operation ID
      type: string
      example: "550e8400-e29b-41d4-a716-446655440000"
    status:
      # NOTE(review): `description` and `example` sit beside `$ref`; OpenAPI 3.0
      # tooling ignores siblings of `$ref` - confirm the spec version in use.
      $ref: "#/components/schemas/RL.TrainingOperationStatus"
      description: Operation status
      example: TRAINING_OPERATION_STATUS_PENDING
    output:
      $ref: "#/components/schemas/RL.TrainingCheckpointResult"
    error:
      $ref: "#/components/schemas/RL.TrainingOperationError"
81558263
RL.CheckpointVariant:
81568264
type: string
81578265
enum:

0 commit comments

Comments
 (0)