@@ -7492,6 +7492,64 @@ paths:
74927492 schema :
74937493 description : Operation ID
74947494 type : string
# Submits an asynchronous "save training checkpoint" operation for a session.
# NOTE(review): nesting was reconstructed from a whitespace-collapsed view;
# `parameters` is assumed to belong to the operation - confirm.
/rl/training-sessions/{session_id}/operations/training-checkpoint:
  post:
    tags:
      - RL
    operationId: createTrainingCheckpoint
    summary: Save training checkpoint
    description: >-
      Submits an operation that will asynchronously save the full training
      state (adapter + optimizer + step) to object storage.
    responses:
      # Success: the body describes the submitted operation.
      '200':
        description: Save training checkpoint operation details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
      # Any other status code is reported as ErrorData.
      default:
        description: An unexpected error response.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
    parameters:
      - in: path
        name: session_id
        required: true
        schema:
          type: string
          description: Training session ID
# Reads back the status and result of a save-training-checkpoint operation.
# NOTE(review): nesting was reconstructed from a whitespace-collapsed view;
# `parameters` is assumed to belong to the operation - confirm.
/rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}:
  get:
    tags:
      - RL
    operationId: getTrainingCheckpointOperation
    summary: Get save training checkpoint operation
    description: >-
      Retrieves the current status and result of a save training checkpoint
      operation.
    responses:
      # Success: same operation schema that the submitting POST returns.
      '200':
        description: Save training checkpoint operation details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
      # Any other status code is reported as ErrorData.
      default:
        description: An unexpected error response.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
    parameters:
      - in: path
        name: session_id
        required: true
        schema:
          type: string
          description: Training session ID
      - in: path
        name: operation_id
        required: true
        schema:
          type: string
          description: Operation ID
74957553 /rl/checkpoints/{id}/download :
74967554 get :
74977555 summary : Download checkpoint
@@ -7976,10 +8034,10 @@ components:
79768034 description : Base model to use for the training session
79778035 type : string
79788036 example : meta-llama/Meta-Llama-3-8B-Instruct
7979- checkpoint_id :
7980- description : Checkpoint ID to use for the training session
8037+ resume_from_checkpoint_id :
8038+ description : Checkpoint ID to resume from
79818039 type : string
7982- example : checkpoint-123
8040+ example : 123e4567-e89b-12d3-a456-426614174000
79838041 lora_config :
79848042 $ref : ' #/components/schemas/RL.LoraConfig'
79858043 RL.TrainingSessionStatus :
@@ -8015,6 +8073,16 @@ components:
80158073 type : object
80168074 $ref : ' #/components/schemas/RL.InferenceCheckpoint'
80178075 description : List of saved inference checkpoints for this session
8076+ training_checkpoints :
8077+ type : array
8078+ items :
8079+ type : object
8080+ $ref : ' #/components/schemas/RL.TrainingCheckpoint'
8081+ description : List of saved training checkpoints for this session
8082+ resume_from_checkpoint_id :
8083+ type : string
8084+ example : 123e4567-e89b-12d3-a456-426614174000
8085+ description : Checkpoint ID this session was resumed from
80188086 step :
80198087 description : Current training step
80208088 type : string
@@ -8151,6 +8219,46 @@ components:
81518219 format : date-time
81528220 example : " 2026-01-02T00:00:00Z"
81538221 description : Timestamp when the model was registered
# One saved training checkpoint: its ID, the step it was taken at, and when
# it was created.
RL.TrainingCheckpoint:
  type: object
  description: Saved training checkpoint
  properties:
    id:
      type: string
      # Quoted so the value can never be re-typed by a YAML loader.
      example: '123e4567-e89b-12d3-a456-426614174000'
      description: Unique identifier for the checkpoint
    step:
      # Declared as a string with a uint64 format, so the example must be a
      # string too; a bare 42 would load as an integer and contradict `type`.
      type: string
      format: uint64
      example: '42'
      description: Training step at time of save
    created_at:
      type: string
      format: date-time
      # Quoted so the loader keeps it a string instead of parsing a timestamp.
      example: "2026-01-02T00:00:00Z"
      description: Timestamp when the checkpoint was created
# Payload referenced by the `output` field of RL.TrainingCheckpointOperation.
RL.TrainingCheckpointResult:
  type: object
  properties:
    checkpoint_id:
      description: ID of the saved training checkpoint (use for resume via Start)
      type: string
      example: '550e8400-e29b-41d4-a716-446655440000'
# Envelope for a save-training-checkpoint operation: an ID and status, plus
# references to the result and error schemas.
RL.TrainingCheckpointOperation:
  type: object
  properties:
    id:
      description: Operation ID
      type: string
      example: '550e8400-e29b-41d4-a716-446655440000'
    status:
      # NOTE(review): `example` and `description` sit next to `$ref`. OpenAPI
      # 3.0 tooling ignores siblings of `$ref`; 3.1 honours them. Confirm the
      # spec version, and wrap the reference in `allOf` if it is 3.0.
      $ref: '#/components/schemas/RL.TrainingOperationStatus'
      description: Operation status
      example: TRAINING_OPERATION_STATUS_PENDING
    output:
      $ref: '#/components/schemas/RL.TrainingCheckpointResult'
    error:
      $ref: '#/components/schemas/RL.TrainingOperationError'
81548262 RL.CheckpointVariant :
81558263 type : string
81568264 enum :
0 commit comments