@@ -7493,6 +7493,64 @@ paths:
74937493 schema :
74947494 description : Operation ID
74957495 type : string
# Submit an asynchronous "save training checkpoint" operation for a session.
# The 200 response carries the operation record, not the finished checkpoint.
/rl/training-sessions/{session_id}/operations/training-checkpoint:
  post:
    summary: Save training checkpoint
    description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step).
    operationId: createTrainingCheckpoint
    tags: [RL]
    responses:
      "200":
        description: Save training checkpoint operation details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
      default:
        description: An unexpected error response.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
    # NOTE(review): nesting level of `parameters` (operation vs. path item)
    # was reconstructed from flattened source - confirm against the file.
    parameters:
      - name: session_id
        in: path
        required: true
        schema:
          description: Training session ID
          type: string
# Fetch the status/result of a previously submitted "save training
# checkpoint" operation, addressed by session ID and operation ID.
/rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}:
  get:
    summary: Get save training checkpoint operation
    description: Retrieves the current status and result of a save training checkpoint operation.
    operationId: getTrainingCheckpointOperation
    tags: [RL]
    responses:
      "200":
        description: Save training checkpoint operation details
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
      default:
        description: An unexpected error response.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
    # NOTE(review): nesting level of `parameters` (operation vs. path item)
    # was reconstructed from flattened source - confirm against the file.
    parameters:
      - name: session_id
        in: path
        required: true
        schema:
          description: Training session ID
          type: string
      - name: operation_id
        in: path
        required: true
        schema:
          description: Operation ID
          type: string
74967554 /rl/checkpoints/{id}/download :
74977555 get :
74987556 summary : Download checkpoint
@@ -7977,10 +8035,10 @@ components:
79778035 description : Base model to use for the training session
79788036 type : string
79798037 example : meta-llama/Meta-Llama-3-8B-Instruct
7980- checkpoint_id :
7981- description : Checkpoint ID to use for the training session
8038+ resume_from_checkpoint_id :
8039+ description : Checkpoint ID to resume from
79828040 type : string
7983- example : checkpoint-123
8041+ example : 123e4567-e89b-12d3-a456-426614174000
79848042 lora_config :
79858043 $ref : ' #/components/schemas/RL.LoraConfig'
79868044 RL.TrainingSessionStatus :
@@ -8016,6 +8074,16 @@ components:
80168074 type : object
80178075 $ref : ' #/components/schemas/RL.InferenceCheckpoint'
80188076 description : List of saved inference checkpoints for this session
# Array of RL.TrainingCheckpoint entries attached to the session.
training_checkpoints:
  type: array
  items:
    type: object
    $ref: '#/components/schemas/RL.TrainingCheckpoint'
  description: List of saved training checkpoints for this session
# String ID of the checkpoint the session was resumed from.
resume_from_checkpoint_id:
  description: Checkpoint ID this session was resumed from
  type: string
  example: '123e4567-e89b-12d3-a456-426614174000'
80198087 step :
80208088 description : Current training step
80218089 type : string
@@ -8152,6 +8220,46 @@ components:
81528220 format : date-time
81538221 example : " 2026-01-02T00:00:00Z"
81548222 description : Timestamp when the model was registered
# Schema for one saved training checkpoint: its ID, the training step it
# was taken at, and its creation timestamp.
RL.TrainingCheckpoint:
  type: object
  description: Saved training checkpoint
  properties:
    id:
      type: string
      example: 123e4567-e89b-12d3-a456-426614174000
      description: Unique identifier for the checkpoint
    step:
      type: string
      format: uint64
      # `step` is declared as a string, so the example must be a string too;
      # an unquoted 42 would be parsed as an integer.
      example: "42"
      description: Training step at time of save
    created_at:
      type: string
      format: date-time
      example: "2026-01-02T00:00:00Z"
      description: Timestamp when the checkpoint was created
# Payload referenced as `output` by RL.TrainingCheckpointOperation.
RL.TrainingCheckpointResult:
  type: object
  description: Output of a save training checkpoint operation
  properties:
    checkpoint_id:
      type: string
      example: 550e8400-e29b-41d4-a716-446655440000
      description: ID of the saved training checkpoint (use for resume via Start)
# Operation record returned by both training-checkpoint endpoints:
# an ID, a status, and the result (`output`) or failure (`error`) schemas.
RL.TrainingCheckpointOperation:
  type: object
  description: Save training checkpoint operation details
  properties:
    id:
      type: string
      example: 550e8400-e29b-41d4-a716-446655440000
      description: Operation ID
    status:
      # NOTE(review): `example`/`description` next to `$ref` are honored in
      # OpenAPI 3.1 but ignored by 3.0 tooling - confirm the spec version.
      $ref: '#/components/schemas/RL.TrainingOperationStatus'
      example: TRAINING_OPERATION_STATUS_PENDING
      description: Operation status
    output:
      $ref: '#/components/schemas/RL.TrainingCheckpointResult'
    error:
      $ref: '#/components/schemas/RL.TrainingOperationError'
81558263 RL.CheckpointVariant :
81568264 type : string
81578265 enum :
0 commit comments