@@ -23,20 +23,22 @@ import (
2323// callbackWorkUnitFactory wraps a callback function to implement workunit.WorkUnitFactory.
2424type callbackWorkUnitFactory struct {
2525 Callback func (ctx context.Context , jobRow * rivertype.JobRow ) error // handed to every unit built by MakeUnit
26+ timeout time.Duration // defaults to 0, which signals default timeout
2627}
2728
2829func (w * callbackWorkUnitFactory ) MakeUnit (jobRow * rivertype.JobRow ) workunit.WorkUnit { // builds a unit for jobRow that shares the factory's callback
29- return & callbackWorkUnit {callback : w .Callback , jobRow : jobRow }
30+ return & callbackWorkUnit {callback : w .Callback , jobRow : jobRow , timeout : w . timeout } // timeout is copied so the unit can report it
3031}
3132
3233// callbackWorkUnit implements workunit.WorkUnit for a job row and a callback.
3334type callbackWorkUnit struct {
3435 callback func (ctx context.Context , jobRow * rivertype.JobRow ) error // invoked by Work
3536 jobRow * rivertype.JobRow // passed to callback when the unit is worked
37+ timeout time.Duration // defaults to 0, which signals default timeout
3638}
3739
3840func (w * callbackWorkUnit ) NextRetry () time.Time { return time .Now ().Add (30 * time .Second ) } // fixed retry time: always 30 seconds from now
39- func (w * callbackWorkUnit ) Timeout () time.Duration { return 0 }
41+ func (w * callbackWorkUnit ) Timeout () time.Duration { return w . timeout } // reports the timeout stored on the unit instead of a hard-coded 0
4042func (w * callbackWorkUnit ) Work (ctx context.Context ) error { return w .callback (ctx , w .jobRow ) } // runs the callback with this unit's job row and returns its error
4143func (w * callbackWorkUnit ) UnmarshalJob () error { return nil } // no-op: always succeeds without decoding anything
4244
@@ -51,10 +53,13 @@ func (p *SimpleClientRetryPolicy) NextRetry(job *rivertype.JobRow) time.Time {
5153func TestJobRescuer (t * testing.T ) {
5254 t .Parallel ()
5355
54- const rescuerJobKind = "rescuer"
55-
5656 ctx := context .Background ()
5757
58+ const (
59+ rescuerJobKind = "rescuer"
60+ rescuerJobKindLongTimeout = "rescuer_long_timeout"
61+ )
62+
5863 type testBundle struct {
5964 exec riverdriver.Executor
6065 rescueHorizon time.Time
@@ -76,8 +81,13 @@ func TestJobRescuer(t *testing.T) {
7681 Interval : JobRescuerIntervalDefault ,
7782 RescueAfter : JobRescuerRescueAfterDefault ,
7883 WorkUnitFactoryFunc : func (kind string ) workunit.WorkUnitFactory {
79- if kind == rescuerJobKind {
80- return & callbackWorkUnitFactory {Callback : func (ctx context.Context , jobRow * rivertype.JobRow ) error { return nil }}
84+ emptyCallback := func (ctx context.Context , jobRow * rivertype.JobRow ) error { return nil }
85+
86+ switch kind {
87+ case rescuerJobKind :
88+ return & callbackWorkUnitFactory {Callback : emptyCallback }
89+ case rescuerJobKindLongTimeout :
90+ return & callbackWorkUnitFactory {Callback : emptyCallback , timeout : JobRescuerRescueAfterDefault + 5 * time .Minute }
8191 }
8292 panic ("unhandled kind: " + kind )
8393 },
@@ -135,11 +145,18 @@ func TestJobRescuer(t *testing.T) {
135145 stuckToCancelJob1 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKind ), State : ptrutil .Ptr (rivertype .JobStateRunning ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), Metadata : []byte (fmt .Sprintf (`{"cancel_attempted_at": %q}` , cancelTime )), MaxAttempts : ptrutil .Ptr (5 )})
136146 stuckToCancelJob2 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKind ), State : ptrutil .Ptr (rivertype .JobStateRunning ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (1 * time .Minute )), Metadata : []byte (fmt .Sprintf (`{"cancel_attempted_at": %q}` , cancelTime )), MaxAttempts : ptrutil .Ptr (5 )}) // won't be rescued
137147
138- // these aren't touched:
148+ // these aren't touched because they're in ineligible states
139149 notRunningJob1 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKind ), FinalizedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), State : ptrutil .Ptr (rivertype .JobStateCompleted ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), MaxAttempts : ptrutil .Ptr (5 )})
140150 notRunningJob2 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKind ), FinalizedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), State : ptrutil .Ptr (rivertype .JobStateDiscarded ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), MaxAttempts : ptrutil .Ptr (5 )})
141151 notRunningJob3 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKind ), FinalizedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), State : ptrutil .Ptr (rivertype .JobStateCancelled ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Hour )), MaxAttempts : ptrutil .Ptr (5 )})
142152
153+ // Jobs with worker-specific long timeouts. The first isn't rescued
154+ // because the difference between its `attempted_at` and now is still
155+ // within the timeout threshold. The second _is_ rescued because it
156+ // started earlier and even with the longer timeout, has still timed out.
157+ longTimeOutJob1 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKindLongTimeout ), State : ptrutil .Ptr (rivertype .JobStateRunning ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 1 * time .Minute )), MaxAttempts : ptrutil .Ptr (5 )})
158+ longTimeOutJob2 := testfactory .Job (ctx , t , bundle .exec , & testfactory.JobOpts {Kind : ptrutil .Ptr (rescuerJobKindLongTimeout ), State : ptrutil .Ptr (rivertype .JobStateRunning ), AttemptedAt : ptrutil .Ptr (bundle .rescueHorizon .Add (- 6 * time .Minute )), MaxAttempts : ptrutil .Ptr (5 )})
159+
143160 require .NoError (cleaner .Start (ctx ))
144161
145162 cleaner .TestSignals .FetchedBatch .WaitOrTimeout ()
@@ -158,37 +175,44 @@ func TestJobRescuer(t *testing.T) {
158175 require .NoError (err )
159176 require .Equal (stuckToRetryJob3 .State , job3After .State ) // not rescued
160177
161- discard1After , err := bundle .exec .JobGetByID (ctx , stuckToDiscardJob1 .ID )
178+ discardJob1After , err := bundle .exec .JobGetByID (ctx , stuckToDiscardJob1 .ID )
162179 require .NoError (err )
163- require .Equal (rivertype .JobStateDiscarded , discard1After .State )
164- require .WithinDuration (time .Now (), * discard1After .FinalizedAt , 5 * time .Second )
165- require .Len (discard1After .Errors , 1 )
180+ require .Equal (rivertype .JobStateDiscarded , discardJob1After .State )
181+ require .WithinDuration (time .Now (), * discardJob1After .FinalizedAt , 5 * time .Second )
182+ require .Len (discardJob1After .Errors , 1 )
166183
167- discard2After , err := bundle .exec .JobGetByID (ctx , stuckToDiscardJob2 .ID )
184+ discardJob2After , err := bundle .exec .JobGetByID (ctx , stuckToDiscardJob2 .ID )
168185 require .NoError (err )
169- require .Equal (rivertype .JobStateRunning , discard2After .State )
170- require .Nil (discard2After .FinalizedAt )
186+ require .Equal (rivertype .JobStateRunning , discardJob2After .State )
187+ require .Nil (discardJob2After .FinalizedAt )
171188
172- cancel1After , err := bundle .exec .JobGetByID (ctx , stuckToCancelJob1 .ID )
189+ cancelJob1After , err := bundle .exec .JobGetByID (ctx , stuckToCancelJob1 .ID )
173190 require .NoError (err )
174- require .Equal (rivertype .JobStateCancelled , cancel1After .State )
175- require .WithinDuration (time .Now (), * cancel1After .FinalizedAt , 5 * time .Second )
176- require .Len (cancel1After .Errors , 1 )
191+ require .Equal (rivertype .JobStateCancelled , cancelJob1After .State )
192+ require .WithinDuration (time .Now (), * cancelJob1After .FinalizedAt , 5 * time .Second )
193+ require .Len (cancelJob1After .Errors , 1 )
177194
178- cancel2After , err := bundle .exec .JobGetByID (ctx , stuckToCancelJob2 .ID )
195+ cancelJob2After , err := bundle .exec .JobGetByID (ctx , stuckToCancelJob2 .ID )
179196 require .NoError (err )
180- require .Equal (rivertype .JobStateRunning , cancel2After .State )
181- require .Nil (cancel2After .FinalizedAt )
197+ require .Equal (rivertype .JobStateRunning , cancelJob2After .State )
198+ require .Nil (cancelJob2After .FinalizedAt )
182199
183- notRunning1After , err := bundle .exec .JobGetByID (ctx , notRunningJob1 .ID )
200+ notRunningJob1After , err := bundle .exec .JobGetByID (ctx , notRunningJob1 .ID )
184201 require .NoError (err )
185- require .Equal (notRunning1After .State , notRunningJob1 .State )
186- notRunning2After , err := bundle .exec .JobGetByID (ctx , notRunningJob2 .ID )
202+ require .Equal (notRunningJob1 .State , notRunningJob1After .State )
203+ notRunningJob2After , err := bundle .exec .JobGetByID (ctx , notRunningJob2 .ID )
204+ require .NoError (err )
205+ require .Equal (notRunningJob2 .State , notRunningJob2After .State )
206+ notRunningJob3After , err := bundle .exec .JobGetByID (ctx , notRunningJob3 .ID )
207+ require .NoError (err )
208+ require .Equal (notRunningJob3 .State , notRunningJob3After .State )
209+
210+ notTimedOutJob1After , err := bundle .exec .JobGetByID (ctx , longTimeOutJob1 .ID )
187211 require .NoError (err )
188- require .Equal (notRunning2After . State , notRunningJob2 .State )
189- notRunning3After , err := bundle .exec .JobGetByID (ctx , notRunningJob3 .ID )
212+ require .Equal (rivertype . JobStateRunning , notTimedOutJob1After .State )
213+ notTimedOutJob2After , err := bundle .exec .JobGetByID (ctx , longTimeOutJob2 .ID )
190214 require .NoError (err )
191- require .Equal (notRunning3After . State , notRunningJob3 .State )
215+ require .Equal (rivertype . JobStateRetryable , notTimedOutJob2After .State )
192216 })
193217
194218 t .Run ("RescuesInBatches" , func (t * testing.T ) {
0 commit comments