@@ -50,9 +50,6 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken)
5050
5151 List < GithubTargetConfiguration > targetConfig = Program . Config . TargetConfigs ;
5252
53- // Cull runners
54- List < Server > allHtzSrvs = await _cc . GetAllServersFromCsp ( ) ;
55-
5653 await CleanUpRunners ( targetConfig ) ;
5754 await StartPoolRunners ( targetConfig ) ;
5855 _logger . LogInformation ( "Poolmanager init done." ) ;
@@ -297,7 +294,9 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
297294 Arch = arch ,
298295 IPv4 = string . Empty ,
299296 IsCustom = profile != "default" ,
300- Owner = stuckJob . Owner
297+ Owner = stuckJob . Owner ,
298+ StuckJobReplacement = true
299+
301300 } ;
302301 await db . Runners . AddAsync ( newRunner ) ;
303302 await db . SaveChangesAsync ( ) ;
@@ -422,55 +421,64 @@ private async Task CleanUpRunners(List<GithubTargetConfiguration> targetConfigs)
422421 }
423422
424423 // Remove every VM that's not in the github registered runners
425- List < Server > remainingHtzServer = await _cc . GetAllServersFromCsp ( ) ;
426- foreach ( Server htzSrv in remainingHtzServer )
424+ try
427425 {
428- if ( registeredServerNames . Contains ( htzSrv . Name ) )
426+ List < Server > remainingHtzServer = await _cc . GetAllServersFromCsp ( ) ;
427+ foreach ( Server htzSrv in remainingHtzServer )
429428 {
430- // If we know the server in github, skip
431- continue ;
432- }
433- _logger . LogInformation ( $ "{ htzSrv . Name } is a candidate to be killed from Hetzner") ;
429+ if ( registeredServerNames . Contains ( htzSrv . Name ) )
430+ {
431+ // If we know the server in github, skip
432+ continue ;
433+ }
434434
435- var runner = await db . Runners . Include ( x => x . Lifecycle ) . FirstOrDefaultAsync ( x => x . CloudServerId == htzSrv . Id ) ;
436- if ( runner == null )
437- {
438- _logger . LogInformation ( $ "{ htzSrv . Name } is not found in the database") ;
439- continue ;
440- }
441- if ( runner . Lifecycle . Any ( x => x . Status == RunnerStatus . DeletionQueued ) )
442- {
443- runner . Lifecycle . Add ( new ( )
435+ _logger . LogInformation ( $ "{ htzSrv . Name } is a candidate to be killed from Hetzner") ;
436+
437+ var runner = await db . Runners . Include ( x => x . Lifecycle ) . FirstOrDefaultAsync ( x => x . CloudServerId == htzSrv . Id ) ;
438+ if ( runner == null )
444439 {
445- Status = RunnerStatus . DeletionQueued ,
446- Event = "Don't queue deletion due to Github registration. Runner already queued for deletion." ,
447- EventTimeUtc = DateTime . UtcNow
448- } ) ;
449- await db . SaveChangesAsync ( ) ;
450-
451- }
452- else if ( ( runner . LastState >= RunnerStatus . Provisioned && DateTime . UtcNow - runner . LastStateTime > TimeSpan . FromMinutes ( 5 ) ) ||
453- ( runner . LastState != RunnerStatus . Processing && DateTime . UtcNow - htzSrv . Created . ToUniversalTime ( ) > TimeSpan . FromMinutes ( 40 ) ) )
454- {
455- _logger . LogInformation ( $ "Removing VM that is not in any GitHub registration: { htzSrv . Name } created at { htzSrv . Created : u} ") ;
456- runner . IsOnline = false ;
457- runner . Lifecycle . Add ( new ( )
440+ _logger . LogInformation ( $ "{ htzSrv . Name } is not found in the database") ;
441+ continue ;
442+ }
443+
444+ if ( runner . Lifecycle . Any ( x => x . Status == RunnerStatus . DeletionQueued ) )
458445 {
459- Status = RunnerStatus . DeletionQueued ,
460- Event = "Removing as VM not longer in any GitHub registration" ,
461- EventTimeUtc = DateTime . UtcNow
462- } ) ;
463- await db . SaveChangesAsync ( ) ;
464- _queues . DeleteTasks . Enqueue ( new ( )
446+ runner . Lifecycle . Add ( new ( )
447+ {
448+ Status = RunnerStatus . DeletionQueued ,
449+ Event = "Don't queue deletion due to Github registration. Runner already queued for deletion." ,
450+ EventTimeUtc = DateTime . UtcNow
451+ } ) ;
452+ await db . SaveChangesAsync ( ) ;
453+
454+ }
455+ else if ( ( runner . LastState >= RunnerStatus . Provisioned && DateTime . UtcNow - runner . LastStateTime > TimeSpan . FromMinutes ( 5 ) ) ||
456+ ( runner . LastState != RunnerStatus . Processing && DateTime . UtcNow - htzSrv . Created . ToUniversalTime ( ) > TimeSpan . FromMinutes ( 40 ) ) )
465457 {
466- RunnerDbId = runner . RunnerId ,
467- ServerId = htzSrv . Id
468- } ) ;
469-
458+ _logger . LogInformation ( $ "Removing VM that is not in any GitHub registration: { htzSrv . Name } created at { htzSrv . Created : u} ") ;
459+ runner . IsOnline = false ;
460+ runner . Lifecycle . Add ( new ( )
461+ {
462+ Status = RunnerStatus . DeletionQueued ,
463+ Event = "Removing as VM not longer in any GitHub registration" ,
464+ EventTimeUtc = DateTime . UtcNow
465+ } ) ;
466+ await db . SaveChangesAsync ( ) ;
467+ _queues . DeleteTasks . Enqueue ( new ( )
468+ {
469+ RunnerDbId = runner . RunnerId ,
470+ ServerId = htzSrv . Id
471+ } ) ;
472+
473+ }
474+
470475 }
471-
472476 }
473-
477+ catch ( Exception ex )
478+ {
479+ _logger . LogError ( $ "Failed during cleanup from CSP: { ex . Message } ") ;
480+ }
481+
474482 }
475483
476484 private async Task < bool > DeleteRunner ( DeleteRunnerTask rt )
@@ -564,13 +572,14 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
564572 Event = $ "Unable to create runner [{ runner . Size } on { runner . Arch } | Retry: { rt . RetryCount } ]: { ex . Message } "
565573 } ) ;
566574 rt . RetryCount += 1 ;
567- if ( rt . RetryCount < 3 )
575+ // Don't retry stuck job runners - the stuck job detector will create retry servers
576+ if ( rt . RetryCount < 3 && ! runner . StuckJobReplacement )
568577 {
569578 _queues . CreateTasks . Enqueue ( rt ) ;
570579 }
571580 else
572581 {
573- _logger . LogError ( $ "Retries exceeded for { runner . Size } on { runner . Arch } . giving up.") ;
582+ _logger . LogError ( runner . StuckJobReplacement ? $ "Retries exceeded for { runner . Size } on { runner . Arch } . giving up. (Stuck job replacement)" : $ "Retries exceeded for { runner . Size } on { runner . Arch } . giving up.") ;
574583 runner . Lifecycle . Add ( new RunnerLifecycle
575584 {
576585 Status = RunnerStatus . Failure ,
0 commit comments