Skip to content

Commit 9c63003

Browse files
make stuck jobs and the HTZ interface more resilient
1 parent 8f069f6 commit 9c63003

5 files changed

Lines changed: 65 additions & 49 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
.DS_Store
2+
*.user
13
bin/
24
obj/
35
/packages/

Database/DbContext.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ public DateTime LastStateTime
106106
return Lifecycle.MaxBy(x => x.EventTimeUtc).EventTimeUtc;
107107
}
108108
}
109+
110+
public bool StuckJobReplacement { get; set; } = false;
109111
}
110112

111113
public enum RunnerStatus

PoolManager.cs

Lines changed: 56 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,6 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken)
5050

5151
List<GithubTargetConfiguration> targetConfig = Program.Config.TargetConfigs;
5252

53-
// Cull runners
54-
List<Server> allHtzSrvs = await _cc.GetAllServersFromCsp();
55-
5653
await CleanUpRunners(targetConfig);
5754
await StartPoolRunners(targetConfig);
5855
_logger.LogInformation("Poolmanager init done.");
@@ -297,7 +294,9 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
297294
Arch = arch,
298295
IPv4 = string.Empty,
299296
IsCustom = profile != "default",
300-
Owner = stuckJob.Owner
297+
Owner = stuckJob.Owner,
298+
StuckJobReplacement = true
299+
301300
};
302301
await db.Runners.AddAsync(newRunner);
303302
await db.SaveChangesAsync();
@@ -422,55 +421,64 @@ private async Task CleanUpRunners(List<GithubTargetConfiguration> targetConfigs)
422421
}
423422

424423
// Remove every VM that's not in the github registered runners
425-
List<Server> remainingHtzServer = await _cc.GetAllServersFromCsp();
426-
foreach (Server htzSrv in remainingHtzServer)
424+
try
427425
{
428-
if (registeredServerNames.Contains(htzSrv.Name))
426+
List<Server> remainingHtzServer = await _cc.GetAllServersFromCsp();
427+
foreach (Server htzSrv in remainingHtzServer)
429428
{
430-
// If we know the server in github, skip
431-
continue;
432-
}
433-
_logger.LogInformation($"{htzSrv.Name} is a candidate to be killed from Hetzner");
429+
if (registeredServerNames.Contains(htzSrv.Name))
430+
{
431+
// If we know the server in github, skip
432+
continue;
433+
}
434434

435-
var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.CloudServerId == htzSrv.Id);
436-
if (runner == null)
437-
{
438-
_logger.LogInformation($"{htzSrv.Name} is not found in the database");
439-
continue;
440-
}
441-
if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued))
442-
{
443-
runner.Lifecycle.Add(new()
435+
_logger.LogInformation($"{htzSrv.Name} is a candidate to be killed from Hetzner");
436+
437+
var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.CloudServerId == htzSrv.Id);
438+
if (runner == null)
444439
{
445-
Status = RunnerStatus.DeletionQueued,
446-
Event = "Don't queue deletion due to Github registration. Runner already queued for deletion.",
447-
EventTimeUtc = DateTime.UtcNow
448-
});
449-
await db.SaveChangesAsync();
450-
451-
}
452-
else if ((runner.LastState >= RunnerStatus.Provisioned && DateTime.UtcNow - runner.LastStateTime > TimeSpan.FromMinutes(5)) ||
453-
(runner.LastState != RunnerStatus.Processing && DateTime.UtcNow - htzSrv.Created.ToUniversalTime() > TimeSpan.FromMinutes(40)))
454-
{
455-
_logger.LogInformation($"Removing VM that is not in any GitHub registration: {htzSrv.Name} created at {htzSrv.Created:u}");
456-
runner.IsOnline = false;
457-
runner.Lifecycle.Add(new()
440+
_logger.LogInformation($"{htzSrv.Name} is not found in the database");
441+
continue;
442+
}
443+
444+
if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued))
458445
{
459-
Status = RunnerStatus.DeletionQueued,
460-
Event = "Removing as VM not longer in any GitHub registration",
461-
EventTimeUtc = DateTime.UtcNow
462-
});
463-
await db.SaveChangesAsync();
464-
_queues.DeleteTasks.Enqueue(new()
446+
runner.Lifecycle.Add(new()
447+
{
448+
Status = RunnerStatus.DeletionQueued,
449+
Event = "Don't queue deletion due to Github registration. Runner already queued for deletion.",
450+
EventTimeUtc = DateTime.UtcNow
451+
});
452+
await db.SaveChangesAsync();
453+
454+
}
455+
else if ((runner.LastState >= RunnerStatus.Provisioned && DateTime.UtcNow - runner.LastStateTime > TimeSpan.FromMinutes(5)) ||
456+
(runner.LastState != RunnerStatus.Processing && DateTime.UtcNow - htzSrv.Created.ToUniversalTime() > TimeSpan.FromMinutes(40)))
465457
{
466-
RunnerDbId = runner.RunnerId,
467-
ServerId = htzSrv.Id
468-
});
469-
458+
_logger.LogInformation($"Removing VM that is not in any GitHub registration: {htzSrv.Name} created at {htzSrv.Created:u}");
459+
runner.IsOnline = false;
460+
runner.Lifecycle.Add(new()
461+
{
462+
Status = RunnerStatus.DeletionQueued,
463+
Event = "Removing as VM not longer in any GitHub registration",
464+
EventTimeUtc = DateTime.UtcNow
465+
});
466+
await db.SaveChangesAsync();
467+
_queues.DeleteTasks.Enqueue(new()
468+
{
469+
RunnerDbId = runner.RunnerId,
470+
ServerId = htzSrv.Id
471+
});
472+
473+
}
474+
470475
}
471-
472476
}
473-
477+
catch (Exception ex)
478+
{
479+
_logger.LogError($"Failed during cleanup from CSP: {ex.Message}");
480+
}
481+
474482
}
475483

476484
private async Task<bool> DeleteRunner(DeleteRunnerTask rt)
@@ -564,13 +572,14 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
564572
Event = $"Unable to create runner [{runner.Size} on {runner.Arch} | Retry: {rt.RetryCount}]: {ex.Message}"
565573
});
566574
rt.RetryCount += 1;
567-
if (rt.RetryCount < 3)
575+
// Don't retry stuck job runners - the stuck job detector will create retry servers
576+
if (rt.RetryCount < 3 && !runner.StuckJobReplacement)
568577
{
569578
_queues.CreateTasks.Enqueue(rt);
570579
}
571580
else
572581
{
573-
_logger.LogError($"Retries exceeded for {runner.Size} on {runner.Arch}. giving up.");
582+
_logger.LogError(runner.StuckJobReplacement ? $"Retries exceeded for {runner.Size} on {runner.Arch}. giving up. (Stuck job replacement)" : $"Retries exceeded for {runner.Size} on {runner.Arch}. giving up.");
574583
runner.Lifecycle.Add(new RunnerLifecycle
575584
{
576585
Status = RunnerStatus.Failure,

Program.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public static void Main(string[] args)
7676
Log.Error($"Hetzner cloud token not set in {configPath}");
7777
return;
7878
}
79+
7980

8081
Log.Information($"Loaded {Config.TargetConfigs.Count} targets and {Config.Sizes.Count} sizes.");
8182

@@ -111,6 +112,7 @@ public static void Main(string[] args)
111112
builder.Services.AddHostedService<PoolManager>();
112113

113114
// Add services to the container.
115+
114116

115117
// Learn more about configuring Swagger/OpenAPI at https://aka.ms/aspnetcore/swashbuckle
116118
builder.Services.AddCors(options =>
@@ -490,6 +492,7 @@ await db.Jobs.AddAsync(new Job
490492

491493
double secondsAlive = (DateTime.UtcNow - jobRunner.CreateTime).TotalSeconds;
492494
TotalMachineTime.Labels(job.Owner, jobRunner.Size).Inc(secondsAlive);
495+
493496
}
494497

495498
private static async Task JobInProgress(JsonElement workflowJson, ILogger<Program> logger, long jobId,

Properties/launchSettings.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
"applicationUrl": "http://localhost:5178",
1818
"environmentVariables": {
1919
"ASPNETCORE_ENVIRONMENT": "Development",
20-
"CONFIG_DIR": "/Users/markuskeil/dev/tmp",
21-
"PERSIST_DIR": "/Users/markuskeil/dev/tmp"
20+
"CONFIG_DIR": "/Users/markus/dev/tmp",
21+
"PERSIST_DIR": "/Users/markus/dev/tmp"
2222
}
2323
},
2424
"https": {

0 commit comments

Comments
 (0)