Skip to content

Commit 8633870

Browse files
committed
Parametrise MaxHealthCheckRetries for containerd
Signed-off-by: Antonio Murdaca <runcom@redhat.com>
1 parent a278c97 commit 8633870

3 files changed

Lines changed: 51 additions & 30 deletions

File tree

cmd/dockerd/daemon_unix.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ func (cli *DaemonCli) getPlatformRemoteOptions() []libcontainerd.RemoteOption {
6262
opts := []libcontainerd.RemoteOption{
6363
libcontainerd.WithDebugLog(cli.Config.Debug),
6464
libcontainerd.WithOOMScore(cli.Config.OOMScoreAdjust),
65+
libcontainerd.WithMaxHealthCheckRetries(cli.Config.MaxHealthCheckRetries),
6566
}
6667
if cli.Config.ContainerdAddr != "" {
6768
opts = append(opts, libcontainerd.WithRemoteAddr(cli.Config.ContainerdAddr))

daemon/config_unix.go

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,19 @@ type Config struct {
2626
CommonUnixConfig
2727

2828
// Fields below here are platform specific.
29-
CgroupParent string `json:"cgroup-parent,omitempty"`
30-
EnableSelinuxSupport bool `json:"selinux-enabled,omitempty"`
31-
RemappedRoot string `json:"userns-remap,omitempty"`
32-
Ulimits map[string]*units.Ulimit `json:"default-ulimits,omitempty"`
33-
CPURealtimePeriod int64 `json:"cpu-rt-period,omitempty"`
34-
CPURealtimeRuntime int64 `json:"cpu-rt-runtime,omitempty"`
35-
OOMScoreAdjust int `json:"oom-score-adjust,omitempty"`
36-
Init bool `json:"init,omitempty"`
37-
InitPath string `json:"init-path,omitempty"`
38-
SeccompProfile string `json:"seccomp-profile,omitempty"`
39-
SigCheck bool `json:"signature-verification"`
40-
EnableSecrets bool `json:"enable-secrets"`
29+
CgroupParent string `json:"cgroup-parent,omitempty"`
30+
EnableSelinuxSupport bool `json:"selinux-enabled,omitempty"`
31+
RemappedRoot string `json:"userns-remap,omitempty"`
32+
Ulimits map[string]*units.Ulimit `json:"default-ulimits,omitempty"`
33+
CPURealtimePeriod int64 `json:"cpu-rt-period,omitempty"`
34+
CPURealtimeRuntime int64 `json:"cpu-rt-runtime,omitempty"`
35+
OOMScoreAdjust int `json:"oom-score-adjust,omitempty"`
36+
MaxHealthCheckRetries int `json:"max-health-check-retries,omitempty"`
37+
Init bool `json:"init,omitempty"`
38+
InitPath string `json:"init-path,omitempty"`
39+
SeccompProfile string `json:"seccomp-profile,omitempty"`
40+
SigCheck bool `json:"signature-verification"`
41+
EnableSecrets bool `json:"enable-secrets"`
4142
}
4243

4344
// bridgeConfig stores all the bridge driver specific
@@ -86,6 +87,7 @@ func (config *Config) InstallFlags(flags *pflag.FlagSet) {
8687
flags.StringVar(&config.ContainerdAddr, "containerd", "", "Path to containerd socket")
8788
flags.BoolVar(&config.LiveRestoreEnabled, "live-restore", false, "Enable live restore of docker when containers are still running")
8889
flags.IntVar(&config.OOMScoreAdjust, "oom-score-adjust", -500, "Set the oom_score_adj for the daemon")
90+
flags.IntVar(&config.MaxHealthCheckRetries, "max-health-check-retries", 3, "Set the maximum number of health check retries before forced containerd restart")
8991
flags.BoolVar(&config.Init, "init", false, "Run an init in the container to forward signals and reap processes")
9092
flags.StringVar(&config.InitPath, "init-path", "", "Path to the docker-init binary")
9193
flags.Int64Var(&config.CPURealtimePeriod, "cpu-rt-period", 0, "Limit the CPU real-time period in microseconds")

libcontainerd/remote_unix.go

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import (
3333
)
3434

3535
const (
36-
maxConnectionRetryCount = 3
3736
containerdHealthCheckTimeout = 3 * time.Second
3837
containerdShutdownTimeout = 15 * time.Second
3938
containerdBinary = "docker-containerd"
@@ -45,22 +44,23 @@ const (
4544

4645
type remote struct {
4746
sync.RWMutex
48-
apiClient containerd.APIClient
49-
daemonPid int
50-
stateDir string
51-
rpcAddr string
52-
startDaemon bool
53-
closeManually bool
54-
debugLog bool
55-
rpcConn *grpc.ClientConn
56-
clients []*client
57-
eventTsPath string
58-
runtime string
59-
runtimeArgs []string
60-
daemonWaitCh chan struct{}
61-
liveRestore bool
62-
oomScore int
63-
restoreFromTimestamp *timestamp.Timestamp
47+
apiClient containerd.APIClient
48+
daemonPid int
49+
stateDir string
50+
rpcAddr string
51+
startDaemon bool
52+
closeManually bool
53+
debugLog bool
54+
rpcConn *grpc.ClientConn
55+
clients []*client
56+
eventTsPath string
57+
runtime string
58+
runtimeArgs []string
59+
daemonWaitCh chan struct{}
60+
liveRestore bool
61+
oomScore int
62+
maxHealthCheckRetries int
63+
restoreFromTimestamp *timestamp.Timestamp
6464
}
6565

6666
// New creates a fresh instance of libcontainerd remote.
@@ -139,6 +139,8 @@ func (r *remote) UpdateOptions(options ...RemoteOption) error {
139139
func (r *remote) handleConnectionChange() {
140140
var transientFailureCount = 0
141141

142+
logrus.Debugf("libcontainerd: maximum number of retries for containerd health check is %d", r.maxHealthCheckRetries)
143+
142144
ticker := time.NewTicker(500 * time.Millisecond)
143145
defer ticker.Stop()
144146
healthClient := grpc_health_v1.NewHealthClient(r.rpcConn)
@@ -162,7 +164,7 @@ func (r *remote) handleConnectionChange() {
162164
// all other errors are transient
163165
// Reset state to be notified of next failure
164166
transientFailureCount++
165-
if transientFailureCount >= maxConnectionRetryCount {
167+
if transientFailureCount >= r.maxHealthCheckRetries {
166168
transientFailureCount = 0
167169
if utils.IsProcessAlive(r.daemonPid) {
168170
logrus.Infof("killing and restarting containerd")
@@ -553,3 +555,19 @@ func (o oomScore) Apply(r Remote) error {
553555
}
554556
return fmt.Errorf("WithOOMScore option not supported for this remote")
555557
}
558+
559+
// WithMaxHealthCheckRetries defines the maximum number of consecutive failed 'HealthCheckRequest'
560+
// before handleConnectionChange() forcibly kills and restarts containerd.
561+
func WithMaxHealthCheckRetries(cnt int) RemoteOption {
562+
return retryCnt(cnt)
563+
}
564+
565+
type retryCnt int
566+
567+
func (cnt retryCnt) Apply(r Remote) error {
568+
if remote, ok := r.(*remote); ok {
569+
remote.maxHealthCheckRetries = int(cnt)
570+
return nil
571+
}
572+
return fmt.Errorf("WithMaxHealthCheckRetries option not supported for this remote")
573+
}

0 commit comments

Comments
 (0)