From f4999680f50eb481bf6089575f4269946ccfd167 Mon Sep 17 00:00:00 2001 From: Chai Bot Date: Mon, 29 Jun 2026 00:32:44 +0000 Subject: [PATCH] Make applyClusterTLSProfile() non-fatal to prevent crash-loops in HyperShift hosted clusters applyClusterTLSProfile() fetches the cluster APIServer CR on startup with a hard timeout. In HyperShift hosted clusters the API server may not be ready when packageserver starts, causing the fatal error path to crash-loop the container indefinitely. Make the error non-fatal: log a warning and continue with defaults instead of returning a fatal error. The Package Server Manager (PSM) will inject the correct TLS flags on the next reconciliation anyway, so crashing is unnecessary. Also reduce the lookup timeout from 30s to 10s so we don't stall startup when the API is genuinely unreachable. Validated in downstream openshift/operator-framework-olm PR #1333, where the payload job e2e-aws-ovn passed with this fix. Downstream tracking: https://redhat.atlassian.net/browse/TRT-2761 Co-Authored-By: Claude Opus 4.6 --- pkg/package-server/server/server.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/package-server/server/server.go b/pkg/package-server/server/server.go index a6e8a3216c..019aefe77b 100644 --- a/pkg/package-server/server/server.go +++ b/pkg/package-server/server/server.go @@ -227,7 +227,8 @@ func (o *PackageServerOptions) Run(ctx context.Context) error { // honours the cluster TLS security profile on first boot or during upgrades. if o.SecureServing.MinTLSVersion == "" { if err := applyClusterTLSProfile(ctx, clientConfig, o.SecureServing); err != nil { - return fmt.Errorf("failed to apply cluster TLS profile to serving options: %w", err) + log.WithError(err).Warn("Failed to apply cluster TLS profile to serving options, continuing with defaults. \n" + + "PSM will inject the correct TLS flags on next reconciliation.") } } @@ -348,7 +349,7 @@ func (op *Operator) syncOLMConfig(obj interface{}) error { // This is the fallback path used when --tls-min-version is not provided via flags // (i.e. before the PSM has had a chance to inject them). func applyClusterTLSProfile(ctx context.Context, config *rest.Config, serving *genericoptions.SecureServingOptionsWithLoopback) error { - const lookupTimeout = 30 * time.Second + const lookupTimeout = 10 * time.Second profileCtx, cancel := context.WithTimeout(ctx, lookupTimeout) defer cancel()