Commit 1a4a9b2

Operator operator-lifecycle-manager-packageserver is in Available=False state when running in SNO
Update generated mock: fix import alias for apps/v1.

Remove the goimports_vendorlesspath overlay to fix a counterfeiter conflict.
The hack/overlays/goimports_vendorlesspath.go overlay was causing conflicts
with counterfeiter mock generation because the vendored
golang.org/x/tools/imports now includes the VendorlessPath function. The
overlay is no longer needed, since the function is available in the vendored
dependency.

Fixes: counterfeiter errors from "make verify"
1 parent c1c1696 commit 1a4a9b2
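
For context, the overlay being removed worked through the go command's -overlay flag, which accepts a JSON file mapping source paths to replacement files. The removed Makefile recipe (see the Makefile diff below) generated a file of this shape, with <repo> standing in for $(CURDIR):

{
  "Replace": {
    "<repo>/vendor/golang.org/x/tools/imports/vendorlesspath.go": "<repo>/hack/overlays/goimports_vendorlesspath.go"
  }
}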

File tree

5 files changed: +300 −33 lines


Makefile

Lines changed: 1 addition & 5 deletions
@@ -340,11 +340,7 @@ mockgen: #HELP Generate mocks
 	# Consider using counterfeiter:generate directives to speed things up.
 	# See https://github.com/maxbrunsfeld/counterfeiter#step-2b---add-counterfeitergenerate-directives for more information.
 	# Set the "COUNTERFEITER_NO_GENERATE_WARNING" environment variable to suppress this message.
-	@set -e; \
-	overlay_file=$$(mktemp "$(CURDIR)/hack/overlays/goimports_overlay.XXXXXX.json"); \
-	trap 'rm -f "$$overlay_file"' EXIT; \
-	printf '{\n "Replace": {\n "%s/vendor/golang.org/x/tools/imports/vendorlesspath.go": "%s/hack/overlays/goimports_vendorlesspath.go"\n }\n}\n' "$(CURDIR)" "$(CURDIR)" > "$$overlay_file"; \
-	GO111MODULE=on GOWORK=off COUNTERFEITER_NO_GENERATE_WARNING=1 GOFLAGS="$$GOFLAGS -overlay=$$overlay_file" go generate ./pkg/...
+	GO111MODULE=on GOWORK=off COUNTERFEITER_NO_GENERATE_WARNING=1 go generate ./pkg/...
 
 #SECTION Verification
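
With the overlay gone, mock generation relies on counterfeiter's own directive mechanism, which the surviving comment recommends. Per the counterfeiter README linked above, a typical setup pairs one go:generate line per package with a counterfeiter:generate directive per interface; MyInterface is a hypothetical name, not one from this repo:

//go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 -generate

//counterfeiter:generate . MyInterface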

hack/overlays/goimports_vendorlesspath.go

Lines changed: 0 additions & 14 deletions
This file was deleted.

pkg/controller/operators/olm/apiservices.go

Lines changed: 75 additions & 13 deletions
@@ -148,7 +148,9 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
 	if serviceAccountName == "" {
 		serviceAccountName = "default"
 	}
-	_, err = a.opClient.KubernetesInterface().CoreV1().ServiceAccounts(deployment.GetNamespace()).Get(context.TODO(), serviceAccountName, metav1.GetOptions{})
+	// Use lister instead of direct API call to avoid timeout issues during SNO upgrades
+	// when the kube-apiserver is temporarily unavailable
+	_, err = a.lister.CoreV1().ServiceAccountLister().ServiceAccounts(deployment.GetNamespace()).Get(serviceAccountName)
 	if err != nil {
 		logger.WithError(err).WithField("serviceaccount", serviceAccountName).Warnf("could not retrieve ServiceAccount")
 		errs = append(errs, err)
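
The change above swaps a direct GET against the kube-apiserver for a read from the informer-backed lister cache. As a rough sketch of the mechanism (plain client-go wiring assumed, not OLM's actual lister plumbing):

package example

import (
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	listersv1 "k8s.io/client-go/listers/core/v1"
)

// newServiceAccountLister builds an informer-backed lister. Once the cache
// has synced, Get calls are served locally and keep working during brief
// kube-apiserver outages, unlike a direct API Get.
func newServiceAccountLister(client kubernetes.Interface, stopCh <-chan struct{}) listersv1.ServiceAccountLister {
	factory := informers.NewSharedInformerFactory(client, 30*time.Minute) // illustrative resync
	lister := factory.Core().V1().ServiceAccounts().Lister()

	factory.Start(stopCh)            // begin watching the API server
	factory.WaitForCacheSync(stopCh) // block until the local cache is warm

	// From here on, lister.ServiceAccounts(ns).Get(name) reads cache only.
	return lister
}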
@@ -213,9 +215,23 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 	}
 
 	// Check if deployment is being updated or rolling out
-	if deployment.Status.UnavailableReplicas > 0 ||
-		deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
-		a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+	// This includes several scenarios:
+	// 1. UnavailableReplicas > 0: Some replicas are not ready
+	// 2. UpdatedReplicas < Replicas: Rollout in progress
+	// 3. Generation != ObservedGeneration: Spec changed but not yet observed
+	// 4. AvailableReplicas < desired: Not all replicas are available yet
+	isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+		deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+		deployment.Generation != deployment.Status.ObservedGeneration ||
+		deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+
+	if isRollingOut {
+		a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
+			deploymentSpec.Name,
+			deployment.Status.UnavailableReplicas,
+			deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+			deployment.Status.AvailableReplicas, *deployment.Spec.Replicas,
+			deployment.Status.ObservedGeneration, deployment.Generation)
 
 		// Check pod status to confirm disruption
 		selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
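
Pulled out of context, the new predicate amounts to a helper like the one below (hypothetical name). One caveat worth noting: Spec.Replicas is a *int32 that defaults to 1 when unset, and the diff dereferences it directly, so the sketch adds a defensive default:

package example

import appsv1 "k8s.io/api/apps/v1"

// isDeploymentRollingOut reports whether any of the four status signals
// disagrees with the desired state (hypothetical standalone version).
func isDeploymentRollingOut(d *appsv1.Deployment) bool {
	desired := int32(1) // Spec.Replicas defaults to 1 when nil
	if d.Spec.Replicas != nil {
		desired = *d.Spec.Replicas
	}
	return d.Status.UnavailableReplicas > 0 || // some replicas not ready
		d.Status.UpdatedReplicas < d.Status.Replicas || // rollout in progress
		d.Generation != d.Status.ObservedGeneration || // spec change not yet observed
		d.Status.AvailableReplicas < desired // not all replicas available
}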
@@ -230,6 +246,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 			continue
 		}
 
+		// For single-replica deployments during rollout, if no pods exist yet,
+		// this is likely the brief window where the old pod is gone and new pod
+		// hasn't been created yet. This is expected disruption during upgrade.
+		// According to the OpenShift contract: "A component must not report Available=False
+		// during the course of a normal upgrade."
+		if len(pods) == 0 && *deployment.Spec.Replicas == 1 && isRollingOut {
+			a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
+			return true
+		}
+
+		// Track if we found any real failures or expected disruptions
+		foundExpectedDisruption := false
+		foundRealFailure := false
+
 		// Check if any pod is in expected disruption state
 		for _, pod := range pods {
 			// Check how long the pod has been in disrupted state
@@ -244,7 +274,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 			// Pod is terminating (DeletionTimestamp is set)
 			if pod.DeletionTimestamp != nil {
 				a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
-				return true
+				foundExpectedDisruption = true
+				continue
 			}
 
 			// For pending pods, we need to distinguish between expected disruption
@@ -257,11 +288,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 			// Check pod conditions for scheduling issues
 			for _, condition := range pod.Status.Conditions {
 				if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
-					// If pod has been unschedulable for a while, it's likely a real issue
-					// not a temporary disruption from cluster upgrade
 					if condition.Reason == "Unschedulable" {
-						isRealFailure = true
-						a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+						// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
+						// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
+						// This is especially common in single-node clusters or control plane scenarios.
+						// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
+						if *deployment.Spec.Replicas == 1 && isRollingOut {
+							a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
+							isExpectedDisruption = true
+						} else {
+							// Multi-replica or non-rollout Unschedulable is a real issue
+							isRealFailure = true
+							foundRealFailure = true
+							a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+						}
 						break
 					}
 				}
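
The PodAntiAffinity conflict described in the comments looks roughly like this in a Deployment's pod template (the label value is assumed for illustration): with required anti-affinity on the hostname topology key and only one node, the replacement pod cannot schedule until the terminating pod is fully gone.

package example

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Hypothetical pod-template affinity illustrating the single-node deadlock:
// a second copy matching its own labels can never co-exist on the lone node.
var antiAffinity = &corev1.Affinity{
	PodAntiAffinity: &corev1.PodAntiAffinity{
		RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{{
			TopologyKey: "kubernetes.io/hostname",
			LabelSelector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": "packageserver"}, // assumed label
			},
		}},
	},
}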
@@ -278,6 +318,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 					// These are real failures, not temporary disruptions
 					isRealFailure = true
+					foundRealFailure = true
 					a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
 				}
 			}
@@ -292,6 +333,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 					isExpectedDisruption = true
 				case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 					isRealFailure = true
+					foundRealFailure = true
 					a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
 				}
 			}
@@ -302,17 +344,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				continue
 			}
 
-			// If it's in expected disruption state, return true
+			// If it's in expected disruption state, mark it
 			if isExpectedDisruption {
 				a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
-				return true
+				foundExpectedDisruption = true
+				continue
 			}
 
 			// If pending without clear container status, check if it's just being scheduled
 			// This could be normal pod creation during node drain
 			if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
 				a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
-				return true
+				foundExpectedDisruption = true
+				continue
 			}
 		}
 
@@ -323,14 +367,32 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				switch reason {
 				case "ContainerCreating", "PodInitializing":
 					a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
-					return true
+					foundExpectedDisruption = true
 				case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
 					// Real failures - don't treat as disruption
 					a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+					foundRealFailure = true
 				}
 			}
 		}
 	}
+
+	// After checking all pods, make a decision
+	// If we found expected disruption and no real failures, treat as disruption
+	if foundExpectedDisruption && !foundRealFailure {
+		a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
+		return true
+	}
+
+	// For single-replica deployments during rollout, if we found no real failures,
+	// treat as expected disruption to comply with the OpenShift contract:
+	// "A component must not report Available=False during the course of a normal upgrade."
+	// Single-replica deployments inherently have unavailability during rollout,
+	// but this is acceptable and should not trigger Available=False.
+	if !foundRealFailure && *deployment.Spec.Replicas == 1 && isRollingOut {
+		a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
+		return true
+	}
 	}
 }
 
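
The net effect: expected disruptions are accumulated across all pods instead of returned eagerly, and any real failure vetoes them. A table-driven test for the rollout predicate sketched earlier might look like this (names and cases illustrative):

package example

import (
	"testing"

	appsv1 "k8s.io/api/apps/v1"
)

func TestIsDeploymentRollingOut(t *testing.T) {
	one := int32(1)
	cases := []struct {
		name string
		d    appsv1.Deployment
		want bool
	}{
		{
			name: "steady state: all replicas updated and available",
			d: appsv1.Deployment{
				Spec:   appsv1.DeploymentSpec{Replicas: &one},
				Status: appsv1.DeploymentStatus{Replicas: 1, UpdatedReplicas: 1, AvailableReplicas: 1},
			},
			want: false,
		},
		{
			name: "rollout: one replica unavailable",
			d: appsv1.Deployment{
				Spec:   appsv1.DeploymentSpec{Replicas: &one},
				Status: appsv1.DeploymentStatus{Replicas: 1, UpdatedReplicas: 1, UnavailableReplicas: 1},
			},
			want: true,
		},
	}
	for _, tc := range cases {
		if got := isDeploymentRollingOut(&tc.d); got != tc.want {
			t.Errorf("%s: got %v, want %v", tc.name, got, tc.want)
		}
	}
}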
