Skip to content

Commit 2960312

Browse files
matthchrBryce-Soghigiancharliedmcb
authored
fix: Don't block PATCH VM if billing extension is not installed (#1155)
There are currently cases where a node can join the cluster successfully but not have the billing extension installed. This blocks identity propagation and tags propagation. We tolerate a missing billing extension since this is currently possible. Co-authored-by: Bryce Soghigian <[email protected]> Co-authored-by: Charlie McBride <[email protected]>
1 parent 7c3307f commit 2960312

File tree

3 files changed

+63
-0
lines changed

3 files changed

+63
-0
lines changed

pkg/controllers/nodeclaim/inplaceupdate/controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *karpv1.NodeClaim)
112112
if err != nil {
113113
return reconcile.Result{}, client.IgnoreNotFound(err)
114114
}
115+
log.FromContext(ctx).V(1).Info("successfully saved new in-place update hash", "goalHash", goalHash)
115116

116117
return reconcile.Result{}, nil
117118
}

pkg/providers/instance/instance.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"errors"
2222
"fmt"
2323
"math"
24+
"net/http"
2425
"sort"
2526
"time"
2627

@@ -236,6 +237,15 @@ func (p *DefaultProvider) Update(ctx context.Context, vmName string, update armc
236237
nil,
237238
)
238239
if err != nil {
240+
// TODO: This is a bit of a hack based on how this Update function is currently used.
241+
// Currently this function will not be called by any callers until a claim has been Registered, which means that the CSE had to have succeeded.
242+
// The aksIdentifyingExtensionName is not currently guaranteed to be on the VM though, as Karpenter could have failed over during the initial VM create
243+
// after CSE but before aksIdentifyingExtensionName. So, for now, we just ignore NotFound errors for the aksIdentifyingExtensionName.
244+
azErr := sdkerrors.IsResponseError(err)
245+
if extName == aksIdentifyingExtensionName && azErr != nil && azErr.StatusCode == http.StatusNotFound {
246+
log.FromContext(ctx).V(0).Info("extension not found when updating tags", "extensionName", extName, "vmName", vmName)
247+
continue
248+
}
239249
return fmt.Errorf("updating VM extension %q for VM %q: %w", extName, vmName, err)
240250
}
241251
pollers[extName] = poller

pkg/providers/instance/suite_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package instance_test
1919
import (
2020
"context"
2121
"fmt"
22+
"net/http"
2223
"strings"
2324
"testing"
2425
"time"
@@ -41,6 +42,7 @@ import (
4142
"sigs.k8s.io/karpenter/pkg/test/v1alpha1"
4243
. "sigs.k8s.io/karpenter/pkg/utils/testing"
4344

45+
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
4446
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
4547
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork"
4648
"github.com/Azure/karpenter-provider-azure/pkg/apis"
@@ -570,5 +572,55 @@ var _ = Describe("InstanceProvider", func() {
570572
"test-tag": lo.ToPtr("test-value"),
571573
})
572574
})
575+
576+
It("should ignore NotFound errors for computeAksLinuxBilling extension update", func() {
577+
// Ensure that the VM already exists in the fake environment
578+
vmName := nodeClaim.Name
579+
vm := armcompute.VirtualMachine{
580+
ID: lo.ToPtr(fake.MkVMID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName)),
581+
Name: lo.ToPtr(vmName),
582+
Tags: map[string]*string{
583+
"karpenter.azure.com_cluster": lo.ToPtr("test-cluster"),
584+
},
585+
}
586+
// Ensure that the NIC already exists in the fake environment
587+
azureEnv.VirtualMachinesAPI.Instances.Store(*vm.ID, vm)
588+
nic := armnetwork.Interface{
589+
ID: lo.ToPtr(fake.MakeNetworkInterfaceID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName)),
590+
Name: lo.ToPtr(vmName),
591+
Tags: map[string]*string{
592+
"karpenter.azure.com_cluster": lo.ToPtr("test-cluster"),
593+
},
594+
}
595+
azureEnv.NetworkInterfacesAPI.NetworkInterfaces.Store(*nic.ID, nic)
596+
597+
// Ensure that only one extension exists in the env
598+
cseExt := armcompute.VirtualMachineExtension{
599+
ID: lo.ToPtr(fake.MakeVMExtensionID(azureEnv.AzureResourceGraphAPI.ResourceGroup, vmName, "cse-agent-karpenter")),
600+
Name: lo.ToPtr("cse-agent-karpenter"),
601+
Tags: map[string]*string{
602+
"karpenter.azure.com_cluster": lo.ToPtr("test-cluster"),
603+
},
604+
}
605+
azureEnv.VirtualMachineExtensionsAPI.Extensions.Store(*cseExt.ID, cseExt)
606+
// TODO: This only works because this extension happens to be first in the list of extensions. If it were second it wouldn't work
607+
azureEnv.VirtualMachineExtensionsAPI.VirtualMachineExtensionsUpdateBehavior.BeginError.Set(&azcore.ResponseError{StatusCode: http.StatusNotFound}, fake.MaxCalls(1))
608+
609+
ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass)
610+
611+
// Update the VM tags
612+
err := azureEnv.InstanceProvider.Update(ctx, vmName, armcompute.VirtualMachineUpdate{
613+
Tags: map[string]*string{
614+
"karpenter.azure.com_cluster": lo.ToPtr("test-cluster"),
615+
"test-tag": lo.ToPtr("test-value"),
616+
},
617+
})
618+
Expect(err).ToNot(HaveOccurred())
619+
620+
ExpectInstanceResourcesHaveTags(ctx, vmName, azureEnv, map[string]*string{
621+
"karpenter.azure.com_cluster": lo.ToPtr("test-cluster"),
622+
"test-tag": lo.ToPtr("test-value"),
623+
})
624+
})
573625
})
574626
})

0 commit comments

Comments
 (0)