Skip to content

Commit b3ac8c7

Browse files
committed
feat: rework AKS machine/VM name handling
1 parent dccad3f commit b3ac8c7

File tree

9 files changed

+311
-70
lines changed

9 files changed

+311
-70
lines changed

pkg/cloudprovider/cloudprovider.go

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -358,36 +358,26 @@ func (c *CloudProvider) Get(ctx context.Context, providerID string) (*karpv1.Nod
358358
}
359359
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("vmName", vmName))
360360

361-
aksMachinesPoolName := options.FromContext(ctx).AKSMachinesPoolName
362-
if aksMachineName, err := instance.GetAKSMachineNameFromVMName(aksMachinesPoolName, vmName); err == nil {
363-
// This could be an AKS machine-based node; try getting the AKS machine instance
361+
if aksMachineName, err := instance.GetAKSMachineNameFromVMName(options.FromContext(ctx).AKSMachinesPoolName, vmName); err == nil {
362+
// AKS machine-based node
364363
ctx := log.IntoContext(ctx, log.FromContext(ctx).WithValues("aksMachineName", aksMachineName))
365364

366365
aksMachine, err := c.aksMachineInstanceProvider.Get(ctx, aksMachineName)
367-
if err == nil {
368-
nodeClaim, err := c.resolveNodeClaimFromAKSMachine(ctx, aksMachine)
369-
if err != nil {
370-
return nil, fmt.Errorf("converting AKS machine instance to node claim, %w", err)
371-
}
372-
return nodeClaim, nil
373-
} else if cloudprovider.IgnoreNodeClaimNotFoundError(err) != nil {
366+
if err != nil {
374367
return nil, fmt.Errorf("getting AKS machine instance, %w", err)
375368
}
376-
// Fallback to legacy VM-based node if not found
377-
// In the case that it is indeed AKS machine node, but somehow fail GET AKS machine and succeeded GET VM, ideally we want this call to fail.
378-
// However, being misrepresented only once is not fatal. "Illegal" in-place update will be reconciled back to the before, and there is no drift for VM nodes that won't happen with AKS machine nodes + DriftAction.
379-
// So, we could live with this for now.
380-
}
381-
382-
vm, err := c.vmInstanceProvider.Get(ctx, vmName)
383-
if err != nil {
384-
return nil, fmt.Errorf("getting VM instance, %w", err)
385-
}
386-
instanceType, err := c.resolveInstanceTypeFromVMInstance(ctx, vm)
387-
if err != nil {
388-
return nil, fmt.Errorf("resolving instance type, %w", err)
369+
return c.resolveNodeClaimFromAKSMachine(ctx, aksMachine)
370+
} else {
371+
vm, err := c.vmInstanceProvider.Get(ctx, vmName)
372+
if err != nil {
373+
return nil, fmt.Errorf("getting VM instance, %w", err)
374+
}
375+
instanceType, err := c.resolveInstanceTypeFromVMInstance(ctx, vm)
376+
if err != nil {
377+
return nil, fmt.Errorf("resolving instance type, %w", err)
378+
}
379+
return c.vmInstanceToNodeClaim(ctx, vm, instanceType)
389380
}
390-
return c.vmInstanceToNodeClaim(ctx, vm, instanceType)
391381
}
392382

393383
func (c *CloudProvider) LivenessProbe(req *http.Request) error {

pkg/cloudprovider/suite_aksmachineapi_integration_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ var _ = Describe("CloudProvider", func() {
161161
Expect(err).To(HaveOccurred())
162162
Expect(corecloudprovider.IsNodeClaimNotFoundError(err)).To(BeTrue())
163163
Expect(azureEnv.AKSMachinesAPI.AKSMachineGetBehavior.CalledWithInput.Len()).To(Equal(1))
164-
Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(1)) // Should be bothered as AKS machine is not found, so suspect this to be a VM
164+
Expect(azureEnv.VirtualMachinesAPI.VirtualMachineGetBehavior.CalledWithInput.Len()).To(Equal(0)) // Should not be bothered
165165
Expect(nodeClaim).To(BeNil())
166166
})
167167

pkg/fake/aksmachinesapi.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -506,9 +506,9 @@ func (c *AKSMachinesAPI) setDefaultMachineValues(machine *armcontainerservice.Ma
506506
}
507507

508508
// Set ResourceID - simulates VM resource ID
509-
// vmName = aks-<machinesPoolName>-<aksMachineName>-########-vm#
509+
// vmName = aks-<machinesPoolName>-<aksMachineName>-########-vm
510510
if machine.Properties.ResourceID == nil {
511-
vmName := fmt.Sprintf("aks-%s-%s-12345678-vm0", agentPoolName, *machine.Name)
511+
vmName := fmt.Sprintf("aks-%s-%s-12345678-vm", agentPoolName, *machine.Name)
512512
vmResourceID := fmt.Sprintf("/subscriptions/subscriptionID/resourceGroups/test-resourceGroup/providers/Microsoft.Compute/virtualMachines/%s", vmName)
513513
machine.Properties.ResourceID = lo.ToPtr(vmResourceID)
514514
}

pkg/providers/instance/aksmachineinstance.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"github.com/Azure/karpenter-provider-azure/pkg/providers/imagefamily"
3636
"github.com/Azure/karpenter-provider-azure/pkg/providers/instance/offerings"
3737
"github.com/Azure/karpenter-provider-azure/pkg/providers/instancetype"
38+
"github.com/Azure/karpenter-provider-azure/pkg/providers/launchtemplate"
3839
"github.com/Azure/karpenter-provider-azure/pkg/utils"
3940
)
4041

@@ -174,7 +175,10 @@ func (p *DefaultAKSMachineProvider) BeginCreate(
174175
nodeClaim *karpv1.NodeClaim,
175176
instanceTypes []*corecloudprovider.InstanceType,
176177
) (*AKSMachinePromise, error) {
177-
aksMachineName := GetAKSMachineNameFromNodeClaimName(nodeClaim.Name)
178+
aksMachineName, err := GetAKSMachineNameFromNodeClaimName(nodeClaim.Name)
179+
if err != nil {
180+
return nil, fmt.Errorf("failed to generate AKS machine name from NodeClaim name %q: %w", nodeClaim.Name, err)
181+
}
178182
instanceTypes = offerings.OrderInstanceTypesByPrice(instanceTypes, scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...))
179183

180184
aksMachinePromise, err := p.beginCreateMachine(ctx, nodeClass, nodeClaim, instanceTypes, aksMachineName)
@@ -376,7 +380,7 @@ func (p *DefaultAKSMachineProvider) beginCreateMachine(
376380
existingAKSMachine, err := p.getMachine(ctx, aksMachineName)
377381
if err == nil {
378382
// Existing AKS machine found, reuse it.
379-
return p.reuseExistingMachine(ctx, aksMachineName, nodeClass, instanceTypes, existingAKSMachine)
383+
return p.reuseExistingMachine(ctx, aksMachineName, nodeClass, nodeClaim, instanceTypes, existingAKSMachine)
380384
} else if !IsAKSMachineOrMachinesPoolNotFound(err) {
381385
// Not fatal. Will fall back to normal creation.
382386
log.FromContext(ctx).Error(err, "failed to check for existing AKS machine", "aksMachineName", aksMachineName)
@@ -502,11 +506,17 @@ func (p *DefaultAKSMachineProvider) handleMachineProvisioningError(ctx context.C
502506
return fmt.Errorf("failed to create AKS machine %q during %s, unhandled provisioning error: code=%s, message=%s", aksMachineName, phase, lo.FromPtr(innerError.Code), lo.FromPtr(innerError.Message))
503507
}
504508

505-
func (p *DefaultAKSMachineProvider) reuseExistingMachine(ctx context.Context, aksMachineName string, nodeClass *v1beta1.AKSNodeClass, instanceTypes []*corecloudprovider.InstanceType, existingAKSMachine *armcontainerservice.Machine) (*AKSMachinePromise, error) {
509+
func (p *DefaultAKSMachineProvider) reuseExistingMachine(ctx context.Context, aksMachineName string, nodeClass *v1beta1.AKSNodeClass, nodeClaim *karpv1.NodeClaim, instanceTypes []*corecloudprovider.InstanceType, existingAKSMachine *armcontainerservice.Machine) (*AKSMachinePromise, error) {
506510
// Reconstruct properties from existing AKS machine instance.
507511
if err := validateRetrievedAKSMachineBasicProperties(existingAKSMachine); err != nil {
508512
return nil, fmt.Errorf("found existing AKS machine %s, but %w", aksMachineName, err)
509513
}
514+
if existingAKSMachine.Properties.Tags == nil || existingAKSMachine.Properties.Tags[launchtemplate.KarpenterAKSMachineNodeClaimTagKey] == nil {
515+
// This is not included in validateRetrievedAKSMachineBasicProperties as inplaceupdate can repair it.
516+
// Although, we don't want to reuse a machine until that happens.
517+
return nil, fmt.Errorf("found existing AKS machine %s, but %w", aksMachineName, fmt.Errorf("irretrievable karpenter.azure.com_aksmachine_nodeclaim tag"))
518+
}
519+
510520
var existingAKSMachineZone string
511521
if len(existingAKSMachine.Zones) == 0 || existingAKSMachine.Zones[0] == nil {
512522
existingAKSMachineZone = "" // No zone
@@ -518,11 +528,18 @@ func (p *DefaultAKSMachineProvider) reuseExistingMachine(ctx context.Context, ak
518528
existingAKSMachineVMResourceID := lo.FromPtr(existingAKSMachine.Properties.ResourceID)
519529
existingAKSMachineID := lo.FromPtr(existingAKSMachine.ID)
520530
existingAKSMachineNodeImageVersion := lo.FromPtr(existingAKSMachine.Properties.NodeImageVersion)
531+
existingAKSMachineNodeClaimName := lo.FromPtr(existingAKSMachine.Properties.Tags[launchtemplate.KarpenterAKSMachineNodeClaimTagKey])
521532

522533
instanceType := offerings.GetInstanceTypeFromVMSize(existingAKSMachineVMSize, instanceTypes)
523534
capacityType := GetCapacityTypeFromAKSScaleSetPriority(existingAKSMachinePriority)
524535
zone := utils.GetAKSLabelZoneFromARMZone(p.aksMachinesPoolLocation, existingAKSMachineZone)
525536

537+
if existingAKSMachineNodeClaimName != nodeClaim.Name {
538+
// Might be possible from NodePool name hash collision within AKS machine name
539+
// See how AKS machine name is generated for more details.
540+
// ASSUMPTION: repeated failure will eventually result in NodeClaim reaching registration TTL, then gets re-created with the new hash, recovering from the collision.
541+
return nil, fmt.Errorf("found existing AKS machine %s, but its karpenter.azure.com_aksmachine_nodeclaim tag %q does not match the NodeClaim to create %q", aksMachineName, existingAKSMachineNodeClaimName, nodeClaim.Name)
542+
}
526543
if existingAKSMachine.Properties.ProvisioningState != nil && lo.FromPtr(existingAKSMachine.Properties.ProvisioningState) == "Failed" {
527544
// Unfortunately, that was more like a remain than a usable aksMachine.
528545
// ASSUMPTION: this is irrecoverable (i.e., polling would have failed).

pkg/providers/instance/aksmachineinstanceutils.go

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@ func BuildNodeClaimFromAKSMachineTemplate(
100100
// By the time of writing, this is being used for logging purposes within provider only.
101101
// That is unlikely to change for core. But be mindful of provider is to rely on this in that situation. Still, rare.
102102
// This was less of a concern for VM instance as NodeClaim name is always inferrable from instance name.
103-
// XPMT: TODO(Bryce-Soghigian): see if you have better ideas. Otherwise just delete this TODO and live with this.
104103
nodeClaim.Name = *tag
105104
}
106105
nodeClaim.Labels = labels
@@ -182,17 +181,36 @@ func FindNodePoolFromAKSMachine(ctx context.Context, aksMachine *armcontainerser
182181
return nil, errors.NewNotFound(schema.GroupResource{Group: coreapis.Group, Resource: "nodepools"}, "")
183182
}
184183

185-
// XPMT: TODO(Bryce-Soghigian): rework this thing below? Also consider add acceptance tests on this in cloudprovider module, if applicable.
186-
// Real node name would be aks-<machinesPoolName>-<aksMachineName>-########-vm#. E.g., aks-aksmanagedap-default-2jf98-11274290-vm2.
187-
func GetAKSMachineNameFromNodeClaimName(nodeClaimName string) string {
188-
// ASSUMPTION: all AKS machines are named after the NodeClaim name.
189-
// Does not guarantee that the NodeClaim is already associated with an AKS machine.
190-
// This assumption is weaker than the one in GetAKSMachineNameFromNodeClaim(), but still, not breaking anytime soon.
191-
// return nodeClaimName
184+
// ASSUMPTION: NodeClaim name is in the format of <NodePool name>-<hash suffix>
185+
// If total length exceeds AKS machine name limit, the exceeded part will be replaced with another deterministic hash.
186+
// E.g., "thisisalongnodepoolname-a1b2c" --> "thisisalongnoz9y8x7-a1b2c"
187+
func GetAKSMachineNameFromNodeClaimName(nodeClaimName string) (string, error) {
188+
const maxAKSMachineNameLength = 35 // Defined by AKS machine API.
189+
const prefixHashLength = 6 // The length of the hashed part replacing the exceeded part of the prefix.
190+
// If 6, given alphanumeric hash, there will be a total of 36^6 = 2,176,782,336 combinations.
192191

193-
// TEMPORARY
192+
if len(nodeClaimName) <= maxAKSMachineNameLength {
193+
// Safe to use the whole name
194+
return nodeClaimName, nil
195+
}
194196
splitted := strings.Split(nodeClaimName, "-")
195-
return "x" + splitted[len(splitted)-1]
197+
// Combine the parts except the last one (NodeClaim hash suffix)
198+
prefix := strings.Join(splitted[:len(splitted)-1], "-")
199+
suffix := "-" + splitted[len(splitted)-1]
200+
201+
// Keep the legit part of the prefix intact, but hash the rest
202+
// ASSUMPTION: prefix length is at least 6 characters at this point (which means suffix length is not too large)
203+
// At the time of writing, suffix length is 6 (e.g., "-a1b2c"). This is unlikely to change.
204+
keepPrefixLength := maxAKSMachineNameLength - len(suffix) - prefixHashLength
205+
prefixToKeep := prefix[:keepPrefixLength]
206+
prefixToHash := prefix[keepPrefixLength:]
207+
hashedPrefixToHash, err := utils.GetAlphanumericHash(prefixToHash, 6)
208+
if err != nil {
209+
return "", fmt.Errorf("failed to hash exceeded AKS machine name prefix %q: %w", prefixToHash, err)
210+
}
211+
212+
hashTrimmedPrefix := prefixToKeep + hashedPrefixToHash
213+
return hashTrimmedPrefix + suffix, nil
196214
}
197215

198216
// GetAKSMachineNameFromNodeClaim extracts the AKS machine name from the NodeClaim annotations
@@ -212,10 +230,8 @@ func GetCapacityTypeFromAKSScaleSetPriority(scaleSetPriority armcontainerservice
212230
return AKSScaleSetPriorityToKarpCapacityType[scaleSetPriority]
213231
}
214232

215-
// vmName = aks-<machinesPoolName>-<aksMachineName>-########-vm#
216-
// Note: there is no accurate way to tell from the VM name that the VM is created by AKS machine API.
217-
// E.g., a non-AKS machine VM from Karpenter nodepool named "aksmanagedap-aks-nodepool-abcde-12345678" with suffix "vms12" would result in VM name "aks-aksmanagedap-aks-nodepool-abcde-12345678-vm12", which can be interpreted as a AKS machine with pool name "aksmanagedap" and name "aks-nodepool-abcde".
218-
// The validation below is, thus, best-effort.
233+
// vmName = aks-<machinesPoolName>-<aksMachineName>-########-vm
234+
// This is distinguishable from VM instance name as its suffix will always be 5 alphanumerics rather than "vm"
219235
func GetAKSMachineNameFromVMName(aksMachinesPoolName, vmName string) (string, error) {
220236
if !strings.HasPrefix(vmName, "aks-"+aksMachinesPoolName+"-") {
221237
return "", fmt.Errorf("vm name %s does not start with expected prefix aks-%s-", vmName, aksMachinesPoolName)
@@ -226,11 +242,11 @@ func GetAKSMachineNameFromVMName(aksMachinesPoolName, vmName string) (string, er
226242
if len(splitted) < 3 {
227243
return "", fmt.Errorf("vm name %s does not have enough parts after prefix aks-%s-", vmName, aksMachinesPoolName)
228244
}
229-
// Check whether the last part starts with "vm"
230-
if !strings.HasPrefix(splitted[len(splitted)-1], "vm") {
231-
return "", fmt.Errorf("vm name %s does not end with expected suffix vm#", vmName)
245+
// Check whether the last part is "vm"
246+
if splitted[len(splitted)-1] != "vm" {
247+
return "", fmt.Errorf("vm name %s does not end with expected suffix vm", vmName)
232248
}
233-
// Remove the last two parts (########-vm#) and join the rest to get the AKS machine name
249+
// Remove the last two parts (########-vm) and join the rest to get the AKS machine name
234250
aksMachineName := strings.Join(splitted[:len(splitted)-2], "-")
235251

236252
return aksMachineName, nil

0 commit comments

Comments
 (0)