Skip to content

Commit ac281ca

Browse files
Merge pull request #7 from coreweave/ns/drop-remove-node-label
fix: stop using remove node label
2 parents c8aa786 + 6d42554 commit ac281ca

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+3539
-1016
lines changed

balancer/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import (
2626
balancerclientset "k8s.io/autoscaler/balancer/pkg/client/clientset/versioned"
2727
balancerinformers "k8s.io/autoscaler/balancer/pkg/client/informers/externalversions"
2828
"k8s.io/autoscaler/balancer/pkg/controller"
29-
cacheddiscovery "k8s.io/client-go/discovery/cached"
29+
cacheddiscovery "k8s.io/client-go/discovery/cached/memory"
3030
"k8s.io/client-go/dynamic"
3131
kubeinformers "k8s.io/client-go/informers"
3232
"k8s.io/client-go/kubernetes"

cluster-autoscaler/FAQ.md

Lines changed: 149 additions & 148 deletions
Large diffs are not rendered by default.

cluster-autoscaler/cloudprovider/clusterapi/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,8 @@ metadata:
326326
cluster.x-k8s.io/autoscaling-options-scaledownunreadytime: "20m0s"
327327
# overrides --max-node-provision-time global value for that specific MachineDeployment
328328
cluster.x-k8s.io/autoscaling-options-maxnodeprovisiontime: "20m0s"
329+
# overrides --max-node-startup-time global value for that specific MachineDeployment
330+
cluster.x-k8s.io/autoscaling-options-maxnodestartuptime: "20m0s"
329331
```
330332

331333
#### CPU Architecture awareness for single-arch clusters

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,9 @@ func (ng *nodegroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*c
470470
if opt, ok := getDurationOption(options, ng.Id(), config.DefaultMaxNodeProvisionTimeKey); ok {
471471
defaults.MaxNodeProvisionTime = opt
472472
}
473+
if opt, ok := getDurationOption(options, ng.Id(), config.DefaultMaxNodeStartupTimeKey); ok {
474+
defaults.MaxNodeStartupTime = opt
475+
}
473476

474477
return &defaults, nil
475478
}

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1768,6 +1768,7 @@ func TestNodeGroupGetOptions(t *testing.T) {
17681768
ScaleDownUnneededTime: time.Second,
17691769
ScaleDownUnreadyTime: time.Minute,
17701770
MaxNodeProvisionTime: 15 * time.Minute,
1771+
MaxNodeStartupTime: 35 * time.Minute,
17711772
}
17721773

17731774
cases := []struct {
@@ -1788,13 +1789,15 @@ func TestNodeGroupGetOptions(t *testing.T) {
17881789
config.DefaultScaleDownUnneededTimeKey: "1h",
17891790
config.DefaultScaleDownUnreadyTimeKey: "30m",
17901791
config.DefaultMaxNodeProvisionTimeKey: "60m",
1792+
config.DefaultMaxNodeStartupTimeKey: "35m",
17911793
},
17921794
expected: &config.NodeGroupAutoscalingOptions{
17931795
ScaleDownGpuUtilizationThreshold: 0.6,
17941796
ScaleDownUtilizationThreshold: 0.7,
17951797
ScaleDownUnneededTime: time.Hour,
17961798
ScaleDownUnreadyTime: 30 * time.Minute,
17971799
MaxNodeProvisionTime: 60 * time.Minute,
1800+
MaxNodeStartupTime: 35 * time.Minute,
17981801
},
17991802
},
18001803
{
@@ -1809,6 +1812,7 @@ func TestNodeGroupGetOptions(t *testing.T) {
18091812
ScaleDownUnneededTime: time.Minute,
18101813
ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime,
18111814
MaxNodeProvisionTime: 15 * time.Minute,
1815+
MaxNodeStartupTime: 35 * time.Minute,
18121816
},
18131817
},
18141818
{

cluster-autoscaler/cloudprovider/coreweave/coreweave_nodegroup.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ package coreweave
1818

1919
import (
2020
"fmt"
21+
"sync"
2122

2223
apiv1 "k8s.io/api/core/v1"
2324
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
2425
"k8s.io/autoscaler/cluster-autoscaler/config"
2526
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
2627
"k8s.io/klog/v2"
27-
"sync"
2828
)
2929

3030
// CoreWeaveNodeGroup represents a node group in the CoreWeave cloud provider.
@@ -84,13 +84,6 @@ func (ng *CoreWeaveNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
8484
if err != nil {
8585
return fmt.Errorf("some nodes do not belong to node group %s: %v", ng.Name, err)
8686
}
87-
// If we reach here, it means we can delete the nodes
88-
for _, node := range nodes {
89-
// Mark the node for removal
90-
if err := ng.nodepool.MarkNodeForRemoval(node); err != nil {
91-
return fmt.Errorf("failed to mark node %s for removal: %v", node.Name, err)
92-
}
93-
}
9487
//update target size
9588
if err := ng.nodepool.SetSize(ng.nodepool.GetTargetSize() - len(nodes)); err != nil {
9689
return fmt.Errorf("failed to update target size after marking nodes for removal: %v", err)
@@ -107,6 +100,9 @@ func (ng *CoreWeaveNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
107100
// DecreaseTargetSize decreases the target size of the node group by the specified delta.
108101
func (ng *CoreWeaveNodeGroup) DecreaseTargetSize(delta int) error {
109102
klog.V(4).Infof("Decreasing target size of node group %s by %d", ng.Name, delta)
103+
if delta < 0 {
104+
delta = -delta
105+
}
110106
return ng.nodepool.SetSize(ng.nodepool.GetTargetSize() - delta)
111107
}
112108

cluster-autoscaler/cloudprovider/coreweave/coreweave_nodegroup_test.go

Lines changed: 86 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
"context"
2121
"testing"
2222

23+
"github.com/stretchr/testify/require"
24+
2325
apiv1 "k8s.io/api/core/v1"
2426
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2527
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -109,18 +111,92 @@ func TestIncreaseSize(t *testing.T) {
109111
}
110112

111113
func TestDeleteNodes(t *testing.T) {
112-
ng := makeTestNodeGroup("ng-1", "uid-1", 0, 5, 3)
113-
validNode := &apiv1.Node{
114-
ObjectMeta: metav1.ObjectMeta{
115-
Name: "node1",
116-
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
114+
initialTargetSize := int64(3)
115+
116+
testCases := map[string]struct {
117+
nodesToDelete []*apiv1.Node
118+
expectedTargetSize int
119+
expectedError error
120+
}{
121+
"reduce-target-size-by-one-node": {
122+
nodesToDelete: []*apiv1.Node{
123+
{
124+
ObjectMeta: metav1.ObjectMeta{
125+
Name: "node1",
126+
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
127+
},
128+
},
129+
},
130+
expectedTargetSize: 2,
131+
},
132+
"reduce-target-size-by-three-node": {
133+
nodesToDelete: []*apiv1.Node{
134+
{
135+
ObjectMeta: metav1.ObjectMeta{
136+
Name: "node1",
137+
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
138+
},
139+
},
140+
{
141+
ObjectMeta: metav1.ObjectMeta{
142+
Name: "node2",
143+
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
144+
},
145+
},
146+
{
147+
ObjectMeta: metav1.ObjectMeta{
148+
Name: "node3",
149+
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
150+
},
151+
},
152+
},
153+
expectedTargetSize: 0,
117154
},
118155
}
119-
nodes := []*apiv1.Node{
120-
validNode,
156+
157+
for name, tc := range testCases {
158+
t.Run(name, func(t *testing.T) {
159+
ng := makeTestNodeGroup("ng-1", "uid-1", 0, 5, initialTargetSize)
160+
161+
err := ng.DeleteNodes(tc.nodesToDelete)
162+
if tc.expectedError != nil {
163+
require.Equal(t, tc.expectedError, err)
164+
return
165+
}
166+
require.NoError(t, err)
167+
require.Equal(t, ng.nodepool.GetTargetSize(), tc.expectedTargetSize)
168+
})
121169
}
122-
err := ng.DeleteNodes(nodes)
123-
if err != nil && err != cloudprovider.ErrNotImplemented {
124-
t.Errorf("expected ErrNotImplemented or nil, got %v", err)
170+
}
171+
172+
func TestDecreaseTargetSize(t *testing.T) {
173+
testCases := map[string]struct {
174+
delta int
175+
expectedTargetSize int
176+
expectedError error
177+
}{
178+
"positive-delta": {
179+
delta: 2,
180+
expectedTargetSize: 1,
181+
},
182+
"negative-delta": {
183+
delta: -2,
184+
expectedTargetSize: 1,
185+
},
186+
}
187+
188+
for name, tc := range testCases {
189+
t.Run(name, func(t *testing.T) {
190+
ng := makeTestNodeGroup("ng-1", "uid-1", 1, 5, 3)
191+
192+
err := ng.DecreaseTargetSize(tc.delta)
193+
if tc.expectedError != nil {
194+
require.Error(t, err)
195+
require.Equal(t, tc.expectedError, err)
196+
return
197+
}
198+
require.NoError(t, err)
199+
require.Equal(t, tc.expectedTargetSize, ng.nodepool.GetTargetSize())
200+
})
125201
}
126202
}

cluster-autoscaler/cloudprovider/coreweave/coreweave_nodepool.go

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -187,42 +187,6 @@ func (np *CoreWeaveNodePool) SetSize(size int) error {
187187
return nil
188188
}
189189

190-
// MarkNodeForRemoval marks a node for removal from the node pool.
191-
func (np *CoreWeaveNodePool) MarkNodeForRemoval(node *apiv1.Node) error {
192-
ctx, cancel := GetCoreWeaveContext()
193-
defer cancel()
194-
if node == nil {
195-
return fmt.Errorf("node cannot be nil")
196-
}
197-
if node.Name == "" {
198-
return fmt.Errorf("node name cannot be empty")
199-
}
200-
// Log the node being marked for removal
201-
klog.V(4).Infof("Marking node %s for removal from node pool %s", node.Name, np.GetName())
202-
// Fetch the current node object
203-
currentNode, err := np.client.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{})
204-
if err != nil {
205-
return fmt.Errorf("failed to get node %s: %v", node.Name, err)
206-
}
207-
// Check if the node belongs to this node pool
208-
if currentNode.Labels == nil || currentNode.Labels[coreWeaveNodePoolUID] != np.GetUID() {
209-
return fmt.Errorf("node %s does not belong to node pool %s", node.Name, np.GetName())
210-
}
211-
// Check if the node is already marked for removal
212-
if currentNode.Labels != nil && currentNode.Labels[coreWeaveRemoveNode] == "true" {
213-
klog.V(4).Infof("Node %s is already marked for removal", currentNode.Name)
214-
return nil // Node is already marked for removal, no action needed
215-
}
216-
// Set the label to indicate the node should be removed
217-
currentNode.Labels[coreWeaveRemoveNode] = "true"
218-
// Update the node using the client
219-
_, err = np.client.CoreV1().Nodes().Update(ctx, currentNode, metav1.UpdateOptions{})
220-
if err != nil {
221-
return fmt.Errorf("failed to mark node %s for removal: %v", node.Name, err)
222-
}
223-
return nil
224-
}
225-
226190
// ValidateNodes checks if the provided nodes belong to the node pool.
227191
func (np *CoreWeaveNodePool) ValidateNodes(nodes []*apiv1.Node) error {
228192
if len(nodes) == 0 {

0 commit comments

Comments
 (0)