Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
bdb5d95
make maxNodeStartupTime configurable
lxuan94-pp Sep 15, 2025
83cd97c
Add Unit Tests
lxuan94-pp Sep 16, 2025
dd06ef4
fix: deprecated import of cacheddiscovery
gsixo Nov 19, 2025
543e339
chore: rework Scaleway cloudprovider integration
pablo-ruth Sep 18, 2025
62dd8bc
fix gofmt test
pablo-ruth Nov 10, 2025
2f9d5b4
fix: deprecated import of cacheddiscovery
gsixo Nov 20, 2025
799aabf
Merge pull request #8782 from pablo-ruth/upstream-scaleway-rework
k8s-ci-robot Nov 20, 2025
f2a54b6
Adjust OWNERS so that only API changes need api review
adrianmoisey Nov 21, 2025
5d19bdf
Merge pull request #8845 from adrianmoisey/adjust-owners-for-api-changes
k8s-ci-robot Nov 21, 2025
818639e
fix: vpa_recommender_vpa_objects_count initialization (#8750)
cmtly Nov 21, 2025
82018c9
Merge pull request #8829 from gsixo/fix-of-deprecated-import-in-pod-a…
k8s-ci-robot Nov 21, 2025
5e7f7a1
Merge pull request #8543 from lxuan94-pp/xualiliu/oci-maxNodeStartupTime
k8s-ci-robot Nov 21, 2025
7b95cb0
Merge pull request #8833 from gsixo/lala
k8s-ci-robot Nov 21, 2025
5873c7f
Add Intel GPU (Habana Gaudi) autoscaler support
DorWeinstock Nov 24, 2025
cc49907
Refactor GPU allocatable detection into reusable function
DorWeinstock Nov 25, 2025
ffcbfee
Merge pull request #8853 from DorWeinstock/add-intel-gaudi-support
k8s-ci-robot Nov 26, 2025
97e45c5
Adding metrics for latency of removal for unneeded/ unready nodes
ttetyanka Aug 28, 2025
fb2899a
Merge pull request #8485 from ttetyanka/feature/deletionlatencytracker
k8s-ci-robot Dec 1, 2025
6d42554
fix: drop remove node label and fix decrease size func
nickstern2002 Dec 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion balancer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (
balancerclientset "k8s.io/autoscaler/balancer/pkg/client/clientset/versioned"
balancerinformers "k8s.io/autoscaler/balancer/pkg/client/informers/externalversions"
"k8s.io/autoscaler/balancer/pkg/controller"
cacheddiscovery "k8s.io/client-go/discovery/cached"
cacheddiscovery "k8s.io/client-go/discovery/cached/memory"
"k8s.io/client-go/dynamic"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
Expand Down
297 changes: 149 additions & 148 deletions cluster-autoscaler/FAQ.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/clusterapi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,8 @@ metadata:
cluster.x-k8s.io/autoscaling-options-scaledownunreadytime: "20m0s"
# overrides --max-node-provision-time global value for that specific MachineDeployment
cluster.x-k8s.io/autoscaling-options-maxnodeprovisiontime: "20m0s"
# overrides --max-node-startup-time global value for that specific MachineDeployment
cluster.x-k8s.io/autoscaling-options-maxnodestartuptime: "20m0s"
```

#### CPU Architecture awareness for single-arch clusters
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,9 @@ func (ng *nodegroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*c
if opt, ok := getDurationOption(options, ng.Id(), config.DefaultMaxNodeProvisionTimeKey); ok {
defaults.MaxNodeProvisionTime = opt
}
if opt, ok := getDurationOption(options, ng.Id(), config.DefaultMaxNodeStartupTimeKey); ok {
defaults.MaxNodeStartupTime = opt
}

return &defaults, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1768,6 +1768,7 @@ func TestNodeGroupGetOptions(t *testing.T) {
ScaleDownUnneededTime: time.Second,
ScaleDownUnreadyTime: time.Minute,
MaxNodeProvisionTime: 15 * time.Minute,
MaxNodeStartupTime: 35 * time.Minute,
}

cases := []struct {
Expand All @@ -1788,13 +1789,15 @@ func TestNodeGroupGetOptions(t *testing.T) {
config.DefaultScaleDownUnneededTimeKey: "1h",
config.DefaultScaleDownUnreadyTimeKey: "30m",
config.DefaultMaxNodeProvisionTimeKey: "60m",
config.DefaultMaxNodeStartupTimeKey: "35m",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownGpuUtilizationThreshold: 0.6,
ScaleDownUtilizationThreshold: 0.7,
ScaleDownUnneededTime: time.Hour,
ScaleDownUnreadyTime: 30 * time.Minute,
MaxNodeProvisionTime: 60 * time.Minute,
MaxNodeStartupTime: 35 * time.Minute,
},
},
{
Expand All @@ -1809,6 +1812,7 @@ func TestNodeGroupGetOptions(t *testing.T) {
ScaleDownUnneededTime: time.Minute,
ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime,
MaxNodeProvisionTime: 15 * time.Minute,
MaxNodeStartupTime: 35 * time.Minute,
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ package coreweave

import (
"fmt"
"sync"

apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/klog/v2"
"sync"
)

// CoreWeaveNodeGroup represents a node group in the CoreWeave cloud provider.
Expand Down Expand Up @@ -84,13 +84,6 @@ func (ng *CoreWeaveNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
if err != nil {
return fmt.Errorf("some nodes do not belong to node group %s: %v", ng.Name, err)
}
// If we reach here, it means we can delete the nodes
for _, node := range nodes {
// Mark the node for removal
if err := ng.nodepool.MarkNodeForRemoval(node); err != nil {
return fmt.Errorf("failed to mark node %s for removal: %v", node.Name, err)
}
}
//update target size
if err := ng.nodepool.SetSize(ng.nodepool.GetTargetSize() - len(nodes)); err != nil {
return fmt.Errorf("failed to update target size after marking nodes for removal: %v", err)
Expand All @@ -107,6 +100,9 @@ func (ng *CoreWeaveNodeGroup) ForceDeleteNodes(nodes []*apiv1.Node) error {
// DecreaseTargetSize lowers the node group's target size by the magnitude of delta.
// The sign of delta is ignored: a negative value is negated before being
// subtracted, so passing 2 and -2 both reduce the target size by two.
func (ng *CoreWeaveNodeGroup) DecreaseTargetSize(delta int) error {
	klog.V(4).Infof("Decreasing target size of node group %s by %d", ng.Name, delta)

	// Normalize to a non-negative amount so the subtraction below can only shrink the size.
	reduction := delta
	if reduction < 0 {
		reduction = -reduction
	}

	current := ng.nodepool.GetTargetSize()
	return ng.nodepool.SetSize(current - reduction)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"context"
"testing"

"github.com/stretchr/testify/require"

apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
Expand Down Expand Up @@ -109,18 +111,92 @@ func TestIncreaseSize(t *testing.T) {
}

func TestDeleteNodes(t *testing.T) {
ng := makeTestNodeGroup("ng-1", "uid-1", 0, 5, 3)
validNode := &apiv1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
initialTargetSize := int64(3)

testCases := map[string]struct {
nodesToDelete []*apiv1.Node
expectedTargetSize int
expectedError error
}{
"reduce-target-size-by-one-node": {
nodesToDelete: []*apiv1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
},
},
},
expectedTargetSize: 2,
},
"reduce-target-size-by-three-node": {
nodesToDelete: []*apiv1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node2",
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node3",
Labels: map[string]string{coreWeaveNodePoolUID: "uid-1"},
},
},
},
expectedTargetSize: 0,
},
}
nodes := []*apiv1.Node{
validNode,

for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
ng := makeTestNodeGroup("ng-1", "uid-1", 0, 5, initialTargetSize)

err := ng.DeleteNodes(tc.nodesToDelete)
if tc.expectedError != nil {
require.Equal(t, tc.expectedError, err)
return
}
require.NoError(t, err)
require.Equal(t, ng.nodepool.GetTargetSize(), tc.expectedTargetSize)
})
}
err := ng.DeleteNodes(nodes)
if err != nil && err != cloudprovider.ErrNotImplemented {
t.Errorf("expected ErrNotImplemented or nil, got %v", err)
}

// TestDecreaseTargetSize checks that DecreaseTargetSize lowers the target size by
// the magnitude of delta, whether delta is supplied as a positive or a negative
// number. Every case starts from a fresh node group with a target size of 3.
func TestDecreaseTargetSize(t *testing.T) {
	type scenario struct {
		delta    int
		wantSize int
		wantErr  error
	}

	scenarios := map[string]scenario{
		"positive-delta": {delta: 2, wantSize: 1},
		"negative-delta": {delta: -2, wantSize: 1},
	}

	for label, sc := range scenarios {
		t.Run(label, func(t *testing.T) {
			group := makeTestNodeGroup("ng-1", "uid-1", 1, 5, 3)

			got := group.DecreaseTargetSize(sc.delta)
			if sc.wantErr == nil {
				// Success path: the call must succeed and leave the expected size behind.
				require.NoError(t, got)
				require.Equal(t, sc.wantSize, group.nodepool.GetTargetSize())
				return
			}

			// Failure path: the exact expected error must come back.
			require.Error(t, got)
			require.Equal(t, sc.wantErr, got)
		})
	}
}
36 changes: 0 additions & 36 deletions cluster-autoscaler/cloudprovider/coreweave/coreweave_nodepool.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,42 +187,6 @@ func (np *CoreWeaveNodePool) SetSize(size int) error {
return nil
}

// MarkNodeForRemoval marks a node for removal from the node pool by setting the
// coreWeaveRemoveNode label to "true" on the node object fetched through the client.
//
// It returns an error when node is nil or has an empty name, when the node cannot
// be fetched or updated, or when the fetched node's coreWeaveNodePoolUID label does
// not match this pool's UID. If the fetched node already carries the removal
// label, nothing is written and nil is returned. Errors from the client are
// wrapped with %w so callers can inspect them with errors.Is / errors.As.
func (np *CoreWeaveNodePool) MarkNodeForRemoval(node *apiv1.Node) error {
	ctx, cancel := GetCoreWeaveContext()
	defer cancel()
	if node == nil {
		return fmt.Errorf("node cannot be nil")
	}
	if node.Name == "" {
		return fmt.Errorf("node name cannot be empty")
	}
	// Log the node being marked for removal
	klog.V(4).Infof("Marking node %s for removal from node pool %s", node.Name, np.GetName())
	// Fetch the current node object; the label checks and the update below
	// operate on this fetched copy, not on the caller's argument.
	currentNode, err := np.client.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("failed to get node %s: %w", node.Name, err)
	}
	// Check if the node belongs to this node pool. The explicit nil check also
	// guarantees Labels is a non-nil map before it is written to further down.
	if currentNode.Labels == nil || currentNode.Labels[coreWeaveNodePoolUID] != np.GetUID() {
		return fmt.Errorf("node %s does not belong to node pool %s", node.Name, np.GetName())
	}
	// Check if the node is already marked for removal; Labels is known to be
	// non-nil here, so no second nil check is needed.
	if currentNode.Labels[coreWeaveRemoveNode] == "true" {
		klog.V(4).Infof("Node %s is already marked for removal", currentNode.Name)
		return nil // Node is already marked for removal, no action needed
	}
	// Set the label to indicate the node should be removed
	currentNode.Labels[coreWeaveRemoveNode] = "true"
	// Update the node using the client
	if _, err = np.client.CoreV1().Nodes().Update(ctx, currentNode, metav1.UpdateOptions{}); err != nil {
		return fmt.Errorf("failed to mark node %s for removal: %w", node.Name, err)
	}
	return nil
}

// ValidateNodes checks if the provided nodes belong to the node pool.
func (np *CoreWeaveNodePool) ValidateNodes(nodes []*apiv1.Node) error {
if len(nodes) == 0 {
Expand Down
Loading