Commit c716ed0

Merge pull request #7547 from rapidsai/release/25.12
Forward-merge release/25.12 into main
2 parents 88a97e6 + fc06d08

File tree

11 files changed: +281 -255 lines changed

docs/source/cuml-accel/limitations.rst

Lines changed: 3 additions & 3 deletions

@@ -52,9 +52,9 @@ A few additional general notes:
   or categorical formats (e.g., using scikit-learn's LabelEncoder) prior to
   processing.

-- The accelerator is compatible with scikit-learn version 1.4 or higher. This
-  compatibility ensures that cuML's implementation of scikit-learn compatible
-  APIs works as expected.
+- The accelerator is tested to be compatible with scikit-learn versions 1.4
+  through 1.7. This ensures that cuML's implementation of scikit-learn
+  compatible APIs works as expected.

 - Error and warning messages and formats may differ from scikit-learn. Some
   errors might present as C++ stacktraces instead of python errors.

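The tightened upper bound can be checked at runtime the same way the code in this commit does. A minimal sketch (not part of the commit; the bounds simply mirror the 1.4-through-1.7 range stated in the docs above):

import sklearn
from packaging.version import Version

# Tested range per the docs: 1.4 <= version < 1.8 (1.8 pre-releases excluded).
LOWER = Version("1.4")
UPPER = Version("1.8.0.dev0")

def sklearn_version_supported() -> bool:
    return LOWER <= Version(sklearn.__version__) < UPPER

if not sklearn_version_supported():
    print(f"scikit-learn {sklearn.__version__} is outside the tested range")
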
python/cuml/cuml/accel/estimator_proxy.py

Lines changed: 19 additions & 4 deletions

@@ -7,13 +7,16 @@
 from typing import Any

 import sklearn
+from packaging.version import Version
 from sklearn.base import BaseEstimator, ClassNamePrefixFeaturesOutMixin
 from sklearn.utils._set_output import _wrap_data_with_container

 from cuml.accel import profilers
 from cuml.accel.core import logger
 from cuml.internals.interop import UnsupportedOnGPU, is_fitted

+SKLEARN_18 = Version(sklearn.__version__) >= Version("1.8.0.dev0")
+

 def is_proxy(instance_or_class) -> bool:
     """Check if an instance or class is a proxy object created by the accelerator."""
@@ -477,10 +480,22 @@ def get_metadata_routing(self):
     def _get_metadata_request(self):
         return self._cpu._get_metadata_request()

-    @classmethod
-    @functools.wraps(BaseEstimator._get_default_requests)
-    def _get_default_requests(cls):
-        return cls._cpu_class._get_default_requests()
+    if SKLEARN_18:
+
+        @classmethod
+        @functools.wraps(
+            BaseEstimator._get_class_level_metadata_request_values
+        )
+        def _get_class_level_metadata_request_values(cls, method):
+            return cls._cpu_class._get_class_level_metadata_request_values(
+                method
+            )
+    else:
+
+        @classmethod
+        @functools.wraps(BaseEstimator._get_default_requests)
+        def _get_default_requests(cls):
+            return cls._cpu_class._get_default_requests()

     @property
     def _metadata_request(self):

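The version gate above decides, while the class body executes, which metadata hook gets defined: per this diff, sklearn 1.8 replaces BaseEstimator._get_default_requests with _get_class_level_metadata_request_values, and the proxy delegates to whichever one the installed sklearn expects. The pattern can be shown standalone. A minimal sketch (illustrative only; Widget and its method bodies are hypothetical stand-ins for the proxy's delegation to _cpu_class):

import sklearn
from packaging.version import Version

SKLEARN_18 = Version(sklearn.__version__) >= Version("1.8.0.dev0")


class Widget:
    # The if/else runs at class-creation time, so only one of the two
    # classmethods ever exists on the finished class.
    if SKLEARN_18:

        @classmethod
        def request_values(cls, method):
            return f"new-style request values for {method!r}"

    else:

        @classmethod
        def default_requests(cls):
            return "old-style default requests"
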
python/cuml/cuml/linear_model/logistic_regression.py

Lines changed: 33 additions & 5 deletions

@@ -4,6 +4,8 @@
 #
 import cupy as cp
 import numpy as np
+import sklearn
+from packaging.version import Version

 import cuml.internals
 from cuml.common.array_descriptor import CumlArrayDescriptor
@@ -25,6 +27,8 @@
 from cuml.linear_model.base import LinearClassifierMixin
 from cuml.solvers.qn import fit_qn

+SKLEARN_18 = Version(sklearn.__version__) >= Version("1.8.0.dev0")
+

 class LogisticRegression(
     Base,
@@ -174,27 +178,51 @@ def _params_from_cpu(cls, model):
         ):
             raise UnsupportedOnGPU("`multi_class` is not supported")

+        penalty = model.penalty
+        l1_ratio = model.l1_ratio
+
+        # `penalty` was deprecated in sklearn 1.8 and will be removed in 1.10
+        if penalty == "deprecated":
+            if l1_ratio in (None, 0):
+                penalty = "l2"
+                l1_ratio = None
+            else:
+                penalty = "elasticnet"
+
         return {
-            "penalty": model.penalty,
+            "penalty": penalty,
+            "l1_ratio": l1_ratio,
             "tol": model.tol,
             "C": model.C,
             "fit_intercept": model.fit_intercept,
             "class_weight": model.class_weight,
             "max_iter": model.max_iter,
-            "l1_ratio": model.l1_ratio,
             "solver": "qn",
         }

     def _params_to_cpu(self):
+        # `penalty` was deprecated in sklearn 1.8 and will be removed in 1.10
+        if SKLEARN_18:
+            extra = {
+                "l1_ratio": {"l1": 1.0, "l2": 0.0, None: 0.0}.get(
+                    self.l1_ratio
+                ),
+                "C": np.inf if self.penalty is None else self.C,
+            }
+        else:
+            extra = {
+                "penalty": self.penalty,
+                "l1_ratio": self.l1_ratio,
+                "C": self.C,
+            }
+
         return {
-            "penalty": self.penalty,
             "tol": self.tol,
-            "C": self.C,
             "fit_intercept": self.fit_intercept,
             "class_weight": self.class_weight,
             "max_iter": self.max_iter,
-            "l1_ratio": self.l1_ratio,
             "solver": "lbfgs" if self.penalty in ("l2", None) else "saga",
+            **extra,
         }

     def _attrs_from_cpu(self, model):

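The deprecated-penalty handling in _params_from_cpu above can be exercised on its own. A minimal sketch (not part of the commit; resolve_penalty is a hypothetical helper that applies the same mapping rules as the diff):

# sklearn >= 1.8 reports penalty as the sentinel "deprecated"; recover an
# explicit penalty from l1_ratio using the rules shown above.
def resolve_penalty(penalty, l1_ratio):
    if penalty == "deprecated":
        if l1_ratio in (None, 0):
            return "l2", None  # no L1 component means plain L2
        return "elasticnet", l1_ratio  # any L1 mix is treated as elastic net
    return penalty, l1_ratio


assert resolve_penalty("deprecated", None) == ("l2", None)
assert resolve_penalty("deprecated", 0.5) == ("elasticnet", 0.5)
assert resolve_penalty("l1", 1.0) == ("l1", 1.0)
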
python/cuml/cuml_accel_tests/integration/test_elastic_net.py

Lines changed: 0 additions & 15 deletions

@@ -181,18 +181,3 @@ def test_elasticnet_positive(regression_data):
     assert np.all(model.coef_ >= 0), (
         "All coefficients should be non-negative when positive=True"
     )
-
-
-def test_elasticnet_warm_start(regression_data):
-    X, y = regression_data
-    model = ElasticNet(warm_start=True, random_state=42)
-    model.fit(X, y)
-    coef_old = model.coef_.copy()
-    # Fit again with more iterations
-    model.set_params(max_iter=2000)
-    model.fit(X, y)
-    coef_new = model.coef_
-    # Coefficients should change after more iterations
-    assert not np.allclose(coef_old, coef_new), (
-        "Coefficients should update when warm_start=True"
-    )

python/cuml/cuml_accel_tests/integration/test_hdbscan_core.py renamed to python/cuml/cuml_accel_tests/integration/test_hdbscan.py

Lines changed: 184 additions & 0 deletions

@@ -6,9 +6,17 @@
 import hdbscan
 import numpy as np
 import pytest
+import sklearn
+from hdbscan import prediction, validity
+from packaging.version import Version
 from sklearn.datasets import make_blobs, make_moons
 from sklearn.preprocessing import StandardScaler

+if Version(sklearn.__version__) >= Version("1.8.0.dev0"):
+    pytest.skip(
+        "hdbscan requires sklearn < 1.8.0.dev0", allow_module_level=True
+    )
+

 @pytest.fixture(scope="module")
 def synthetic_data():
@@ -307,3 +315,179 @@ def test_hdbscan_allow_single_cluster(synthetic_data):
     assert n_clusters >= 1, (
         "Should allow a single cluster when allow_single_cluster=True"
     )
+
+
+def test_hdbscan_approximate_predict(synthetic_data):
+    X_train, _ = synthetic_data
+    X_test, _ = make_blobs(
+        n_samples=100,
+        n_features=2,
+        centers=5,
+        cluster_std=0.5,
+        random_state=24,
+    )
+    X_test = StandardScaler().fit_transform(X_test)
+    clusterer = hdbscan.HDBSCAN(prediction_data=True)
+    clusterer.fit(X_train)
+    test_labels, strengths = hdbscan.approximate_predict(clusterer, X_test)
+    # Check that labels are assigned to test data
+    assert len(test_labels) == X_test.shape[0], (
+        "Labels should be assigned to test data points"
+    )
+    assert len(strengths) == X_test.shape[0], (
+        "Strengths should be computed for test data points"
+    )
+    # Check that strengths are between 0 and 1
+    assert np.all((strengths >= 0) & (strengths <= 1)), (
+        "Strengths should be between 0 and 1"
+    )
+
+
+def test_hdbscan_membership_vector(synthetic_data):
+    X_train, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN(prediction_data=True)
+    clusterer.fit(X_train)
+    point = X_train[0].reshape((1, 2))
+    hdbscan.membership_vector(clusterer, point)
+
+
+def test_hdbscan_all_points_membership_vectors(synthetic_data):
+    X_train, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN(prediction_data=True)
+    clusterer.fit(X_train)
+    memberships = hdbscan.all_points_membership_vectors(clusterer)
+    # Check that the number of membership vectors matches the number of samples
+    assert len(memberships) == X_train.shape[0], (
+        "There should be a membership vector for each sample"
+    )
+    # Check that each membership vector sums to 1
+    for membership in memberships:
+        # Check that all probabilities are between 0 and 1
+        assert all(0.0 <= v <= 1.0 for v in membership), (
+            "Probabilities should be between 0 and 1"
+        )
+
+
+def test_hdbscan_validity_index(synthetic_data):
+    X, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN()
+    clusterer.fit(X)
+    score = validity.validity_index(X, clusterer.labels_, metric="euclidean")
+    # Check that the validity index is a finite number
+    assert np.isfinite(score), "Validity index should be a finite number"
+
+
+def test_hdbscan_condensed_tree(synthetic_data):
+    X, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN()
+    clusterer.fit(X)
+    condensed_tree = clusterer.condensed_tree_
+    # Check that the condensed tree has the expected attributes
+    assert hasattr(condensed_tree, "to_pandas"), (
+        "Condensed tree should have a 'to_pandas' method"
+    )
+    # Convert to pandas DataFrame and check columns
+    condensed_tree.to_pandas()
+
+
+def test_hdbscan_single_linkage_tree_attribute(synthetic_data):
+    X, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN()
+    clusterer.fit(X)
+    single_linkage_tree = clusterer.single_linkage_tree_
+    # Check that the single linkage tree has the expected attributes
+    assert hasattr(single_linkage_tree, "to_numpy"), (
+        "Single linkage tree should have a 'to_numpy' method"
+    )
+    # Convert to NumPy array and check shape
+    sl_tree_array = single_linkage_tree.to_numpy()
+    assert sl_tree_array.shape[1] == 4, (
+        "Single linkage tree array should have 4 columns"
+    )
+
+
+def test_hdbscan_flat_clustering(synthetic_data):
+    X, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN()
+    clusterer.fit(X)
+    # Extract clusters at a specific cluster_selection_epsilon
+    clusterer_flat = hdbscan.HDBSCAN(cluster_selection_epsilon=0.1)
+    clusterer_flat.fit(X)
+    # Check that clusters are formed
+    n_clusters_flat = len(set(clusterer_flat.labels_)) - (
+        1 if -1 in clusterer_flat.labels_ else 0
+    )
+    assert n_clusters_flat > 0, "Should find clusters with flat clustering"
+
+
+def test_hdbscan_prediction_membership_vector(synthetic_data):
+    X_train, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN(prediction_data=True)
+    clusterer.fit(X_train)
+    point = X_train[0].reshape((1, 2))
+    prediction.membership_vector(clusterer, point)
+
+
+def test_hdbscan_prediction_all_points_membership_vectors(synthetic_data):
+    X_train, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN(prediction_data=True)
+    clusterer.fit(X_train)
+    memberships = prediction.all_points_membership_vectors(clusterer)
+    # Check that the number of membership vectors matches the number of samples
+    assert len(memberships) == X_train.shape[0], (
+        "There should be a membership vector for each sample"
+    )
+    for membership in memberships:
+        # Check that all probabilities are between 0 and 1
+        assert all(0.0 <= v <= 1.0 for v in membership), (
+            "Probabilities should be between 0 and 1"
+        )
+
+
+def test_hdbscan_outlier_exposure(synthetic_data):
+    # Note: hdbscan may not have a function named 'outlier_exposure'
+    # This is a placeholder for any outlier detection functionality
+    X, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN()
+    clusterer.fit(X)
+    # Check if outlier scores are computed
+    if hasattr(clusterer, "outlier_scores_"):
+        outlier_scores = clusterer.outlier_scores_
+        # Check that outlier scores are finite numbers
+        assert np.all(np.isfinite(outlier_scores)), (
+            "Outlier scores should be finite numbers"
+        )
+    else:
+        pytest.skip(
+            "Outlier exposure functionality is not available in this version of HDBSCAN"
+        )
+
+
+# test requires networkx
+# def test_hdbscan_extract_single_linkage_tree(synthetic_data):
+#     X, _ = synthetic_data
+#     clusterer = hdbscan.HDBSCAN()
+#     clusterer.fit(X)
+#     # Extract the single linkage tree
+#     sl_tree = clusterer.single_linkage_tree_.to_networkx()
+#     # Check that the tree has the correct number of nodes
+#     assert sl_tree.number_of_nodes() == X.shape[0], "Single linkage tree should have a node for each data point"
+
+
+def test_hdbscan_get_exemplars(synthetic_data):
+    X, _ = synthetic_data
+    clusterer = hdbscan.HDBSCAN()
+    clusterer.fit(X)
+    if hasattr(clusterer, "exemplars_"):
+        exemplars = clusterer.exemplars_
+        # Check that exemplars are available for each cluster
+        n_clusters = len(set(clusterer.labels_)) - (
+            1 if -1 in clusterer.labels_ else 0
+        )
+        assert len(exemplars) == n_clusters, (
+            "There should be exemplars for each cluster"
+        )
+    else:
+        pytest.skip(
+            "Exemplar functionality is not available in this version of HDBSCAN"
+        )

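Several of the new tests repeat the same idiom for counting clusters while excluding HDBSCAN's noise label of -1. A minimal sketch factoring it into a helper (not part of the commit; n_clusters_found is a hypothetical name):

import numpy as np

def n_clusters_found(labels) -> int:
    # HDBSCAN assigns -1 to noise points; exclude it from the cluster count.
    unique = set(np.asarray(labels).tolist())
    return len(unique) - (1 if -1 in unique else 0)

assert n_clusters_found([0, 0, 1, -1, 2]) == 3
assert n_clusters_found([-1, -1]) == 0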