|
6 | 6 | import hdbscan |
7 | 7 | import numpy as np |
8 | 8 | import pytest |
| 9 | +import sklearn |
| 10 | +from hdbscan import prediction, validity |
| 11 | +from packaging.version import Version |
9 | 12 | from sklearn.datasets import make_blobs, make_moons |
10 | 13 | from sklearn.preprocessing import StandardScaler |
11 | 14 |
|
# Module-level guard: hdbscan does not work with scikit-learn >= 1.8
# (per the skip reason below), so skip every test in this module up
# front instead of failing later at import/fit time.
if Version(sklearn.__version__) >= Version("1.8.0.dev0"):
    pytest.skip(
        "hdbscan requires sklearn < 1.8.0.dev0", allow_module_level=True
    )
| 19 | + |
12 | 20 |
|
13 | 21 | @pytest.fixture(scope="module") |
14 | 22 | def synthetic_data(): |
@@ -307,3 +315,179 @@ def test_hdbscan_allow_single_cluster(synthetic_data): |
307 | 315 | assert n_clusters >= 1, ( |
308 | 316 | "Should allow a single cluster when allow_single_cluster=True" |
309 | 317 | ) |
| 318 | + |
| 319 | + |
def test_hdbscan_approximate_predict(synthetic_data):
    """New points each get a cluster label and a membership strength in [0, 1]."""
    X_train, _ = synthetic_data
    # Fresh blobs with a different seed from the training fixture act as
    # unseen query data.
    X_test, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=5,
        cluster_std=0.5,
        random_state=24,
    )
    X_test = StandardScaler().fit_transform(X_test)
    # prediction_data=True is required for approximate_predict to work.
    clusterer = hdbscan.HDBSCAN(prediction_data=True).fit(X_train)
    test_labels, strengths = hdbscan.approximate_predict(clusterer, X_test)
    n_test = X_test.shape[0]
    # Exactly one label and one strength per query point.
    assert len(test_labels) == n_test, (
        "Labels should be assigned to test data points"
    )
    assert len(strengths) == n_test, (
        "Strengths should be computed for test data points"
    )
    # Strengths are probability-like scores, so they must lie in [0, 1].
    assert np.all((strengths >= 0) & (strengths <= 1)), (
        "Strengths should be between 0 and 1"
    )
| 344 | + |
| 345 | + |
def test_hdbscan_membership_vector(synthetic_data):
    """membership_vector produces one valid probability row for a single point.

    Fix: the original test called the API but asserted nothing, so it could
    only catch hard crashes. It now checks the result has one row for the
    single query point and that all entries are probabilities in [0, 1].
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    point = X_train[0].reshape((1, 2))
    membership = hdbscan.membership_vector(clusterer, point)
    # One membership row per query point; we passed exactly one.
    assert membership.shape[0] == 1, (
        "membership_vector should return one row per query point"
    )
    assert np.all((membership >= 0) & (membership <= 1)), (
        "Membership values should be between 0 and 1"
    )
| 352 | + |
| 353 | + |
def test_hdbscan_all_points_membership_vectors(synthetic_data):
    """all_points_membership_vectors yields one probability vector per sample.

    Fix: a previous comment claimed each vector was checked to sum to 1, but
    no such check existed (and soft-clustering vectors need not sum to 1,
    e.g. for noise-dominated points); the comments now match the assertions.
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    memberships = hdbscan.all_points_membership_vectors(clusterer)
    # One membership vector per training sample.
    assert len(memberships) == X_train.shape[0], (
        "There should be a membership vector for each sample"
    )
    # Each entry is a probability, so it must lie in [0, 1].
    for membership in memberships:
        assert all(0.0 <= v <= 1.0 for v in membership), (
            "Probabilities should be between 0 and 1"
        )
| 369 | + |
| 370 | + |
def test_hdbscan_validity_index(synthetic_data):
    """The density-based validity index of a fitted clustering is finite."""
    X, _ = synthetic_data
    labels = hdbscan.HDBSCAN().fit(X).labels_
    score = validity.validity_index(X, labels, metric="euclidean")
    # A NaN/inf score would indicate a degenerate clustering or a bug.
    assert np.isfinite(score), "Validity index should be a finite number"
| 378 | + |
| 379 | + |
def test_hdbscan_condensed_tree(synthetic_data):
    """condensed_tree_ converts to a pandas DataFrame with the documented columns.

    Fix: the original comment promised "check columns" but the DataFrame was
    discarded without any assertion; the documented schema is now verified.
    """
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN()
    clusterer.fit(X)
    condensed_tree = clusterer.condensed_tree_
    assert hasattr(condensed_tree, "to_pandas"), (
        "Condensed tree should have a 'to_pandas' method"
    )
    # Convert to pandas DataFrame and check the documented column schema.
    tree_df = condensed_tree.to_pandas()
    assert {"parent", "child", "lambda_val", "child_size"}.issubset(
        tree_df.columns
    ), "Condensed tree DataFrame should have the documented columns"
    assert not tree_df.empty, "Condensed tree DataFrame should not be empty"
| 391 | + |
| 392 | + |
def test_hdbscan_single_linkage_tree_attribute(synthetic_data):
    """single_linkage_tree_ exposes to_numpy() and yields a 4-column array."""
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN().fit(X)
    tree = clusterer.single_linkage_tree_
    assert hasattr(tree, "to_numpy"), (
        "Single linkage tree should have a 'to_numpy' method"
    )
    # scipy-style linkage matrices carry four columns per merge step.
    tree_array = tree.to_numpy()
    assert tree_array.shape[1] == 4, (
        "Single linkage tree array should have 4 columns"
    )
| 407 | + |
| 408 | + |
def test_hdbscan_flat_clustering(synthetic_data):
    """Flat extraction via cluster_selection_epsilon still finds clusters.

    Fix: the original also fitted a default HDBSCAN whose result was never
    used, doubling the test's runtime for nothing; that dead fit is removed.
    """
    X, _ = synthetic_data
    # Extract clusters at a specific cluster_selection_epsilon.
    clusterer_flat = hdbscan.HDBSCAN(cluster_selection_epsilon=0.1)
    clusterer_flat.fit(X)
    # Count clusters, excluding the noise label (-1).
    n_clusters_flat = len(set(clusterer_flat.labels_)) - (
        1 if -1 in clusterer_flat.labels_ else 0
    )
    assert n_clusters_flat > 0, "Should find clusters with flat clustering"
| 421 | + |
| 422 | + |
def test_hdbscan_prediction_membership_vector(synthetic_data):
    """prediction.membership_vector produces one valid probability row.

    Fix: the original test called the API but asserted nothing, so it could
    only catch hard crashes. It now checks the result has one row for the
    single query point and that all entries are probabilities in [0, 1].
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    point = X_train[0].reshape((1, 2))
    membership = prediction.membership_vector(clusterer, point)
    # One membership row per query point; we passed exactly one.
    assert membership.shape[0] == 1, (
        "membership_vector should return one row per query point"
    )
    assert np.all((membership >= 0) & (membership <= 1)), (
        "Membership values should be between 0 and 1"
    )
| 429 | + |
| 430 | + |
def test_hdbscan_prediction_all_points_membership_vectors(synthetic_data):
    """prediction.all_points_membership_vectors gives a probability row per sample."""
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True).fit(X_train)
    memberships = prediction.all_points_membership_vectors(clusterer)
    # One membership vector per training sample.
    assert len(memberships) == X_train.shape[0], (
        "There should be a membership vector for each sample"
    )
    # Every entry is a probability, so it must lie in [0, 1].
    for row in memberships:
        assert all(0.0 <= value <= 1.0 for value in row), (
            "Probabilities should be between 0 and 1"
        )
| 445 | + |
| 446 | + |
def test_hdbscan_outlier_exposure(synthetic_data):
    """Outlier scores, when exposed by this hdbscan version, are all finite.

    Note: hdbscan may not have a function named 'outlier_exposure'; this
    exercises the outlier_scores_ attribute when it is available.
    """
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN().fit(X)
    # Guard clause: older/other builds may not compute outlier scores.
    if not hasattr(clusterer, "outlier_scores_"):
        pytest.skip(
            "Outlier exposure functionality is not available in this version of HDBSCAN"
        )
    # Every sample gets a score; none may be NaN or infinite.
    assert np.all(np.isfinite(clusterer.outlier_scores_)), (
        "Outlier scores should be finite numbers"
    )
| 464 | + |
| 465 | + |
| 466 | +# test requires networkx |
| 467 | +# def test_hdbscan_extract_single_linkage_tree(synthetic_data): |
| 468 | +# X, _ = synthetic_data |
| 469 | +# clusterer = hdbscan.HDBSCAN() |
| 470 | +# clusterer.fit(X) |
| 471 | +# # Extract the single linkage tree |
| 472 | +# sl_tree = clusterer.single_linkage_tree_.to_networkx() |
| 473 | +# # Check that the tree has the correct number of nodes |
| 474 | +# assert sl_tree.number_of_nodes() == X.shape[0], "Single linkage tree should have a node for each data point" |
| 475 | + |
| 476 | + |
def test_hdbscan_get_exemplars(synthetic_data):
    """When exemplars_ is available, there is one exemplar set per cluster."""
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN().fit(X)
    # Guard clause: not every hdbscan build/configuration exposes exemplars.
    if not hasattr(clusterer, "exemplars_"):
        pytest.skip(
            "Exemplar functionality is not available in this version of HDBSCAN"
        )
    labels = clusterer.labels_
    # Number of real clusters, not counting the noise label (-1).
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    assert len(clusterer.exemplars_) == n_clusters, (
        "There should be exemplars for each cluster"
    )
0 commit comments