|
6 | 6 | import hdbscan |
7 | 7 | import numpy as np |
8 | 8 | import pytest |
| 9 | +import sklearn |
| 10 | +from hdbscan import prediction, validity |
| 11 | +from packaging.version import Version |
9 | 12 | from sklearn.datasets import make_blobs, make_moons |
10 | 13 | from sklearn.preprocessing import StandardScaler |
11 | 14 |
|
# Module-level guard: hdbscan does not work with scikit-learn >= 1.8
# (per the skip reason below), so skip every test in this module up
# front instead of failing later at import/fit time.
if Version(sklearn.__version__) >= Version("1.8.0.dev0"):
    pytest.skip(
        "hdbscan requires sklearn < 1.8.0.dev0", allow_module_level=True
    )
| 19 | + |
12 | 20 |
|
13 | 21 | @pytest.fixture(scope="module") |
14 | 22 | def synthetic_data(): |
@@ -307,3 +315,179 @@ def test_hdbscan_allow_single_cluster(synthetic_data): |
307 | 315 | assert n_clusters >= 1, ( |
308 | 316 | "Should allow a single cluster when allow_single_cluster=True" |
309 | 317 | ) |
| 318 | + |
| 319 | + |
def test_hdbscan_approximate_predict(synthetic_data):
    """New points each get a cluster label and a membership strength in [0, 1]."""
    X_train, _ = synthetic_data
    # Fresh blobs with a different seed from the training fixture act as
    # unseen query data.
    X_test, _ = make_blobs(
        n_samples=100,
        n_features=2,
        centers=5,
        cluster_std=0.5,
        random_state=24,
    )
    X_test = StandardScaler().fit_transform(X_test)
    # prediction_data=True is required for approximate_predict to work.
    clusterer = hdbscan.HDBSCAN(prediction_data=True).fit(X_train)
    test_labels, strengths = hdbscan.approximate_predict(clusterer, X_test)
    n_test = X_test.shape[0]
    # Exactly one label and one strength per query point.
    assert len(test_labels) == n_test, (
        "Labels should be assigned to test data points"
    )
    assert len(strengths) == n_test, (
        "Strengths should be computed for test data points"
    )
    # Strengths are probability-like scores, so they must lie in [0, 1].
    assert np.all((strengths >= 0) & (strengths <= 1)), (
        "Strengths should be between 0 and 1"
    )
| 344 | + |
| 345 | + |
def test_hdbscan_membership_vector(synthetic_data):
    """membership_vector produces one valid probability row for a single point.

    Fix: the original test called the API but asserted nothing, so it could
    only catch hard crashes. It now checks the result has one row for the
    single query point and that all entries are probabilities in [0, 1].
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    point = X_train[0].reshape((1, 2))
    membership = hdbscan.membership_vector(clusterer, point)
    # One membership row per query point; we passed exactly one.
    assert membership.shape[0] == 1, (
        "membership_vector should return one row per query point"
    )
    assert np.all((membership >= 0) & (membership <= 1)), (
        "Membership values should be between 0 and 1"
    )
| 352 | + |
| 353 | + |
def test_hdbscan_all_points_membership_vectors(synthetic_data):
    """all_points_membership_vectors yields one probability vector per sample.

    Fix: a previous comment claimed each vector was checked to sum to 1, but
    no such check existed (and soft-clustering vectors need not sum to 1,
    e.g. for noise-dominated points); the comments now match the assertions.
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    memberships = hdbscan.all_points_membership_vectors(clusterer)
    # One membership vector per training sample.
    assert len(memberships) == X_train.shape[0], (
        "There should be a membership vector for each sample"
    )
    # Each entry is a probability, so it must lie in [0, 1].
    for membership in memberships:
        assert all(0.0 <= v <= 1.0 for v in membership), (
            "Probabilities should be between 0 and 1"
        )
| 369 | + |
| 370 | + |
def test_hdbscan_validity_index(synthetic_data):
    """The density-based validity index of a fitted clustering is finite."""
    X, _ = synthetic_data
    labels = hdbscan.HDBSCAN().fit(X).labels_
    score = validity.validity_index(X, labels, metric="euclidean")
    # A NaN/inf score would indicate a degenerate clustering or a bug.
    assert np.isfinite(score), "Validity index should be a finite number"
| 378 | + |
| 379 | + |
def test_hdbscan_condensed_tree(synthetic_data):
    """condensed_tree_ converts to a pandas DataFrame with the documented columns.

    Fix: the original comment promised "check columns" but the DataFrame was
    discarded without any assertion; the documented schema is now verified.
    """
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN()
    clusterer.fit(X)
    condensed_tree = clusterer.condensed_tree_
    assert hasattr(condensed_tree, "to_pandas"), (
        "Condensed tree should have a 'to_pandas' method"
    )
    # Convert to pandas DataFrame and check the documented column schema.
    tree_df = condensed_tree.to_pandas()
    assert {"parent", "child", "lambda_val", "child_size"}.issubset(
        tree_df.columns
    ), "Condensed tree DataFrame should have the documented columns"
    assert not tree_df.empty, "Condensed tree DataFrame should not be empty"
| 391 | + |
| 392 | + |
def test_hdbscan_single_linkage_tree_attribute(synthetic_data):
    """single_linkage_tree_ exposes to_numpy() and yields a 4-column array."""
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN().fit(X)
    tree = clusterer.single_linkage_tree_
    assert hasattr(tree, "to_numpy"), (
        "Single linkage tree should have a 'to_numpy' method"
    )
    # scipy-style linkage matrices carry four columns per merge step.
    tree_array = tree.to_numpy()
    assert tree_array.shape[1] == 4, (
        "Single linkage tree array should have 4 columns"
    )
| 407 | + |
| 408 | + |
def test_hdbscan_flat_clustering(synthetic_data):
    """Flat extraction via cluster_selection_epsilon still finds clusters.

    Fix: the original also fitted a default HDBSCAN whose result was never
    used, doubling the test's runtime for nothing; that dead fit is removed.
    """
    X, _ = synthetic_data
    # Extract clusters at a specific cluster_selection_epsilon.
    clusterer_flat = hdbscan.HDBSCAN(cluster_selection_epsilon=0.1)
    clusterer_flat.fit(X)
    # Count clusters, excluding the noise label (-1).
    n_clusters_flat = len(set(clusterer_flat.labels_)) - (
        1 if -1 in clusterer_flat.labels_ else 0
    )
    assert n_clusters_flat > 0, "Should find clusters with flat clustering"
| 421 | + |
| 422 | + |
def test_hdbscan_prediction_membership_vector(synthetic_data):
    """prediction.membership_vector produces one valid probability row.

    Fix: the original test called the API but asserted nothing, so it could
    only catch hard crashes. It now checks the result has one row for the
    single query point and that all entries are probabilities in [0, 1].
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    point = X_train[0].reshape((1, 2))
    membership = prediction.membership_vector(clusterer, point)
    # One membership row per query point; we passed exactly one.
    assert membership.shape[0] == 1, (
        "membership_vector should return one row per query point"
    )
    assert np.all((membership >= 0) & (membership <= 1)), (
        "Membership values should be between 0 and 1"
    )
| 429 | + |
| 430 | + |
def test_hdbscan_prediction_all_points_membership_vectors(synthetic_data):
    """prediction.all_points_membership_vectors gives a probability row per sample."""
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True).fit(X_train)
    memberships = prediction.all_points_membership_vectors(clusterer)
    # One membership vector per training sample.
    assert len(memberships) == X_train.shape[0], (
        "There should be a membership vector for each sample"
    )
    # Every entry is a probability, so it must lie in [0, 1].
    for row in memberships:
        assert all(0.0 <= value <= 1.0 for value in row), (
            "Probabilities should be between 0 and 1"
        )
| 445 | + |
| 446 | + |
def test_hdbscan_outlier_exposure(synthetic_data):
    """Outlier scores, when exposed by this hdbscan version, are all finite.

    Note: hdbscan may not have a function named 'outlier_exposure'; this
    exercises the outlier_scores_ attribute when it is available.
    """
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN().fit(X)
    # Guard clause: older/other builds may not compute outlier scores.
    if not hasattr(clusterer, "outlier_scores_"):
        pytest.skip(
            "Outlier exposure functionality is not available in this version of HDBSCAN"
        )
    # Every sample gets a score; none may be NaN or infinite.
    assert np.all(np.isfinite(clusterer.outlier_scores_)), (
        "Outlier scores should be finite numbers"
    )
| 464 | + |
| 465 | + |
| 466 | +# test requires networkx |
| 467 | +# def test_hdbscan_extract_single_linkage_tree(synthetic_data): |
| 468 | +# X, _ = synthetic_data |
| 469 | +# clusterer = hdbscan.HDBSCAN() |
| 470 | +# clusterer.fit(X) |
| 471 | +# # Extract the single linkage tree |
| 472 | +# sl_tree = clusterer.single_linkage_tree_.to_networkx() |
| 473 | +# # Check that the tree has the correct number of nodes |
| 474 | +# assert sl_tree.number_of_nodes() == X.shape[0], "Single linkage tree should have a node for each data point" |
| 475 | + |
| 476 | + |
def test_hdbscan_get_exemplars(synthetic_data):
    """When exemplars_ is available, there is one exemplar set per cluster."""
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN().fit(X)
    # Guard clause: not every hdbscan build/configuration exposes exemplars.
    if not hasattr(clusterer, "exemplars_"):
        pytest.skip(
            "Exemplar functionality is not available in this version of HDBSCAN"
        )
    labels = clusterer.labels_
    # Number of real clusters, not counting the noise label (-1).
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    assert len(clusterer.exemplars_) == n_clusters, (
        "There should be exemplars for each cluster"
    )
0 commit comments