From eedab0843405af8689e482d5d849e86d9923bfd1 Mon Sep 17 00:00:00 2001 From: Jake Beierle Date: Tue, 31 Mar 2026 17:34:55 -0400 Subject: [PATCH] Fix latency feature extraction --- support_code/behavior_summaries.py | 16 ++- tests/support_code/__init__.py | 0 tests/support_code/test_behavior_summaries.py | 132 ++++++++++++++++++ 3 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 tests/support_code/__init__.py create mode 100644 tests/support_code/test_behavior_summaries.py diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index a24a26f..4b452b9 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -131,6 +131,18 @@ def aggregate_data_by_bin_size( grouped = data.groupby("MouseID") filtered_data = pd.concat([group.iloc[:bin_size] for _, group in grouped]) + # Extract latency values before summing. agg with a positional lambda + # preserves NaN (unlike first()/last(), which skip NaN) and returns a Series + # indexed by MouseID, so it aligns with aggregated on assignment. + latency_first_col = f"{behavior}_latency_to_first_prediction" + latency_last_col = f"{behavior}_latency_to_last_prediction" + latency_first = filtered_data.groupby("MouseID")[latency_first_col].agg( + lambda s: s.iloc[0] + ) + latency_last = filtered_data.groupby("MouseID")[latency_last_col].agg( + lambda s: s.iloc[-1] + ) + # Aggregate numeric columns by summing them numeric_cols = filtered_data.select_dtypes(include=["number"]).columns aggregated = filtered_data.groupby("MouseID")[numeric_cols].sum() @@ -181,10 +193,10 @@ def aggregate_data_by_bin_size( # TODO: var and std need to be aggregated across bins. # This is non-trivial because of the partial bouts and their associated weights. 
aggregated[f"bin_first_{bin_size * 5}.{behavior}_latency_first_prediction"] = ( - aggregated[f"{behavior}_latency_to_first_prediction"].head(1) + latency_first ) aggregated[f"bin_last_{bin_size * 5}.{behavior}_latency_last_prediction"] = ( - aggregated[f"{behavior}_latency_to_last_prediction"].tail(1) + latency_last ) # Reset index to make MouseID a regular column diff --git a/tests/support_code/__init__.py b/tests/support_code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py new file mode 100644 index 0000000..ef5f594 --- /dev/null +++ b/tests/support_code/test_behavior_summaries.py @@ -0,0 +1,132 @@ +"""Unit tests for support_code/behavior_summaries.py.""" + +import math +import sys +from pathlib import Path + +import pandas as pd +import pytest + +# behavior_summaries.py lives in support_code/, which is not a package. +# Add it to sys.path so we can import it directly. +sys.path.insert(0, str(Path(__file__).parents[2] / "support_code")) + +import behavior_summaries # noqa: E402 + + +BEHAVIOR = "Jumping" + + +def _make_filtered_data( + latency_first_values: list, + latency_last_values: list, + mouse_id: str = "mouse_A", +) -> pd.DataFrame: + """Build a minimal per-bin DataFrame matching the shape expected by aggregate_data_by_bin_size.""" + n = len(latency_first_values) + return pd.DataFrame( + { + "MouseID": [mouse_id] * n, + f"{BEHAVIOR}_latency_to_first_prediction": latency_first_values, + f"{BEHAVIOR}_latency_to_last_prediction": latency_last_values, + f"{BEHAVIOR}_time_behavior": [100.0] * n, + f"{BEHAVIOR}_time_not_behavior": [200.0] * n, + f"{BEHAVIOR}_behavior_dist": [50.0] * n, + f"{BEHAVIOR}_behavior_dist_threshold": [10.0] * n, + f"{BEHAVIOR}_behavior_dist_seg": [5.0] * n, + f"{BEHAVIOR}_bout_behavior": [2] * n, + f"{BEHAVIOR}_avg_bout_duration": [1.5] * n, + f"{BEHAVIOR}__stats_sample_count": [2] * n, + f"{BEHAVIOR}_bout_duration_std": 
[0.1] * n, + f"{BEHAVIOR}_bout_duration_var": [0.01] * n, + } + ) + + +class TestLatencyFirstPrediction: + def test_returns_first_bin_value_when_present(self): + """latency_first should be the first bin's value, not a cumulative sum.""" + data = _make_filtered_data( + latency_first_values=[2506.0, 9412.0, 18082.0, float("nan")], + latency_last_values=[4900.0, 11000.0, 19000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(2506.0) + + def test_returns_nan_when_first_bin_has_no_behavior(self): + """latency_first should be NaN when the first bin has no behavior, not a later bin's value.""" + data = _make_filtered_data( + latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], + latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" + assert math.isnan(result[col].iloc[0]) + + def test_single_bin_returns_that_bins_value(self): + data = _make_filtered_data( + latency_first_values=[2506.0], + latency_last_values=[4900.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(2506.0) + + +class TestLatencyLastPrediction: + def test_returns_last_bin_value_when_present(self): + """latency_last should be the last bin's value, not a cumulative sum.""" + data = _make_filtered_data( + latency_first_values=[2506.0, 9412.0, 18082.0, 38222.0], + latency_last_values=[4900.0, 11000.0, 19000.0, 45000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == 
pytest.approx(45000.0) + + def test_returns_nan_when_last_bin_has_no_behavior(self): + """latency_last should be NaN when the last bin has no behavior, not a previous bin's value.""" + data = _make_filtered_data( + latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], + latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" + assert math.isnan(result[col].iloc[0]) + + def test_single_bin_returns_that_bins_value(self): + data = _make_filtered_data( + latency_first_values=[2506.0], + latency_last_values=[4900.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + col = f"bin_last_5.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(4900.0) + + +class TestMultiMouseAlignment: + def test_each_mouse_gets_its_own_first_latency(self): + """With multiple mice, each should receive their own first-bin latency value.""" + mouse_a = _make_filtered_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 11000.0], + mouse_id="mouse_A", + ) + mouse_b = _make_filtered_data( + latency_first_values=[float("nan"), 5000.0], + latency_last_values=[float("nan"), 8000.0], + mouse_id="mouse_B", + ) + data = pd.concat([mouse_a, mouse_b], ignore_index=True) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = result.set_index("MouseID") + + first_col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" + last_col = f"bin_last_10.{BEHAVIOR}_latency_last_prediction" + + assert result.loc["mouse_A", first_col] == pytest.approx(2506.0) + assert math.isnan(result.loc["mouse_B", first_col]) + + assert result.loc["mouse_A", last_col] == pytest.approx(11000.0) + assert result.loc["mouse_B", last_col] == pytest.approx(8000.0)