From c87135dab0da86209129025cfed2f07e397b27ce Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Mon, 27 Apr 2026 12:02:31 +0100 Subject: [PATCH 1/2] Calibrate combined self-employed NICs target and skip inert Class 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two NI calibration gaps surfaced during issue audit (#88) and bug report #378: 1. Recent OBR EFOs (e.g. March 2026) publish a single combined "Class 4 and Class 2 Self employed NICs" line instead of two separate rows. The parser's Class 2 / Class 4 candidate labels no longer matched, so neither target was registered and self-employed NICs were silently uncalibrated against OBR. 2. ni_class_3 is an input variable in PolicyEngine UK with no formula and no dataset path that populates it. The matrix column is therefore a flat zero, calibration cannot move it, and the diagnostic that "the target is included" is misleading. This commit: - Adds an obr/ni_self_employed target whose values come from the combined EFO line and whose matrix column is computed via a new custom_compute that sums ni_class_2 + ni_class_4 at the household level. Smoke-build on enhanced_frs_2023_24.h5 with year=2025: 6,848 non-zero households, target £2.90bn. - Keeps the legacy Class 2 / Class 4 candidate labels around so older or future EFOs that revert to separate rows still produce individual targets. - Removes the ni_class_3 entry from _parse_nics with a comment pointing at #378 and the conditions for restoring it (a Class 3 imputation that addresses #88 in full). Tests cover both layers: - test_obr_nics.py: parser handles the combined EFO layout, the legacy separate layout, and intentionally drops Class 3 in either format. - test_obr_nic_signal.py: the registered targets are present in the registry, the combined target carries a custom_compute callable, ni_class_3 is absent, and (gated on enhanced_frs) each underlying PE-UK NI variable produces non-zero variation while ni_class_3 returns a uniform zero — the very property that makes it inert as a calibration target. Closes #378. Partial close of #88 — Class 3 imputation remains a separate follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/378.md | 1 + policyengine_uk_data/targets/sources/obr.py | 73 +++++-- .../tests/test_obr_nic_signal.py | 138 ++++++++++++ policyengine_uk_data/tests/test_obr_nics.py | 203 ++++++++++++------ 4 files changed, 324 insertions(+), 91 deletions(-) create mode 100644 changelog.d/378.md create mode 100644 policyengine_uk_data/tests/test_obr_nic_signal.py diff --git a/changelog.d/378.md b/changelog.d/378.md new file mode 100644 index 00000000..9f235903 --- /dev/null +++ b/changelog.d/378.md @@ -0,0 +1 @@ +- Calibrate against self-employed NICs (combined OBR Class 2 + Class 4 line) and skip the inert Class 3 target so calibration diagnostics are no longer misleading (#378, partial close of #88). diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py index ff577f48..693fb988 100644 --- a/policyengine_uk_data/targets/sources/obr.py +++ b/policyengine_uk_data/targets/sources/obr.py @@ -249,6 +249,21 @@ def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: ["Class 1 Employer NICs"], "ni_employer", ), + # Self-employed NICs. + # + # Recent OBR EFOs (e.g. March 2026) publish a single combined + # "Class 4 and Class 2 Self employed NICs" line rather than two + # separate rows. Earlier EFOs split them. We list the combined + # label first; if a future EFO reverts to separate rows, the + # legacy ``ni_class_2`` / ``ni_class_4`` entries below will pick + # them up instead. + "ni_self_employed": ( + [ + "Class 4 and Class 2 Self employed NICs", + "Class 2 and Class 4 Self employed NICs", + ], + "ni_self_employed", + ), "ni_class_2": ( [ "Class 2 NICs", @@ -257,14 +272,14 @@ def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: ], "ni_class_2", ), - "ni_class_3": ( - [ - "Class 3 NICs", - "Class 3 Voluntary NICs", - "Class 3 voluntary NICs", - ], - "ni_class_3", - ), + # Class 3 NICs are voluntary contributions to fill state-pension + # record gaps. The PE-UK variable ni_class_3 is an input with no + # formula and no dataset path populates it, so a calibration target + # would fall through to a flat-zero matrix column with no signal — + # see issue #378. Class 3 is also small (~£50m vs ~£150bn total + # NICs, ~0.03%), so the calibrator cannot meaningfully match it + # without a record-level imputation. Skipped here pending an + # imputation that addresses #88; restore the row when one lands. "ni_class_4": ( [ "Class 4 NICs", @@ -296,22 +311,40 @@ def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: continue values = _read_row_values(ws, row_num, cols) - if values: - targets.append( - Target( - name=f"obr/{name}", - variable=variable, - source="obr", - unit=Unit.GBP, - values=values, - reference_url=ref, - forecast_vintage=vintage, - ) - ) + if not values: + continue + + kwargs: dict = { + "name": f"obr/{name}", + "variable": variable, + "source": "obr", + "unit": Unit.GBP, + "values": values, + "reference_url": ref, + "forecast_vintage": vintage, + } + # The combined self-employed total has no PE-UK variable of its + # own; sum the two underlying classes via custom_compute so the + # target hits a meaningful matrix column. + if name == "ni_self_employed": + kwargs["custom_compute"] = _compute_ni_self_employed_combined + + targets.append(Target(**kwargs)) return targets +def _compute_ni_self_employed_combined(ctx, target, year): # noqa: ARG001 + """Sum Class 2 and Class 4 NICs at the household level. + + Used as the ``custom_compute`` for the combined OBR target whenever + the EFO publishes Class 2 + Class 4 as a single self-employed line. + """ + class_2 = ctx.sim.calculate("ni_class_2") + class_4 = ctx.sim.calculate("ni_class_4") + return ctx.household_from_person(class_2 + class_4) + + def _parse_welfare(wb: openpyxl.Workbook) -> list[Target]: """Parse Table 4.9 (welfare spending) from expenditure xlsx.""" config = load_config() diff --git a/policyengine_uk_data/tests/test_obr_nic_signal.py b/policyengine_uk_data/tests/test_obr_nic_signal.py new file mode 100644 index 00000000..60711234 --- /dev/null +++ b/policyengine_uk_data/tests/test_obr_nic_signal.py @@ -0,0 +1,138 @@ +"""Regression tests for OBR NIC calibration signal. + +Issue #378 (closing) and partial close of #88: every active OBR NIC +target must produce a non-trivial calibration matrix column. + +Active targets (against the March 2026 EFO format): + +- ``obr/ni_employee`` — Class 1 employee, formula-derived in PE-UK. +- ``obr/ni_employer`` — Class 1 employer, formula-derived in PE-UK. +- ``obr/ni_self_employed`` — combined Class 2 + Class 4, computed via + ``custom_compute`` since recent EFOs publish a single self-employed + line. Sums the two PE-UK variables at the household level. + +Class 3 is intentionally absent because no dataset populates +``ni_class_3`` — the matrix column would be a flat zero. + +Two layers: + +1. Registry layer (no Microsimulation) — assert the three expected NIC + targets are present in the OBR target set and Class 3 is absent. +2. Signal layer (gated on the enhanced FRS fixture) — assert each + active NIC variable produces non-zero variation across households, + and that Class 3 would not. Catches future regressions where someone + re-adds the target without an accompanying imputation. +""" + +from __future__ import annotations + +import numpy as np +import pytest + + +_ACTIVE_TOPLINE_TARGET_NAMES = ( + "obr/ni_employee", + "obr/ni_employer", + "obr/ni_self_employed", +) +_PE_UK_NIC_VARIABLES_WITH_SIGNAL = ( + "ni_employee", + "ni_employer", + "ni_class_2", + "ni_class_4", +) + + +# ── Layer 1: registry contract ────────────────────────────────────── + + +def test_obr_nic_target_registry_includes_active_classes(): + """The OBR target source must emit the three top-line NIC class targets.""" + from policyengine_uk_data.targets import get_all_targets + + expected = set(_ACTIVE_TOPLINE_TARGET_NAMES) + actual = {t.name for t in get_all_targets() if t.name in expected} + assert actual == expected, f"Missing OBR NIC targets: {expected - actual}" + + +def test_obr_ni_self_employed_target_uses_custom_compute(): + """The combined self-employed target is virtual — it must carry a + custom_compute callable, otherwise the loss matrix tries to look up a + non-existent ``ni_self_employed`` PE-UK variable and emits an empty + column.""" + from policyengine_uk_data.targets import get_all_targets + + target = next( + (t for t in get_all_targets() if t.name == "obr/ni_self_employed"), + None, + ) + assert target is not None, "obr/ni_self_employed not registered" + assert callable(target.custom_compute), ( + "obr/ni_self_employed must carry a custom_compute (sum of class 2 + 4)" + ) + + +def test_obr_ni_class_3_target_is_intentionally_absent(): + """Class 3 must not appear in the registered targets (#378).""" + from policyengine_uk_data.targets import get_all_targets + + obr_targets = [t for t in get_all_targets() if t.source == "obr"] + assert "obr/ni_class_3" not in {t.name for t in obr_targets} + assert "ni_class_3" not in {t.variable for t in obr_targets} + + +# ── Layer 2: simulator signal ─────────────────────────────────────── + + +@pytest.mark.parametrize("variable", _PE_UK_NIC_VARIABLES_WITH_SIGNAL) +def test_active_nic_variable_has_nonzero_variation(enhanced_frs, variable): + """Each active NIC variable must produce variation across households, + otherwise the calibration matrix column is a flat constant and the + optimiser cannot match its target.""" + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=enhanced_frs) + sim.default_calculation_period = enhanced_frs.time_period + values = sim.calculate(variable).values + + nonzero = int((values != 0).sum()) + assert nonzero > 0, f"{variable}: all values are zero — calibration would be inert" + assert float(values.var()) > 0.0, ( + f"{variable}: zero variance — calibration cannot move it" + ) + + +def test_self_employed_combined_compute_returns_nonzero(enhanced_frs): + """``_compute_ni_self_employed_combined`` must return a non-zero + household-level vector — i.e. the sum of Class 2 and Class 4 actually + has signal in the dataset, not just each component individually.""" + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=enhanced_frs) + sim.default_calculation_period = enhanced_frs.time_period + + combined_person = ( + sim.calculate("ni_class_2").values + sim.calculate("ni_class_4").values + ) + combined_household = sim.map_result(combined_person, "person", "household") + + assert (combined_household != 0).sum() > 0 + assert float(np.asarray(combined_household).var()) > 0.0 + + +def test_ni_class_3_simulator_returns_uniform_zero(enhanced_frs): + """Direct evidence for why Class 3 is excluded: the simulator produces + a flat-zero vector, so any calibration target on it is inert. If this + ever stops being true (e.g. policyengine-uk adds a formula or this + repo adds an imputation), the Class 3 target should be re-enabled in + obr.py and the corresponding skip removed.""" + from policyengine_uk import Microsimulation + + sim = Microsimulation(dataset=enhanced_frs) + sim.default_calculation_period = enhanced_frs.time_period + values = sim.calculate("ni_class_3").values + + assert (values == 0).all(), ( + f"ni_class_3 has {(values != 0).sum()} non-zero entries — " + "if intended, restore the target in obr.py and update this test." + ) diff --git a/policyengine_uk_data/tests/test_obr_nics.py b/policyengine_uk_data/tests/test_obr_nics.py index aae94c07..c1c3ec7b 100644 --- a/policyengine_uk_data/tests/test_obr_nics.py +++ b/policyengine_uk_data/tests/test_obr_nics.py @@ -1,13 +1,24 @@ -"""Regression test for OBR NIC target parsing. - -Bug-hunt finding U8: `_parse_nics` only covered Class 1 employee and -employer NICs, omitting Classes 2 (self-employed flat-rate), 3 (voluntary) -and 4 (self-employed profit-based). Calibration had no target for those -receipts and pushed self-employment income downward to compensate. - -This test drives the parser with a minimal in-memory openpyxl workbook -that mimics the OBR EFO Table 3.4 layout, and asserts that targets for -all five NIC variables are produced. +"""Regression tests for OBR NIC target parsing. + +Bug-hunt finding U8: ``_parse_nics`` originally only covered Class 1 +employee and employer NICs, omitting Classes 2 (self-employed flat-rate), +3 (voluntary) and 4 (self-employed profit-based). Calibration had no +target for those receipts and pushed self-employment income downward to +compensate. + +Follow-up (issue #378, partial close of #88): + +- **Class 3** is voluntary contributions paid by people topping up + their state-pension record. PolicyEngine UK exposes ``ni_class_3`` + as an input variable, but no dataset builder, imputation, or utility + populates it — so a Class 3 target would be a flat-zero matrix + column with no signal. The parser therefore drops it. +- **Classes 2 + 4** are bundled in recent OBR EFOs (e.g. March 2026) + as a single "Class 4 and Class 2 Self employed NICs" line. The + parser emits a combined ``obr/ni_self_employed`` target that uses + ``custom_compute`` to sum the two PE-UK variables. If a future EFO + reverts to separate rows, the legacy ``ni_class_2`` / ``ni_class_4`` + candidates still match and emit individually. """ from __future__ import annotations @@ -22,104 +33,154 @@ import openpyxl # noqa: E402 +_FAKE_CONFIG = { + "obr": { + "vintage": "test", + "efo_receipts": "https://example.invalid/receipts", + "efo_expenditure": "https://example.invalid/expenditure", + } +} + -def _build_fake_obr_workbook() -> openpyxl.Workbook: - """Create a stand-in for OBR EFO receipts with a minimal Table 3.4.""" +def _populate_sheet(ws, rows: list[tuple[str, list[float]]]) -> None: + """Write rows of (label, fy_values) into column B / C-I of ``ws``.""" + for row_idx, (label, values) in enumerate(rows, start=2): + ws.cell(row=row_idx, column=2, value=label) + for col_idx, value in enumerate(values, start=3): + ws.cell(row=row_idx, column=col_idx, value=value) + + +def _build_combined_efo_workbook() -> openpyxl.Workbook: + """Mimic the March 2026+ EFO Table 3.4 layout with combined SE NICs.""" wb = openpyxl.Workbook() ws = wb.active ws.title = "3.4" + _populate_sheet( + ws, + [ + ("Class 1 Employee NICs", [120.0] * 7), + ("Class 1 Employer NICs", [140.0] * 7), + ("Class 4 and Class 2 Self employed NICs", [4.8] * 7), + # Class 3 is bundled into "Other NIC" in current EFOs and is + # not directly extractable; included here only to confirm the + # parser does not emit a Class 3 target. + ("Other NIC", [0.5] * 7), + ], + ) + return wb - # Column B holds row labels; columns C-I hold FY values in £bn. - rows = [ - ("Class 1 Employee NICs", [120.0] * 7), - ("Class 1 Employer NICs", [140.0] * 7), - ("Class 2 NICs", [0.3] * 7), - ("Class 3 NICs", [0.05] * 7), - ("Class 4 NICs", [4.5] * 7), - ] - for row_idx, (label, values) in enumerate(rows, start=2): - ws.cell(row=row_idx, column=2, value=label) # col B - for col_idx, value in enumerate(values, start=3): # cols C-I - ws.cell(row=row_idx, column=col_idx, value=value) +def _build_separate_efo_workbook() -> openpyxl.Workbook: + """Mimic an older EFO layout with separate Class 2 / Class 4 rows.""" + wb = openpyxl.Workbook() + ws = wb.active + ws.title = "3.4" + _populate_sheet( + ws, + [ + ("Class 1 Employee NICs", [120.0] * 7), + ("Class 1 Employer NICs", [140.0] * 7), + ("Class 2 NICs", [0.3] * 7), + ("Class 3 NICs", [0.05] * 7), + ("Class 4 NICs", [4.5] * 7), + ], + ) return wb -def test_parse_nics_covers_all_five_classes(): +def test_parse_nics_combined_self_employed_line(): + """Current EFO format: combined Class 2+4 line gets a single target with + custom_compute, alongside Class 1 employee/employer.""" from policyengine_uk_data.targets.sources import obr as obr_module - wb = _build_fake_obr_workbook() + with patch.object(obr_module, "load_config", return_value=_FAKE_CONFIG): + targets = obr_module._parse_nics(_build_combined_efo_workbook()) - fake_config = { - "obr": { - "vintage": "test", - "efo_receipts": "https://example.invalid/receipts", - "efo_expenditure": "https://example.invalid/expenditure", - } + names = {t.name for t in targets} + assert names == { + "obr/ni_employee", + "obr/ni_employer", + "obr/ni_self_employed", } - with patch.object(obr_module, "load_config", return_value=fake_config): - targets = obr_module._parse_nics(wb) + + combined = next(t for t in targets if t.name == "obr/ni_self_employed") + assert combined.variable == "ni_self_employed" + assert combined.values[2024] == pytest.approx(4.8e9) + # The combined target is virtual — it must carry a custom_compute that + # sums the two underlying PE-UK variables. Without it the loss matrix + # falls through to the simple-GBP path and looks for a non-existent + # `ni_self_employed` PE-UK variable. + assert callable(combined.custom_compute) + + +def test_parse_nics_falls_back_to_separate_classes_for_old_efo(): + """If a future EFO publishes Class 2 and Class 4 on separate rows again, + the legacy candidates pick them up individually.""" + from policyengine_uk_data.targets.sources import obr as obr_module + + with patch.object(obr_module, "load_config", return_value=_FAKE_CONFIG): + targets = obr_module._parse_nics(_build_separate_efo_workbook()) names = {t.name for t in targets} assert names == { "obr/ni_employee", "obr/ni_employer", "obr/ni_class_2", - "obr/ni_class_3", "obr/ni_class_4", } - variables = {t.variable for t in targets} - # Variable names must match the policyengine-uk variable identifiers so - # calibration can map them to simulated totals. - assert variables == { - "ni_employee", - "ni_employer", - "ni_class_2", - "ni_class_3", - "ni_class_4", - } - - # Values are scaled by 1e9 (£bn → £) inside _read_row_values. class_4_target = next(t for t in targets if t.variable == "ni_class_4") assert class_4_target.values[2024] == pytest.approx(4.5e9) +def test_parse_nics_intentionally_skips_class_3_in_combined_efo(): + """In the current combined-line layout, Class 3 is hidden inside + "Other NIC" and the parser must not emit it (#378).""" + from policyengine_uk_data.targets.sources import obr as obr_module + + with patch.object(obr_module, "load_config", return_value=_FAKE_CONFIG): + targets = obr_module._parse_nics(_build_combined_efo_workbook()) + + assert "obr/ni_class_3" not in {t.name for t in targets} + assert "ni_class_3" not in {t.variable for t in targets} + + +def test_parse_nics_intentionally_skips_class_3_in_separate_efo(): + """Even when an EFO publishes a Class 3 row, the parser must not emit + a target for it — the underlying PE-UK variable has no signal (#378).""" + from policyengine_uk_data.targets.sources import obr as obr_module + + with patch.object(obr_module, "load_config", return_value=_FAKE_CONFIG): + targets = obr_module._parse_nics(_build_separate_efo_workbook()) + + assert "obr/ni_class_3" not in {t.name for t in targets} + assert "ni_class_3" not in {t.variable for t in targets} + + def test_parse_nics_tolerates_alt_label_wording(): - """The parser should accept common wording variants for self-employed rows.""" + """Common wording variants (`Self-Employed` vs `self-employed`) match.""" from policyengine_uk_data.targets.sources import obr as obr_module wb = openpyxl.Workbook() ws = wb.active ws.title = "3.4" - - # Use alternative labels that the parser should still find. - rows = [ - ("Class 1 Employee NICs", [100.0] * 7), - ("Class 1 Employer NICs", [110.0] * 7), - ("Class 2 Self-Employed NICs", [0.2] * 7), - ("Class 3 Voluntary NICs", [0.04] * 7), - ("Class 4 Self-Employed NICs", [4.0] * 7), - ] - for row_idx, (label, values) in enumerate(rows, start=2): - ws.cell(row=row_idx, column=2, value=label) - for col_idx, value in enumerate(values, start=3): - ws.cell(row=row_idx, column=col_idx, value=value) - - fake_config = { - "obr": { - "vintage": "test", - "efo_receipts": "https://example.invalid/receipts", - "efo_expenditure": "https://example.invalid/expenditure", - } - } - with patch.object(obr_module, "load_config", return_value=fake_config): + _populate_sheet( + ws, + [ + ("Class 1 Employee NICs", [100.0] * 7), + ("Class 1 Employer NICs", [110.0] * 7), + ("Class 2 Self-Employed NICs", [0.2] * 7), + ("Class 3 Voluntary NICs", [0.04] * 7), + ("Class 4 Self-Employed NICs", [4.0] * 7), + ], + ) + + with patch.object(obr_module, "load_config", return_value=_FAKE_CONFIG): targets = obr_module._parse_nics(wb) assert {t.variable for t in targets} == { "ni_employee", "ni_employer", "ni_class_2", - "ni_class_3", "ni_class_4", } From d2ad5b79b93ec1ef25dc8c22214603803d8ee807 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 1 May 2026 12:32:03 -0400 Subject: [PATCH 2/2] Use direct self-employed NICs target variable --- policyengine_uk_data/targets/sources/obr.py | 17 --------- .../tests/test_obr_nic_signal.py | 38 ++++--------------- policyengine_uk_data/tests/test_obr_nics.py | 14 +++---- 3 files changed, 13 insertions(+), 56 deletions(-) diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py index 693fb988..82b270be 100644 --- a/policyengine_uk_data/targets/sources/obr.py +++ b/policyengine_uk_data/targets/sources/obr.py @@ -323,28 +323,11 @@ def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: "reference_url": ref, "forecast_vintage": vintage, } - # The combined self-employed total has no PE-UK variable of its - # own; sum the two underlying classes via custom_compute so the - # target hits a meaningful matrix column. - if name == "ni_self_employed": - kwargs["custom_compute"] = _compute_ni_self_employed_combined - targets.append(Target(**kwargs)) return targets -def _compute_ni_self_employed_combined(ctx, target, year): # noqa: ARG001 - """Sum Class 2 and Class 4 NICs at the household level. - - Used as the ``custom_compute`` for the combined OBR target whenever - the EFO publishes Class 2 + Class 4 as a single self-employed line. - """ - class_2 = ctx.sim.calculate("ni_class_2") - class_4 = ctx.sim.calculate("ni_class_4") - return ctx.household_from_person(class_2 + class_4) - - def _parse_welfare(wb: openpyxl.Workbook) -> list[Target]: """Parse Table 4.9 (welfare spending) from expenditure xlsx.""" config = load_config() diff --git a/policyengine_uk_data/tests/test_obr_nic_signal.py b/policyengine_uk_data/tests/test_obr_nic_signal.py index 60711234..3ed38666 100644 --- a/policyengine_uk_data/tests/test_obr_nic_signal.py +++ b/policyengine_uk_data/tests/test_obr_nic_signal.py @@ -7,9 +7,8 @@ - ``obr/ni_employee`` — Class 1 employee, formula-derived in PE-UK. - ``obr/ni_employer`` — Class 1 employer, formula-derived in PE-UK. -- ``obr/ni_self_employed`` — combined Class 2 + Class 4, computed via - ``custom_compute`` since recent EFOs publish a single self-employed - line. Sums the two PE-UK variables at the household level. +- ``obr/ni_self_employed`` — combined Class 2 + Class 4, aligned to the + PE-UK ``ni_self_employed`` variable. Class 3 is intentionally absent because no dataset populates ``ni_class_3`` — the matrix column would be a flat zero. @@ -26,7 +25,6 @@ from __future__ import annotations -import numpy as np import pytest @@ -38,6 +36,7 @@ _PE_UK_NIC_VARIABLES_WITH_SIGNAL = ( "ni_employee", "ni_employer", + "ni_self_employed", "ni_class_2", "ni_class_4", ) @@ -55,11 +54,9 @@ def test_obr_nic_target_registry_includes_active_classes(): assert actual == expected, f"Missing OBR NIC targets: {expected - actual}" -def test_obr_ni_self_employed_target_uses_custom_compute(): - """The combined self-employed target is virtual — it must carry a - custom_compute callable, otherwise the loss matrix tries to look up a - non-existent ``ni_self_employed`` PE-UK variable and emits an empty - column.""" +def test_obr_ni_self_employed_target_uses_direct_pe_variable(): + """The combined self-employed target must map directly to the PE-UK + ``ni_self_employed`` variable so target lineage stays explicit.""" from policyengine_uk_data.targets import get_all_targets target = next( @@ -67,9 +64,8 @@ def test_obr_ni_self_employed_target_uses_custom_compute(): None, ) assert target is not None, "obr/ni_self_employed not registered" - assert callable(target.custom_compute), ( - "obr/ni_self_employed must carry a custom_compute (sum of class 2 + 4)" - ) + assert target.variable == "ni_self_employed" + assert target.custom_compute is None def test_obr_ni_class_3_target_is_intentionally_absent(): @@ -102,24 +98,6 @@ def test_active_nic_variable_has_nonzero_variation(enhanced_frs, variable): ) -def test_self_employed_combined_compute_returns_nonzero(enhanced_frs): - """``_compute_ni_self_employed_combined`` must return a non-zero - household-level vector — i.e. the sum of Class 2 and Class 4 actually - has signal in the dataset, not just each component individually.""" - from policyengine_uk import Microsimulation - - sim = Microsimulation(dataset=enhanced_frs) - sim.default_calculation_period = enhanced_frs.time_period - - combined_person = ( - sim.calculate("ni_class_2").values + sim.calculate("ni_class_4").values - ) - combined_household = sim.map_result(combined_person, "person", "household") - - assert (combined_household != 0).sum() > 0 - assert float(np.asarray(combined_household).var()) > 0.0 - - def test_ni_class_3_simulator_returns_uniform_zero(enhanced_frs): """Direct evidence for why Class 3 is excluded: the simulator produces a flat-zero vector, so any calibration target on it is inert. If this diff --git a/policyengine_uk_data/tests/test_obr_nics.py b/policyengine_uk_data/tests/test_obr_nics.py index c1c3ec7b..ae1961a9 100644 --- a/policyengine_uk_data/tests/test_obr_nics.py +++ b/policyengine_uk_data/tests/test_obr_nics.py @@ -15,8 +15,8 @@ column with no signal. The parser therefore drops it. - **Classes 2 + 4** are bundled in recent OBR EFOs (e.g. March 2026) as a single "Class 4 and Class 2 Self employed NICs" line. The - parser emits a combined ``obr/ni_self_employed`` target that uses - ``custom_compute`` to sum the two PE-UK variables. If a future EFO + parser emits a combined ``obr/ni_self_employed`` target that maps + directly to the PE-UK ``ni_self_employed`` variable. If a future EFO reverts to separate rows, the legacy ``ni_class_2`` / ``ni_class_4`` candidates still match and emit individually. """ @@ -89,8 +89,8 @@ def _build_separate_efo_workbook() -> openpyxl.Workbook: def test_parse_nics_combined_self_employed_line(): - """Current EFO format: combined Class 2+4 line gets a single target with - custom_compute, alongside Class 1 employee/employer.""" + """Current EFO format: combined Class 2+4 line gets a single direct + target alongside Class 1 employee/employer.""" from policyengine_uk_data.targets.sources import obr as obr_module with patch.object(obr_module, "load_config", return_value=_FAKE_CONFIG): @@ -106,11 +106,7 @@ def test_parse_nics_combined_self_employed_line(): combined = next(t for t in targets if t.name == "obr/ni_self_employed") assert combined.variable == "ni_self_employed" assert combined.values[2024] == pytest.approx(4.8e9) - # The combined target is virtual — it must carry a custom_compute that - # sums the two underlying PE-UK variables. Without it the loss matrix - # falls through to the simple-GBP path and looks for a non-existent - # `ni_self_employed` PE-UK variable. - assert callable(combined.custom_compute) + assert combined.custom_compute is None def test_parse_nics_falls_back_to_separate_classes_for_old_efo():