diff --git a/changelog/69582.fixed.md b/changelog/69582.fixed.md new file mode 100644 index 000000000000..e41d299937cb --- /dev/null +++ b/changelog/69582.fixed.md @@ -0,0 +1 @@ +Fixed ``salt-run manage.status``/``manage.up``/``manage.down`` reporting every targeted minion as up because synthesized ``no_return`` rows from ``LocalClient.get_cli_event_returns`` were being counted as successful returns. diff --git a/salt/runners/manage.py b/salt/runners/manage.py index 52e54bcdb918..5a4216484432 100644 --- a/salt/runners/manage.py +++ b/salt/runners/manage.py @@ -52,10 +52,21 @@ def _ping(tgt, tgt_type, timeout, gather_job_timeout): tgt, tgt_type, gather_job_timeout=gather_job_timeout, + # ``get_cli_event_returns`` defaults ``expect_minions`` to True so + # the ``salt`` CLI emits per-target timeout rows. The manage + # runner derives ``up`` / ``down`` from actual returns and + # treating the synthesized ``no_return`` rows as returns would + # report every targeted minion as up (issue #69582). + expect_minions=False, ): if fn_ret: - for mid, _ in fn_ret.items(): + for mid, min_ret in fn_ret.items(): + # Defense in depth: drop synthesized timeout rows in case + # a caller passes ``expect_minions=True`` through kwargs + # or a future change re-enables them by default. + if isinstance(min_ret, dict) and min_ret.get("out") == "no_return": + continue log.debug("minion '%s' returned from ping", mid) returned.add(mid) diff --git a/tests/pytests/integration/runners/test_manage_status.py b/tests/pytests/integration/runners/test_manage_status.py new file mode 100644 index 000000000000..7d05f508e8c4 --- /dev/null +++ b/tests/pytests/integration/runners/test_manage_status.py @@ -0,0 +1,155 @@ +""" +Integration regression tests for ``salt-run manage.status`` / +``manage.up`` / ``manage.down``. + +The bug fixed by this module (issue #69582): ``manage._ping`` was +treating the synthesized ``no_return`` rows yielded by +``LocalClient.get_cli_event_returns`` (now emitted by default with +``expect_minions=True``) as successful returns. That made +``manage.status`` report every targeted minion as up and +``manage.down`` return nothing, even when a known-accepted minion was +not running. + +These tests exercise the real runner against live salt-minions plus a +fake accepted-but-absent minion key, so the bug reproduces end to end +without mocks. +""" + +import logging +import os +import shutil + +import pytest + +import salt.utils.path + +log = logging.getLogger(__name__) + +pytestmark = [ + pytest.mark.slow_test, + pytest.mark.windows_whitelisted, +] + + +FAKE_DOWN_MINION_ID = "down-minion-69582" + + +@pytest.fixture +def fake_down_minion(salt_master, salt_minion): + """ + Make the master treat ``down-minion-69582`` as an accepted minion by + cloning ``salt_minion``'s public key under the fake id. No daemon + will respond on that id so any targeting call should time it out -- + which is exactly what ``manage.down`` should surface. + """ + accepted = salt.utils.path.join( + salt_master.config["pki_dir"], "minions", salt_minion.id + ) + fake = salt.utils.path.join( + salt_master.config["pki_dir"], "minions", FAKE_DOWN_MINION_ID + ) + shutil.copyfile(accepted, fake) + try: + yield FAKE_DOWN_MINION_ID + finally: + try: + os.remove(fake) + except FileNotFoundError: + pass + except OSError as exc: + log.error( + "Failed to remove %s, this may affect other tests: %s", + fake, + exc, + ) + + +def _no_return_marker_in(result): + """ + Verify the runner never emitted a synthesized ``no_return`` payload + in a slot that should hold a minion id. This is the original + user-visible symptom of #69582: ``manage.up`` was returning entries + whose ids were actually timeout placeholders. + """ + if isinstance(result, dict): + values = list(result.values()) + keys = list(result.keys()) + else: + values = list(result) + keys = list(result) + for slot in keys + values: + if isinstance(slot, str) and "no_return" in slot.lower(): + return True + if isinstance(slot, dict) and slot.get("out") == "no_return": + return True + return False + + +def test_manage_status_partitions_live_and_down_minions( + salt_run_cli, salt_minion, salt_sub_minion, fake_down_minion +): + """ + ``salt-run manage.status`` must list live minions in ``up`` and the + fake accepted-but-absent minion in ``down``. + + Regression for https://github.com/saltstack/salt/issues/69582 + where the result was ``{"up": [], "down": []}``. + """ + ret = salt_run_cli.run( + "manage.status", "timeout=5", "gather_job_timeout=10", _timeout=120 + ) + assert ret.returncode == 0, ret + data = ret.data + assert isinstance(data, dict), data + assert set(data.keys()) == {"up", "down"}, data + up = data["up"] or [] + down = data["down"] or [] + assert salt_minion.id in up, data + assert salt_sub_minion.id in up, data + assert fake_down_minion in down, data + assert fake_down_minion not in up, data + assert salt_minion.id not in down, data + assert salt_sub_minion.id not in down, data + assert not _no_return_marker_in(up), data + assert not _no_return_marker_in(down), data + + +def test_manage_up_excludes_down_minions( + salt_run_cli, salt_minion, salt_sub_minion, fake_down_minion +): + """ + ``salt-run manage.up`` must omit minions that never returned. + Before the fix the fake accepted-but-absent minion was reported as + up because the synthesized ``no_return`` row was counted as a + successful return. + """ + ret = salt_run_cli.run( + "manage.up", "timeout=5", "gather_job_timeout=10", _timeout=120 + ) + assert ret.returncode == 0, ret + data = ret.data + assert isinstance(data, list), data + assert salt_minion.id in data, data + assert salt_sub_minion.id in data, data + assert fake_down_minion not in data, data + assert not _no_return_marker_in(data), data + + +def test_manage_down_includes_unresponsive_minions( + salt_run_cli, salt_minion, salt_sub_minion, fake_down_minion +): + """ + ``salt-run manage.down`` must list the fake accepted-but-absent + minion. Before the fix it returned an empty list regardless of how + many accepted minions were offline. + """ + ret = salt_run_cli.run( + "manage.down", "timeout=5", "gather_job_timeout=10", _timeout=120 + ) + assert ret.returncode == 0, ret + data = ret.data + assert isinstance(data, list), data + assert fake_down_minion in data, data + assert salt_minion.id not in data, data + assert salt_sub_minion.id not in data, data + assert not _no_return_marker_in(data), data diff --git a/tests/pytests/unit/runners/test_manage_status.py b/tests/pytests/unit/runners/test_manage_status.py new file mode 100644 index 000000000000..3dfb580d553c --- /dev/null +++ b/tests/pytests/unit/runners/test_manage_status.py @@ -0,0 +1,130 @@ +""" +Regression tests for the ``manage.status`` / ``manage.up`` / ``manage.down`` +runner behavior. + +The bug fixed by this module: ``manage._ping`` iterates the events yielded by +``LocalClient.get_cli_event_returns`` and adds every minion id it sees to the +``returned`` set. When ``get_cli_event_returns`` is called with +``expect_minions=True`` it also yields synthesized ``no_return`` entries for +minions that did **not** respond. Treating those synthesized entries as +returns made ``manage.status`` report every minion as up and never list any +minion as down (issue #69582). +""" + +from contextlib import contextmanager + +import pytest + +from salt.runners import manage +from tests.support.mock import MagicMock, patch + + +@pytest.fixture +def configure_loader_modules(): + return { + manage: { + "__opts__": { + "conf_file": "/etc/salt/master", + "timeout": 5, + "gather_job_timeout": 10, + }, + }, + } + + +class _FakeClient: + """Stand-in for ``salt.client.LocalClient`` exposing the surface the + ``manage._ping`` helper uses.""" + + def __init__(self, minions, events): + self._minions = minions + self._events = events + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def run_job(self, *args, **kwargs): # pylint: disable=unused-argument + return {"jid": "20260627000000000000", "minions": list(self._minions)} + + def _get_timeout(self, timeout): + return timeout + + def get_cli_event_returns(self, *args, **kwargs): # pylint: disable=unused-argument + yield from self._events + + +@contextmanager +def _patched_client(fake_client): + @contextmanager + def _factory(*args, **kwargs): # pylint: disable=unused-argument + yield fake_client + + with patch("salt.client.get_local_client", _factory): + yield + + +def test_manage_status_reports_unresponsive_minions_as_down(): + """ + ``manage.status`` must distinguish actual returns from the synthetic + ``no_return`` failure events that ``LocalClient.get_cli_event_returns`` + emits when ``expect_minions`` is True. + + Issue: https://github.com/saltstack/salt/issues/69582 + """ + targeted = ["minion-up", "minion-down"] + events = [ + # Real return from the responsive minion. + {"minion-up": {"ret": True, "retcode": 0, "out": "highstate"}}, + # Synthetic ``no_return`` row emitted by ``get_cli_event_returns`` + # for a minion that did not respond in time. + { + "minion-down": { + "out": "no_return", + "ret": "Minion did not return. [No response]", + "retcode": 2, + }, + }, + ] + fake = _FakeClient(targeted, events) + with _patched_client(fake): + result = manage.status(output=False) + assert result == {"up": ["minion-up"], "down": ["minion-down"]} + + +def test_manage_down_reports_unresponsive_minions(): + """``manage.down`` should include minions that fail to return.""" + targeted = ["minion-up", "minion-down"] + events = [ + {"minion-up": {"ret": True, "retcode": 0, "out": "highstate"}}, + { + "minion-down": { + "out": "no_return", + "ret": "Minion did not return. [No response]", + "retcode": 2, + }, + }, + ] + fake = _FakeClient(targeted, events) + with _patched_client(fake), patch("salt.wheel.Wheel", MagicMock()): + assert manage.down() == ["minion-down"] + + +def test_manage_up_excludes_unresponsive_minions(): + """``manage.up`` should not include minions that fail to return.""" + targeted = ["minion-up", "minion-down"] + events = [ + {"minion-up": {"ret": True, "retcode": 0, "out": "highstate"}}, + { + "minion-down": { + "out": "no_return", + "ret": "Minion did not return. [Not connected]", + "retcode": 2, + }, + }, + ] + fake = _FakeClient(targeted, events) + with _patched_client(fake): + assert manage.up() == ["minion-up"]