From 4b989d3305a1634d17b5b7cba005e4e30cffd53c Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:37:57 +0200 Subject: [PATCH 01/85] Remove all new features, just keep no-op changes --- esmvalcore/_recipe/check.py | 86 ++++++------------- esmvalcore/_recipe/recipe.py | 4 +- esmvalcore/_recipe/to_datasets.py | 2 +- esmvalcore/dataset.py | 43 ++++++---- esmvalcore/local.py | 14 +-- esmvalcore/preprocessor/__init__.py | 2 + esmvalcore/preprocessor/_derive/__init__.py | 55 +++++++----- esmvalcore/preprocessor/_derive/_baseclass.py | 35 ++++---- esmvalcore/preprocessor/_derive/qep.py | 4 +- esmvalcore/typing.py | 2 +- 10 files changed, 115 insertions(+), 132 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index d937cc9432..50e41d7d21 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -33,6 +33,9 @@ if TYPE_CHECKING: from collections.abc import Iterable + from esmvalcore.dataset import Dataset + from esmvalcore.typing import FacetValue + logger = logging.getLogger(__name__) @@ -43,9 +46,7 @@ def ncl_version(): msg = ( "Recipe contains NCL scripts, but cannot find an NCL installation." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) try: cmd = [ncl, "-V"] version = subprocess.check_output(cmd, universal_newlines=True) @@ -55,9 +56,7 @@ def ncl_version(): "Recipe contains NCL scripts, but your NCL " "installation appears to be broken." ) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc version = version.strip() logger.info("Found NCL version %s", version) @@ -68,9 +67,7 @@ def ncl_version(): "NCL version 6.4 or higher is required to run " "a recipe containing NCL scripts." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def recipe_with_schema(filename): @@ -90,9 +87,7 @@ def diagnostics(diags): for name, diagnostic in diags.items(): if "scripts" not in diagnostic: msg = f"Missing scripts section in diagnostic '{name}'." - raise RecipeError( - msg, - ) + raise RecipeError(msg) variable_names = tuple(diagnostic.get("variables", {})) scripts = diagnostic.get("scripts") if scripts is None: @@ -104,17 +99,13 @@ def diagnostics(diags): f"in diagnostic '{name}': scripts cannot have the " "same name as variables." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) if not script.get("script"): msg = ( f"No script defined for script '{script_name}' in " f"diagnostic '{name}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def duplicate_datasets( @@ -129,9 +120,7 @@ def duplicate_datasets( f"groups for variable '{variable_group}' in diagnostic " f"'{diagnostic}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) checked_datasets_ = [] for dataset in datasets: if dataset in checked_datasets_: @@ -139,9 +128,7 @@ def duplicate_datasets( f"Duplicate dataset\n{pformat(dataset)}\nfor variable " f"'{variable_group}' in diagnostic '{diagnostic}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) checked_datasets_.append(dataset) @@ -260,9 +247,7 @@ def preprocessor_supplementaries(dataset, settings): f"one supplementary variable of {ancs['variables']} is " f"defined in the recipe for {dataset}." 
) - raise RecipeError( - msg, - ) + raise RecipeError(msg) if ancs["required"] == "prefer_at_least_one": logger.warning( "Preprocessor function %s works best when at least " @@ -298,9 +283,7 @@ def check_for_temporal_preprocs(profile): f"Time coordinate preprocessor step(s) {temp_preprocs} not permitted on fx " "vars, please remove them from recipe" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def extract_shape(settings): @@ -311,9 +294,7 @@ def extract_shape(settings): "In preprocessor function `extract_shape`: " f"Unable to find 'shapefile: {shapefile}'" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) valid = { "method": {"contains", "representative"}, @@ -323,11 +304,12 @@ def extract_shape(settings): for key in valid: value = settings.get(key) if not (value is None or value in valid[key]): - raise RecipeError( + msg = ( f"In preprocessor function `extract_shape`: Invalid value " f"'{value}' for argument '{key}', choose from " "{}".format(", ".join(f"'{k}'".lower() for k in valid[key])), ) + raise RecipeError(msg) def _verify_span_value(span): @@ -352,9 +334,7 @@ def _verify_groupby(groupby): "`multi_model_statistics`.`groupby` must be defined as a " f"list. Got {groupby}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _verify_keep_input_datasets(keep_input_datasets): @@ -364,9 +344,7 @@ def _verify_keep_input_datasets(keep_input_datasets): f"Must be defined as a boolean (true or false). " f"Got {keep_input_datasets}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _verify_ignore_scalar_coords(ignore_scalar_coords): @@ -376,9 +354,7 @@ def _verify_ignore_scalar_coords(ignore_scalar_coords): f"Must be defined as a boolean (true or false). Got " f"{ignore_scalar_coords}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def multimodel_statistics_preproc(settings): @@ -415,9 +391,7 @@ def _check_delimiter(timerange): "Valid values must be separated by `/`. " f"Got {timerange} instead." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _check_duration_periods(timerange): @@ -428,9 +402,7 @@ def _check_duration_periods(timerange): "Cannot set both the beginning and the end " "as duration periods." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) if timerange[0].startswith("P"): try: @@ -523,9 +495,7 @@ def _check_literal( f"Expected one of {allowed_values} for option `{option}` of " f"preprocessor `{step}`, got '{user_value}'" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) bias_type = partial( @@ -595,9 +565,7 @@ def _check_ref_attributes(products: set, *, step: str, attr_name: str) -> None: f"ensure that the reference dataset is not excluded with the " f"'exclude' option" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) reference_for_bias_preproc = partial( @@ -708,9 +676,7 @@ def regridding_schemes(settings: dict): f"(see https://docs.esmvaltool.org/projects/ESMValCore/en/" f"latest/recipe/preprocessor.html#generic-regridding-schemes)." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) # Check generic regridding schemes (given as dict) if isinstance(scheme, dict): @@ -723,6 +689,4 @@ def regridding_schemes(settings: dict): f"/recipe/preprocessor.html#generic-regridding-schemes for " f"details." 
) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index cb1e1e8275..42b63962ed 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -1005,9 +1005,7 @@ def _resolve_diagnostic_ancestors(self, tasks): "Could not find any ancestors matching " f"'{id_glob}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) logger.debug( "Pattern %s matches %s", id_glob, diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index d2c207224a..08ef4680db 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -216,7 +216,7 @@ def _get_supplementary_short_names( var_facets = dict(facets) _update_cmor_facets(var_facets) realms = var_facets.get("modeling_realm", []) - if isinstance(realms, (str, Number)): + if isinstance(realms, (str, Number, bool)): realms = [str(realms)] ocean_realms = {"ocean", "seaIce", "ocnBgchem"} is_ocean_variable = any(realm in ocean_realms for realm in realms) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 050a9e7326..143f59c1a9 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -410,6 +410,16 @@ def _fix_fx_exp(self) -> None: ) break + def _copy(self, **facets: FacetValue) -> Dataset: + """Create a copy of the parent dataset without supplementaries.""" + new = self.__class__() + new._session = self._session # noqa: SLF001 + for key, value in self.facets.items(): + new.set_facet(key, deepcopy(value), key in self._persist) + for key, value in facets.items(): + new.set_facet(key, deepcopy(value)) + return new + def copy(self, **facets: FacetValue) -> Dataset: """Create a copy. @@ -425,12 +435,7 @@ def copy(self, **facets: FacetValue) -> Dataset: Dataset A copy of the dataset. """ - new = self.__class__() - new._session = self._session # noqa: SLF001 - for key, value in self.facets.items(): - new.set_facet(key, deepcopy(value), key in self._persist) - for key, value in facets.items(): - new.set_facet(key, deepcopy(value)) + new = self._copy(**facets) for supplementary in self.supplementaries: # The short_name and mip of the supplementary variable are probably # different from the main variable, so don't copy those facets. @@ -440,6 +445,7 @@ def copy(self, **facets: FacetValue) -> Dataset: } new_supplementary = supplementary.copy(**supplementary_facets) new.supplementaries.append(new_supplementary) + return new def __eq__(self, other) -> bool: @@ -477,8 +483,8 @@ def facets2str(facets): if self.supplementaries: txt.append("supplementaries:") txt.extend( - textwrap.indent(facets2str(a.facets), " ") - for a in self.supplementaries + textwrap.indent(facets2str(s.facets), " ") + for s in self.supplementaries ) if self._session: txt.append(f"session: '{self.session.session_name}'") @@ -532,10 +538,11 @@ def supplementary_summary(dataset): txt += ( ", supplementaries: " + "; ".join( - supplementary_summary(a) for a in self.supplementaries + supplementary_summary(s) for s in self.supplementaries ) + "" ) + return txt def __getitem__(self, key): @@ -544,7 +551,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): """Set a facet value.""" - self.facets[key] = value + self.set_facet(key, value, persist=False) def set_facet(self, key: str, value: FacetValue, persist: bool = True): """Set facet. @@ -609,15 +616,19 @@ def add_supplementary(self, **facets: FacetValue) -> None: **facets Facets describing the supplementary variable. 
""" + if self.is_derived(): + facets.setdefault("derive", False) + if self.facets.get("force_derivation", False): + facets.setdefault("force_derivation", False) supplementary = self.copy(**facets) supplementary.supplementaries = [] self.supplementaries.append(supplementary) def augment_facets(self) -> None: - """Add extra facets. + """Add additional facets. - This function will update the dataset with additional facets - from various sources. + This function will update the dataset with additional facets from + various sources. """ self._augment_facets() for supplementary in self.supplementaries: @@ -749,7 +760,7 @@ def _find_files(self) -> None: self.files[idx] = file @property - def files(self) -> Sequence[File]: + def files(self) -> list[File]: """The files associated with this dataset.""" if self._files is None: self.find_files() @@ -949,9 +960,7 @@ def _update_timerange(self): timerange = self.facets["timerange"] if not isinstance(timerange, str): msg = f"timerange should be a string, got '{timerange!r}'" - raise TypeError( - msg, - ) + raise TypeError(msg) check.valid_time_selection(timerange) if "*" in timerange: diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 81d2386188..baeeac2757 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -239,15 +239,16 @@ def _replace_years_with_timerange(variable): variable.pop("end_year", None) -def _parse_period(timerange): +def _parse_period(timerange: FacetValue) -> tuple[str, str]: """Parse `timerange` values given as duration periods. Sum the duration periods to the `timerange` value given as a reference point in order to compute the start and end dates needed for file selection. """ - start_date = None - end_date = None + timerange = str(timerange) + start_date: str | None = None + end_date: str | None = None time_format = None datetime_format = ( isodate.DATE_BAS_COMPLETE + "T" + isodate.TIME_BAS_COMPLETE @@ -284,8 +285,9 @@ def _parse_period(timerange): ) end_date = str(isodate.date_isoformat(end_date, format=time_format)) - if start_date is None and end_date is None: + if start_date is None: start_date = timerange.split("/")[0] + if end_date is None: end_date = timerange.split("/")[1] return start_date, end_date @@ -379,9 +381,7 @@ def _replace_tags( f"Dataset key '{tag}' must be specified for {variable}, check " f"your recipe entry and/or extra facet file(s)" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) pathset = _replace_tag(pathset, original_tag, replacewith) return [Path(p) for p in pathset] diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index da5eea2f78..9c80676f7a 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -99,6 +99,8 @@ from dask.delayed import Delayed + from esmvalcore.dataset import Dataset + logger = logging.getLogger(__name__) __all__ = [ diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index f473956056..cbd181d2c8 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -5,14 +5,17 @@ from copy import deepcopy from pathlib import Path -import iris +from cf_units import Unit +from iris.cube import Cube, CubeList +from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase from esmvalcore.preprocessor._units import convert_units +from esmvalcore.typing import Facets, FacetValue logger = logging.getLogger(__name__) -def _get_all_derived_variables(): +def _get_all_derived_variables() -> 
dict[str, type[DerivedVariableBase]]: """Get all possible derived variables. Returns @@ -31,55 +34,63 @@ def _get_all_derived_variables(): return derivers -ALL_DERIVED_VARIABLES = _get_all_derived_variables() +ALL_DERIVED_VARIABLES: dict[str, type[DerivedVariableBase]] = ( + _get_all_derived_variables() +) __all__ = list(ALL_DERIVED_VARIABLES) -def get_required(short_name, project): +def get_required(short_name: FacetValue, project: FacetValue) -> list[Facets]: """Return all required variables for derivation. - Get all information (at least `short_name`) required for derivation. + Get all information (at least ``short_name``) required for derivation. Parameters ---------- - short_name : str - `short_name` of the variable to derive. - project : str - `project` of the variable to derive. + short_name: + Short name of the variable to derive. + project: + Project of the variable to derive. Returns ------- - list - List of dictionaries (including at least the key `short_name`). + list[esmvalcore.typing.Facets] + List of facets (including at least the key ``short_name``). + """ + short_name = str(short_name) if short_name.lower() not in ALL_DERIVED_VARIABLES: msg = ( - f"Cannot derive variable '{short_name}', no derivation script " + f"Cannot derive variable '{short_name}': no derivation script " f"available" ) - raise NotImplementedError( - msg, - ) + raise NotImplementedError(msg) DerivedVariable = ALL_DERIVED_VARIABLES[short_name.lower()] # noqa: N806 return deepcopy(DerivedVariable().required(project)) -def derive(cubes, short_name, long_name, units, standard_name=None): +def derive( + cubes: CubeList, + short_name: str, + long_name: str, + units: str | Unit, + standard_name: str | None = None, +) -> Cube: """Derive variable. Parameters ---------- - cubes: iris.cube.CubeList + cubes: Includes all the needed variables for derivation defined in :func:`get_required`. - short_name: str + short_name: short_name - long_name: str + long_name: long_name - units: str + units: units - standard_name: str, optional + standard_name: standard_name Returns @@ -90,7 +101,7 @@ def derive(cubes, short_name, long_name, units, standard_name=None): if short_name == cubes[0].var_name: return cubes[0] - cubes = iris.cube.CubeList(cubes) + cubes = CubeList(cubes) # Derive variable DerivedVariable = ALL_DERIVED_VARIABLES[short_name.lower()] # noqa: N806 diff --git a/esmvalcore/preprocessor/_derive/_baseclass.py b/esmvalcore/preprocessor/_derive/_baseclass.py index 2d818f1ca3..4e71f66dd6 100644 --- a/esmvalcore/preprocessor/_derive/_baseclass.py +++ b/esmvalcore/preprocessor/_derive/_baseclass.py @@ -2,13 +2,17 @@ from abc import abstractmethod +from iris.cube import Cube, CubeList + +from esmvalcore.typing import Facets, FacetValue + class DerivedVariableBase: """Base class for derived variables.""" @staticmethod @abstractmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Return required variables for derivation. This method needs to be overridden in the child class belonging to the @@ -16,27 +20,27 @@ def required(project): Note ---- - It is possible to declare a required variable as `optional=True`, which - allows the skipping of this particular variable during data extraction. - For example, this is useful for fx variables which are often not - available for observational datasets. Otherwise, the tool will fail if - not all required variables are available for all datasets. 
+ It is possible to declare a required variable as ``optional=True``, + which allows the skipping of this particular variable during data + extraction. For example, this is useful for fx variables which are + often not available for observational datasets. Otherwise, the tool + will fail if not all required variables are available for all datasets. Parameters ---------- - project : str + project: Project of the dataset for which the desired variable is derived. Returns ------- - list of dict - List of variable metadata. + list[esmvalcore.typing.Facets] + List of facets. """ @staticmethod @abstractmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """Compute desired derived variable. This method needs to be overridden in the child class belonging to the @@ -44,20 +48,13 @@ def calculate(cubes): Parameters ---------- - cubes : iris.cube.CubeList + cubes: Includes all the needed variables (incl. fx variables) for - derivation defined in the static class variable - `_required_variables`. + derivation defined in ``required``. Returns ------- iris.cube.Cube New derived variable. - Raises - ------ - NotImplementedError - If the desired variable derivation is not implemented, i.e. if this - method is called from this base class and not a child class. - """ diff --git a/esmvalcore/preprocessor/_derive/qep.py b/esmvalcore/preprocessor/_derive/qep.py index 3626b5abdf..19d677f618 100644 --- a/esmvalcore/preprocessor/_derive/qep.py +++ b/esmvalcore/preprocessor/_derive/qep.py @@ -3,6 +3,8 @@ from iris import Constraint from iris.cube import Cube, CubeList +from esmvalcore.typing import Facets, FacetValue + from ._baseclass import DerivedVariableBase @@ -10,7 +12,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `qep`.""" @staticmethod - def required(project: str) -> list[dict[str, str]]: + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" return [ {"short_name": "evspsbl"}, diff --git a/esmvalcore/typing.py b/esmvalcore/typing.py index 361f886535..7880bdac1b 100644 --- a/esmvalcore/typing.py +++ b/esmvalcore/typing.py @@ -9,7 +9,7 @@ import numpy as np from iris.cube import Cube -FacetValue = str | Sequence[str] | Number +FacetValue = str | Sequence[str] | Number | bool """Type describing a single facet.""" Facets = dict[str, FacetValue] From b0c44f65bd8d8ab9fedfb097c58f281b5bbe1015 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:42:15 +0200 Subject: [PATCH 02/85] Further no-op changes --- esmvalcore/_recipe/check.py | 2 -- esmvalcore/dataset.py | 20 ++++++++++++++++++++ esmvalcore/preprocessor/__init__.py | 2 -- esmvalcore/preprocessor/_derive/ohc.py | 16 ++++++---------- esmvalcore/preprocessor/_derive/vegfrac.py | 5 ++++- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 50e41d7d21..738159bc17 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -33,8 +33,6 @@ if TYPE_CHECKING: from collections.abc import Iterable - from esmvalcore.dataset import Dataset - from esmvalcore.typing import FacetValue logger = logging.getLogger(__name__) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 143f59c1a9..e1ef99dd20 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -164,6 +164,26 @@ def from_recipe( return datasets_from_recipe(recipe, session) + def is_derived(self) -> bool: + """Return ``True`` for derived variables, ``False`` otherwise.""" + return 
bool(self.facets.get("derive", False)) + + def derivation_necessary(self) -> bool: + """Return ``True`` if derivation is necessary, ``False`` otherwise.""" + # If variable cannot be derived, derivation is not necessary + if not self.is_derived(): + return False + + # If forced derivation is requested, derivation is necessary + if self.facets.get("force_derivation", False): + return True + + # Otherwise, derivation is necessary if no files for the dataset + # itself are found + ds_copy = self.copy() + ds_copy.supplementaries = [] + return not ds_copy.files + def _file_to_dataset( self, file: esgf.ESGFFile | local.LocalFile, diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 9c80676f7a..da5eea2f78 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -99,8 +99,6 @@ from dask.delayed import Delayed - from esmvalcore.dataset import Dataset - logger = logging.getLogger(__name__) __all__ = [ diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index ff970e2641..47aa7b2fc8 100644 --- a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -15,16 +15,12 @@ class DerivedVariable(DerivedVariableBase): @staticmethod def required(project): """Declare the variables needed for derivation.""" - required = [ - {"short_name": "thetao"}, - {"short_name": "volcello", "mip": "fx"}, - ] - if project == "CMIP6": - required = [ - {"short_name": "thetao"}, - {"short_name": "volcello", "mip": "Ofx"}, - ] - return required + volcello = {"short_name": "volcello", "mip": "fx"} + if project == "CMIP5": + volcello["ensemble"] = "r0i0p0" + elif project == "CMIP6": + volcello["mip"] = "Ofx" + return [{"short_name": "thetao"}, volcello] @staticmethod def calculate(cubes): diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index 3cd00a2cb2..c48e723f42 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -15,10 +15,13 @@ class DerivedVariable(DerivedVariableBase): @staticmethod def required(project): """Declare the variables needed for derivation.""" + sftlf = {"short_name": "sftlf", "mip": "fx"} + if project == "CMIP5": + sftlf["ensemble"] = "r0i0p0" return [ {"short_name": "baresoilFrac"}, {"short_name": "residualFrac"}, - {"short_name": "sftlf", "mip": "fx"}, + sftlf, ] @staticmethod From 1dd5671d043ec687827f9daaad0b0f975bb9c4bd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:44:07 +0200 Subject: [PATCH 03/85] force_derivation=True without derive=True does not make sense --- esmvalcore/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index e1ef99dd20..65169e1644 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -136,6 +136,16 @@ def __init__(self, **facets: FacetValue): for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) + if not self.is_derived() and self.facets.get( + "force_derivation", + False, + ): + msg = ( + "Facet `force_derivation=True` can only be used for derived " + "variables (i.e., with facet `derive=True`)" + ) + raise ValueError(msg) + @staticmethod def from_recipe( recipe: Path | str | dict, From 8989549a9529600e4217b8a9ad862a715563146b Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:48:22 +0200 Subject: [PATCH 04/85] Add tests --- tests/unit/test_dataset.py | 146 
+++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 291ff99fe2..9624f697f6 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2135,3 +2135,149 @@ def test_get_extra_facets_native6(): "grib_id": "130", "tres": "1M", } + + +def test_derivation_necessary_no_derivation(): + dataset = Dataset( + project="OBS6", + dataset="SAT", + mip="Amon", + short_name="tas", + tier=2, + type="sat", + timerange="1980/2000", + ) + assert not dataset.derivation_necessary() + + +def test_derivation_necessary_no_force_derivation_no_files(): + dataset = Dataset( + project="OBS6", + dataset="SAT", + mip="Amon", + short_name="asr", + tier=2, + type="sat", + timerange="1980/2000", + derive=True, + ) + assert dataset.derivation_necessary() + + +def test_derivation_necessary_no_force_derivation(tmp_path, session): + dataset = Dataset( + project="OBS6", + dataset="SAT", + mip="Amon", + short_name="asr", + tier=2, + type="sat", + timerange="1980/2000", + derive=True, + ) + dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + asr_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + ) + asr_file.touch() + assert not dataset.derivation_necessary() + + +def test_derivation_necessary_force_derivation(tmp_path, session): + dataset = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="lwcre", + exp="historical", + grid="gn", + ensemble="r1i1p1f1", + derive=True, + force_derivation=True, + ) + dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + asr_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + ) + asr_file.touch() + assert dataset.derivation_necessary() + + +def test_force_derivation_no_derived(): + msg = ( + r"Facet `force_derivation=True` can only be used for derived " + r"variables" + ) + + with pytest.raises(ValueError, match=msg): + Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="tas", + force_derivation=True, + ) + + with pytest.raises(ValueError, match=msg): + Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="tas", + derive=False, + force_derivation=True, + ) + + +def test_add_supplementary_to_derived(): + dataset = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="lwcre", + derive=True, + force_derivation=True, + ) + + dataset.add_supplementary(short_name="areacella", mip="fx") + + expected_supplementary = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="fx", + short_name="areacella", + derive=False, + force_derivation=False, + ) + assert dataset.supplementaries[0] == expected_supplementary + + +def test_add_derived_supplementary_to_derived(): + dataset = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="lwcre", + derive=True, + force_derivation=True, + ) + + dataset.add_supplementary( + short_name="asr", + derive=True, + force_derivation=True, + ) + + expected_supplementary = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="asr", + derive=True, + force_derivation=True, + ) + assert dataset.supplementaries[0] == expected_supplementary From 1f6dfa3c9f101bcbe7873588ef0997e1924dac09 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 16:08:17 +0200 Subject: [PATCH 05/85] Add type hints to check.py --- 
esmvalcore/_recipe/check.py | 95 +++++++++++++++++-------------- esmvalcore/_recipe/to_datasets.py | 2 +- 2 files changed, 52 insertions(+), 45 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 738159bc17..dd37a31893 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -31,13 +31,18 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence + from pathlib import Path + + from esmvalcore._task import TaskSet + from esmvalcore.dataset import Dataset + from esmvalcore.typing import Facets, FacetValue logger = logging.getLogger(__name__) -def ncl_version(): +def ncl_version() -> None: """Check the NCL version.""" ncl = which("ncl") if not ncl: @@ -68,7 +73,7 @@ def ncl_version(): raise RecipeError(msg) -def recipe_with_schema(filename): +def recipe_with_schema(filename: Path) -> None: """Check if the recipe content matches schema.""" schema_file = os.path.join(os.path.dirname(__file__), "recipe_schema.yml") logger.debug("Checking recipe against schema %s", schema_file) @@ -77,7 +82,7 @@ def recipe_with_schema(filename): yamale.validate(schema, recipe, strict=False) -def diagnostics(diags): +def diagnostics(diags: dict[str, dict[str, Any]] | None) -> None: """Check diagnostics in recipe.""" if diags is None: msg = "The given recipe does not have any diagnostic." @@ -149,7 +154,7 @@ def variable( ) -def _log_data_availability_errors(dataset): +def _log_data_availability_errors(dataset: Dataset) -> None: """Check if the required input data is available.""" input_files = dataset.files patterns = dataset._file_globs # noqa: SLF001 @@ -164,7 +169,7 @@ def _log_data_availability_errors(dataset): logger.error("Set 'log_level' to 'debug' to get more information") -def _group_years(years): +def _group_years(years: Iterable[int]) -> str: """Group an iterable of years into easy to read text. Example @@ -190,7 +195,7 @@ def _group_years(years): return ", ".join(ranges) -def data_availability(dataset, log=True): +def data_availability(dataset: Dataset, log: bool = True) -> None: """Check if input_files cover the required years.""" input_files = dataset.files facets = dataset.facets @@ -209,7 +214,7 @@ def data_availability(dataset, log=True): start_year = int(start_date[0:4]) end_year = int(end_date[0:4]) required_years = set(range(start_year, end_year + 1, 1)) - available_years = set() + available_years: set[int] = set() for file in input_files: start, end = _get_start_end_year(file) @@ -228,7 +233,10 @@ def data_availability(dataset, log=True): ) -def preprocessor_supplementaries(dataset, settings): +def preprocessor_supplementaries( + dataset: Dataset, + settings: dict[str, Any], +) -> None: """Check that the required supplementary variables have been added.""" steps = [step for step in settings if step in PREPROCESSOR_SUPPLEMENTARIES] supplementaries = {d.facets["short_name"] for d in dataset.supplementaries} @@ -257,7 +265,7 @@ def preprocessor_supplementaries(dataset, settings): ) -def tasks_valid(tasks): +def tasks_valid(tasks: TaskSet) -> None: """Check that tasks are consistent.""" filenames = set() msg = "Duplicate preprocessor filename {}, please file a bug report." 
@@ -269,7 +277,7 @@ def tasks_valid(tasks): filenames.add(product.filename) -def check_for_temporal_preprocs(profile): +def check_for_temporal_preprocs(profile: dict[str, Any]) -> None: """Check for temporal operations on fx variables.""" temp_preprocs = [ preproc @@ -284,17 +292,17 @@ def check_for_temporal_preprocs(profile): raise RecipeError(msg) -def extract_shape(settings): +def extract_shape(settings: dict[str, Any]) -> None: """Check that `extract_shape` arguments are valid.""" shapefile = settings.get("shapefile", "") if not os.path.exists(shapefile): msg = ( - "In preprocessor function `extract_shape`: " - f"Unable to find 'shapefile: {shapefile}'" + f"In preprocessor function `extract_shape`: Unable to find " + f"'shapefile: {shapefile}'" ) raise RecipeError(msg) - valid = { + valid: dict[str, set[Any]] = { "method": {"contains", "representative"}, "crop": {True, False}, "decomposed": {True, False}, @@ -305,12 +313,12 @@ def extract_shape(settings): msg = ( f"In preprocessor function `extract_shape`: Invalid value " f"'{value}' for argument '{key}', choose from " - "{}".format(", ".join(f"'{k}'".lower() for k in valid[key])), + "{}".format(", ".join(f"'{k}'".lower() for k in valid[key])) ) raise RecipeError(msg) -def _verify_span_value(span): +def _verify_span_value(span: str) -> None: """Raise error if span argument cannot be verified.""" valid_names = ("overlap", "full") if span not in valid_names: @@ -319,12 +327,10 @@ def _verify_span_value(span): f"`multi_model_statistics`. Valid values are {valid_names}." f"Got {span}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) -def _verify_groupby(groupby): +def _verify_groupby(groupby: Any) -> None: """Raise error if groupby arguments cannot be verified.""" if not isinstance(groupby, list): msg = ( @@ -335,7 +341,7 @@ def _verify_groupby(groupby): raise RecipeError(msg) -def _verify_keep_input_datasets(keep_input_datasets): +def _verify_keep_input_datasets(keep_input_datasets: Any) -> None: if not isinstance(keep_input_datasets, bool): msg = ( f"Invalid value encountered for `keep_input_datasets`." @@ -345,7 +351,7 @@ def _verify_keep_input_datasets(keep_input_datasets): raise RecipeError(msg) -def _verify_ignore_scalar_coords(ignore_scalar_coords): +def _verify_ignore_scalar_coords(ignore_scalar_coords: Any) -> None: if not isinstance(ignore_scalar_coords, bool): msg = ( f"Invalid value encountered for `ignore_scalar_coords`." 
@@ -355,13 +361,13 @@ def _verify_ignore_scalar_coords(ignore_scalar_coords): raise RecipeError(msg) -def multimodel_statistics_preproc(settings): +def multimodel_statistics_preproc(settings: dict[str, Any]) -> None: """Check that the multi-model settings are valid.""" - span = settings.get("span", None) # optional, default: overlap + span = settings.get("span") # optional, default: overlap if span: _verify_span_value(span) - groupby = settings.get("groupby", None) # optional, default: None + groupby = settings.get("groupby") # optional, default: None if groupby: _verify_groupby(groupby) @@ -372,7 +378,7 @@ def multimodel_statistics_preproc(settings): _verify_ignore_scalar_coords(ignore_scalar_coords) -def ensemble_statistics_preproc(settings): +def ensemble_statistics_preproc(settings: dict[str, Any]) -> None: """Check that the ensemble settings are valid.""" span = settings.get("span", "overlap") # optional, default: overlap if span: @@ -382,7 +388,7 @@ def ensemble_statistics_preproc(settings): _verify_ignore_scalar_coords(ignore_scalar_coords) -def _check_delimiter(timerange): +def _check_delimiter(timerange: Sequence[str]) -> None: if len(timerange) != 2: msg = ( "Invalid value encountered for `timerange`. " @@ -392,7 +398,7 @@ def _check_delimiter(timerange): raise RecipeError(msg) -def _check_duration_periods(timerange): +def _check_duration_periods(timerange: list[str]) -> None: # isodate duration must always start with P if timerange[0].startswith("P") and timerange[1].startswith("P"): msg = ( @@ -422,14 +428,14 @@ def _check_duration_periods(timerange): raise RecipeError(msg) from exc -def _check_format_years(date): +def _check_format_years(date: str) -> str: if date != "*" and not date.startswith("P"): if len(date) < 4: date = date.zfill(4) return date -def _check_timerange_values(date, timerange): +def _check_timerange_values(date: str, timerange: Iterable[str]) -> None: # Wildcards are fine if date == "*": return @@ -453,18 +459,21 @@ def _check_timerange_values(date, timerange): raise RecipeError(msg) from exc -def valid_time_selection(timerange): +def valid_time_selection(timerange: str) -> None: """Check that `timerange` tag is well defined.""" if timerange != "*": - timerange = timerange.split("/") - _check_delimiter(timerange) - _check_duration_periods(timerange) - for date in timerange: + timerange_list: list[str] = timerange.split("/") + _check_delimiter(timerange_list) + _check_duration_periods(timerange_list) + for date in timerange_list: date = _check_format_years(date) - _check_timerange_values(date, timerange) + _check_timerange_values(date, timerange_list) -def differing_timeranges(timeranges, required_vars): +def differing_timeranges( + timeranges: set[FacetValue], + required_vars: list[Facets], +) -> None: """Log error if required variables have differing timeranges.""" if len(timeranges) > 1: msg = ( @@ -472,9 +481,7 @@ def differing_timeranges(timeranges, required_vars): f"found for required variables {required_vars}. " "Set `timerange` to a common value." 
) - raise ValueError( - msg, - ) + raise ValueError(msg) def _check_literal( @@ -596,7 +603,7 @@ def statistics_preprocessors(settings: dict) -> None: _check_regular_stat(step, step_settings) -def _check_regular_stat(step, step_settings): +def _check_regular_stat(step: str, step_settings: dict[str, Any]) -> None: """Check regular statistics (non-multi-model statistics) step.""" step_settings = dict(step_settings) @@ -632,7 +639,7 @@ def _check_regular_stat(step, step_settings): raise RecipeError(msg) from exc -def _check_mm_stat(step, step_settings): +def _check_mm_stat(step: str, step_settings: dict[str, Any]) -> None: """Check multi-model statistic step.""" statistics = step_settings.get("statistics", []) for stat in statistics: @@ -647,7 +654,7 @@ def _check_mm_stat(step, step_settings): raise RecipeError(msg) from exc -def regridding_schemes(settings: dict): +def regridding_schemes(settings: dict[str, Any]) -> None: """Check :obj:`str` regridding schemes.""" if "regrid" not in settings: return diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 08ef4680db..ee332e8927 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -581,7 +581,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: datasets.append(input_dataset) # Check timeranges of available input data. - timeranges = set() + timeranges: set[FacetValue] = set() for input_dataset in datasets: if "timerange" in input_dataset.facets: timeranges.add(input_dataset.facets["timerange"]) From b6a6651ff4c15b346ba064736de1e52bd51d9567 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 16:40:08 +0200 Subject: [PATCH 06/85] Added type hints for recipe.py --- esmvalcore/_recipe/check.py | 8 +- esmvalcore/_recipe/recipe.py | 233 ++++++++++++++++++---------- esmvalcore/preprocessor/__init__.py | 6 +- 3 files changed, 152 insertions(+), 95 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index dd37a31893..e79db4f2ee 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -149,9 +149,7 @@ def variable( f"Missing keys {missing} in\n{pformat(var)}\nfor variable " f"'{variable_group}' in diagnostic '{diagnostic}'." 
) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _log_data_availability_errors(dataset: Dataset) -> None: @@ -228,9 +226,7 @@ def data_availability(dataset: Dataset, log: bool = True) -> None: missing_txt, "\n".join(str(f) for f in input_files), ) - raise InputFilesNotFound( - msg, - ) + raise InputFilesNotFound(msg) def preprocessor_supplementaries( diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 42b63962ed..0c61307c0b 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -16,7 +16,7 @@ from esmvalcore import __version__, esgf from esmvalcore._provenance import get_recipe_provenance -from esmvalcore._task import DiagnosticTask, ResumeTask, TaskSet +from esmvalcore._task import BaseTask, DiagnosticTask, ResumeTask, TaskSet from esmvalcore.config._config import TASKSEP from esmvalcore.config._dask import validate_dask_config from esmvalcore.config._diagnostics import TAGS @@ -57,7 +57,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable, Sequence + from collections.abc import Iterable + + from esmvalcore.config import Session + from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -70,7 +73,7 @@ """Use a global variable to keep track of datasets that are actually used.""" -def read_recipe_file(filename: Path, session): +def read_recipe_file(filename: Path, session: Session) -> Recipe: """Read a recipe from file.""" check.recipe_with_schema(filename) with open(filename, encoding="utf-8") as file: @@ -79,7 +82,7 @@ def read_recipe_file(filename: Path, session): return Recipe(raw_recipe, session, recipe_file=filename) -def _special_name_to_dataset(facets, special_name): +def _special_name_to_dataset(facets: Facets, special_name: str) -> str: """Convert special names to dataset names.""" if special_name in ("reference_dataset", "alternative_dataset"): if special_name not in facets: @@ -93,15 +96,17 @@ def _special_name_to_dataset(facets, special_name): diagnostic=facets["diagnostic"], ) ) - raise RecipeError( - msg, - ) - special_name = facets[special_name] + raise RecipeError(msg) + dataset_name = str(facets[special_name]) - return special_name + return dataset_name -def _update_target_levels(dataset, datasets, settings): +def _update_target_levels( + dataset: Dataset, + datasets: list[Dataset], + settings: dict[str, Any], +) -> None: """Replace the target levels dataset name with a filename if needed.""" levels = settings.get("extract_levels", {}).get("levels") if not levels: @@ -135,7 +140,11 @@ def _update_target_levels(dataset, datasets, settings): ) -def _update_target_grid(dataset, datasets, settings): +def _update_target_grid( + dataset: Dataset, + datasets: list[Dataset], + settings: dict[str, Any], +) -> None: """Replace the target grid dataset name with a filename if needed.""" grid = settings.get("regrid", {}).get("target_grid") if not grid: @@ -169,7 +178,7 @@ def _update_regrid_time(dataset: Dataset, settings: dict) -> None: settings["regrid_time"]["frequency"] = dataset.facets["frequency"] -def _select_dataset(dataset_name, datasets): +def _select_dataset(dataset_name: str, datasets: list[Dataset]) -> Dataset: for dataset in datasets: if dataset.facets["dataset"] == dataset_name: return dataset @@ -179,12 +188,13 @@ def _select_dataset(dataset_name, datasets): f"Unable to find dataset '{dataset_name}' in the list of datasets" f"for variable '{variable_group}' of diagnostic '{diagnostic}'." 
) - raise RecipeError( - msg, - ) + raise RecipeError(msg) -def _limit_datasets(datasets, profile): +def _limit_datasets( + datasets: list[Dataset], + profile: dict[str, Any], +) -> list[Dataset]: """Try to limit the number of datasets to max_datasets.""" max_datasets = datasets[0].session["max_datasets"] if not max_datasets: @@ -208,13 +218,13 @@ def _limit_datasets(datasets, profile): logger.info( "Only considering %s", - ", ".join(d.facets["alias"] for d in limited), + ", ".join(str(d.facets["alias"]) for d in limited), ) return limited -def _get_default_settings(dataset): +def _get_default_settings(dataset: Dataset) -> dict[str, Any]: """Get default preprocessor settings.""" session = dataset.session facets = dataset.facets @@ -243,7 +253,10 @@ def _get_default_settings(dataset): return settings -def _add_dataset_specific_settings(dataset: Dataset, settings: dict) -> None: +def _add_dataset_specific_settings( + dataset: Dataset, + settings: dict[str, Any], +) -> None: """Add dataset-specific settings.""" project = dataset.facets["project"] dataset_name = dataset.facets["dataset"] @@ -271,7 +284,11 @@ def _add_dataset_specific_settings(dataset: Dataset, settings: dict) -> None: ) -def _exclude_dataset(settings, facets, step): +def _exclude_dataset( + settings: dict[str, Any], + facets: Facets, + step: str, +) -> None: """Exclude dataset from specific preprocessor step if requested.""" exclude = { _special_name_to_dataset(facets, dataset) @@ -286,14 +303,17 @@ def _exclude_dataset(settings, facets, step): ) -def _update_weighting_settings(settings, facets): +def _update_weighting_settings( + settings: dict[str, Any], + facets: Facets, +) -> None: """Update settings for the weighting preprocessors.""" if "weighting_landsea_fraction" not in settings: return _exclude_dataset(settings, facets, "weighting_landsea_fraction") -def _add_to_download_list(dataset): +def _add_to_download_list(dataset: Dataset) -> None: """Add the files of `dataset` to `DOWNLOAD_FILES`.""" for i, file in enumerate(dataset.files): if isinstance(file, esgf.ESGFFile): @@ -301,7 +321,7 @@ def _add_to_download_list(dataset): dataset.files[i] = file.local_file(dataset.session["download_dir"]) -def _schedule_for_download(datasets): +def _schedule_for_download(datasets: list[Dataset]) -> None: """Schedule files for download.""" for dataset in datasets: _add_to_download_list(dataset) @@ -354,14 +374,16 @@ def _check_input_files(input_datasets: Iterable[Dataset]) -> set[str]: return missing -def _apply_preprocessor_profile(settings, profile_settings): +def _apply_preprocessor_profile( + settings: dict[str, Any], + profile_settings: dict[str, Any], +) -> None: """Apply settings from preprocessor profile.""" profile_settings = deepcopy(profile_settings) for step, args in profile_settings.items(): # Remove disabled preprocessor functions if args is False: - if step in settings: - del settings[step] + settings.pop(step, None) continue # Enable/update functions without keywords if step not in settings: @@ -370,9 +392,12 @@ def _apply_preprocessor_profile(settings, profile_settings): settings[step].update(args) -def _get_common_attributes(products, settings): +def _get_common_attributes( + products: set[PreprocessorFile], + settings: dict[str, Any], +) -> dict[str, Any]: """Get common attributes for the output products.""" - attributes = {} + attributes: dict[str, Any] = {} some_product = next(iter(products)) for key, value in some_product.attributes.items(): if all(p.attributes.get(key, object()) == value for p in products): @@ 
-420,7 +445,11 @@ def _get_common_attributes(products, settings): return attributes -def _get_downstream_settings(step, order, products): +def _get_downstream_settings( + step: str, + order: tuple[str, ...], + products: set[PreprocessorFile], +) -> dict[str, Any]: """Get downstream preprocessor settings shared between products.""" settings = {} remaining_steps = order[order.index(step) + 1 :] @@ -434,7 +463,10 @@ def _get_downstream_settings(step, order, products): return settings -def _update_multi_dataset_settings(facets, settings): +def _update_multi_dataset_settings( + facets: Facets, + settings: dict[str, Any], +) -> None: """Configure multi dataset statistics.""" for step in MULTI_MODEL_FUNCTIONS: if not settings.get(step): @@ -443,7 +475,7 @@ def _update_multi_dataset_settings(facets, settings): _exclude_dataset(settings, facets, step) -def _get_tag(step, identifier, statistic): +def _get_tag(step: str, identifier: str, statistic: str) -> str: # Avoid . in filename for percentiles statistic = statistic.replace(".", "-") @@ -457,7 +489,12 @@ def _get_tag(step, identifier, statistic): return tag -def _update_multiproduct(input_products, order, preproc_dir, step): +def _update_multiproduct( + input_products: set[PreprocessorFile], + order: tuple[str, ...], + preproc_dir: Path, + step: str, +) -> tuple[set[PreprocessorFile], dict[str, Any]]: """Return new products that are aggregated over multiple datasets. These new products will replace the original products at runtime. @@ -483,7 +520,7 @@ def _update_multiproduct(input_products, order, preproc_dir, step): downstream_settings = _get_downstream_settings(step, order, multiproducts) - relevant_settings = { + relevant_settings: dict[str, Any] = { "output_products": defaultdict(dict), } # pass to ancestors @@ -524,7 +561,11 @@ def _update_multiproduct(input_products, order, preproc_dir, step): return output_products, relevant_settings -def update_ancestors(ancestors, step, downstream_settings): +def update_ancestors( + ancestors: set[PreprocessorFile], + step: str, + downstream_settings: dict[str, Any], +) -> None: """Retroactively add settings to ancestor products.""" for product in ancestors: if step in product.settings: @@ -533,7 +574,7 @@ def update_ancestors(ancestors, step, downstream_settings): settings[key] = value -def _update_extract_shape(settings, session): +def _update_extract_shape(settings: dict[str, Any], session: Session) -> None: if "extract_shape" in settings: shapefile = settings["extract_shape"].get("shapefile") if shapefile: @@ -542,7 +583,7 @@ def _update_extract_shape(settings, session): check.extract_shape(settings["extract_shape"]) -def _allow_skipping(dataset: Dataset): +def _allow_skipping(dataset: Dataset) -> bool: """Allow skipping of datasets.""" return all( [ @@ -553,7 +594,7 @@ def _allow_skipping(dataset: Dataset): ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]): +def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: """Set the 'version' facet based on derivation input datasets.""" versions = set() for in_dataset in input_datasets: @@ -573,7 +614,7 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]): def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], - order: list[str], + order: tuple[str, ...], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. 
@@ -629,9 +670,7 @@ def _get_preprocessor_products( f"Missing data for preprocessor {name}:{separator}" f"{separator.join(sorted(missing_vars))}" ) - raise InputFilesNotFound( - msg, - ) + raise InputFilesNotFound(msg) check.reference_for_bias_preproc(products) check.reference_for_distance_metric_preproc(products) @@ -651,11 +690,11 @@ def _get_preprocessor_products( def _configure_multi_product_preprocessor( - products: Iterable[PreprocessorFile], + products: set[PreprocessorFile], preproc_dir: Path, profile: PreprocessorSettings, - order: Sequence[str], -): + order: tuple[str, ...], +) -> None: """Configure preprocessing of ensemble and multimodel statistics.""" ensemble_step = "ensemble_statistics" multi_model_step = "multi_model_statistics" @@ -718,7 +757,12 @@ def _set_start_end_year(product: PreprocessorFile) -> None: product.attributes["end_year"] = int(str(end_year[0:4])) -def _update_preproc_functions(settings, dataset, datasets, missing_vars): +def _update_preproc_functions( + settings: dict[str, Any], + dataset: Dataset, + datasets: list[Dataset], + missing_vars: set[str], +) -> None: session = dataset.session _update_extract_shape(settings, session) _update_weighting_settings(settings, dataset.facets) @@ -748,20 +792,22 @@ def _update_preproc_functions(settings, dataset, datasets, missing_vars): check.resample_hours(settings) -def _get_preprocessor_task(datasets, profiles, task_name): +def _get_preprocessor_task( + datasets: list[Dataset], + profiles: dict[str, Any], + task_name: str, +) -> PreprocessingTask: """Create preprocessor task(s) for a set of datasets.""" # First set up the preprocessor profile facets = datasets[0].facets session = datasets[0].session - preprocessor = facets.get("preprocessor", "default") + preprocessor = str(facets.get("preprocessor", "default")) if preprocessor not in profiles: msg = ( f"Unknown preprocessor '{preprocessor}' in variable " f"{facets['variable_group']} of diagnostic {facets['diagnostic']}" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) logger.info( "Creating preprocessor '%s' task for variable '%s'", preprocessor, @@ -800,7 +846,7 @@ def _get_preprocessor_task(datasets, profiles, task_name): return task -def _extract_preprocessor_order(profile): +def _extract_preprocessor_order(profile: dict[str, Any]) -> tuple[str, ...]: """Extract the order of the preprocessing steps from the profile.""" custom_order = profile.pop("custom_order", False) if not custom_order: @@ -816,7 +862,12 @@ def _extract_preprocessor_order(profile): class Recipe: """Recipe object.""" - def __init__(self, raw_recipe, session, recipe_file: Path): + def __init__( + self, + raw_recipe: dict[str, Any], + session: Session, + recipe_file: Path, + ) -> None: """Parse a recipe file into an object.""" validate_dask_config(session["dask"]) @@ -846,7 +897,7 @@ def __init__(self, raw_recipe, session, recipe_file: Path): self._log_recipe_errors(exc) raise - def _log_recipe_errors(self, exc): + def _log_recipe_errors(self, exc: RecipeError) -> None: """Log a message with recipe errors.""" logger.error(exc.message) for task in exc.failed_tasks: @@ -880,7 +931,7 @@ def _log_recipe_errors(self, exc): ) @staticmethod - def _need_ncl(raw_diagnostics): + def _need_ncl(raw_diagnostics: dict[str, Any]) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -893,7 +944,7 @@ def _need_ncl(raw_diagnostics): return True return False - def _initialize_provenance(self, raw_documentation): + def _initialize_provenance(self, raw_documentation: 
dict[str, Any]): """Initialize the recipe provenance.""" doc = deepcopy(raw_documentation) @@ -901,7 +952,10 @@ def _initialize_provenance(self, raw_documentation): return get_recipe_provenance(doc, self._filename) - def _initialize_diagnostics(self, raw_diagnostics): + def _initialize_diagnostics( + self, + raw_diagnostics: dict[str, Any], + ) -> dict[str, Any]: """Define diagnostics in recipe.""" logger.debug("Retrieving diagnostics from recipe") check.diagnostics(raw_diagnostics) @@ -909,7 +963,7 @@ def _initialize_diagnostics(self, raw_diagnostics): diagnostics = {} for name, raw_diagnostic in raw_diagnostics.items(): - diagnostic = {} + diagnostic: dict[str, Any] = {} diagnostic["name"] = name diagnostic["datasets"] = [ ds for ds in self.datasets if ds.facets["diagnostic"] == name @@ -930,10 +984,10 @@ def _initialize_diagnostics(self, raw_diagnostics): def _initialize_scripts( self, - diagnostic_name, - raw_scripts, - variable_names, - ): + diagnostic_name: str, + raw_scripts: dict[str, Any], + variable_names: tuple[str, ...], + ) -> dict[str, Any]: """Define script in diagnostic.""" if not raw_scripts: return {} @@ -982,7 +1036,10 @@ def _initialize_scripts( return scripts - def _resolve_diagnostic_ancestors(self, tasks): + def _resolve_diagnostic_ancestors( + self, + tasks: Iterable[PreprocessingTask], + ) -> None: """Resolve diagnostic ancestors.""" tasks = {t.name: t for t in tasks} for diagnostic_name, diagnostic in self.diagnostics.items(): @@ -997,7 +1054,7 @@ def _resolve_diagnostic_ancestors(self, tasks): diagnostic_name, script_name, ) - ancestors = [] + ancestors: list[BaseTask] = [] for id_glob in script_cfg["ancestors"]: ancestor_ids = fnmatch.filter(tasks, id_glob) if not ancestor_ids: @@ -1014,7 +1071,7 @@ def _resolve_diagnostic_ancestors(self, tasks): ancestors.extend(tasks[a] for a in ancestor_ids) tasks[task_id].ancestors = ancestors - def _get_tasks_to_run(self): + def _get_tasks_to_run(self) -> set[str]: """Get tasks filtered and add ancestors if needed.""" tasknames_to_run = self.session["diagnostics"] if tasknames_to_run: @@ -1023,7 +1080,7 @@ def _get_tasks_to_run(self): pass return tasknames_to_run - def _update_with_ancestors(self, tasknames_to_run): + def _update_with_ancestors(self, tasknames_to_run: set[str]) -> bool: """Add ancestors for all selected tasks.""" num_filters = len(tasknames_to_run) @@ -1055,12 +1112,12 @@ def _update_with_ancestors(self, tasknames_to_run): def _create_diagnostic_tasks( self, - diagnostic_name, - diagnostic, - tasknames_to_run, - ): + diagnostic_name: str, + diagnostic: dict[str, Any], + tasknames_to_run: set[str], + ) -> list[BaseTask]: """Create diagnostic tasks.""" - tasks = [] + tasks: list[BaseTask] = [] if self.session["run_diagnostic"]: for script_name, script_cfg in diagnostic["scripts"].items(): @@ -1091,14 +1148,14 @@ def _create_diagnostic_tasks( def _create_preprocessor_tasks( self, - diagnostic_name, - diagnostic, - tasknames_to_run, - any_diag_script_is_run, - ): + diagnostic_name: str, + diagnostic: dict[str, Any], + tasknames_to_run: set[str], + any_diag_script_is_run: bool, + ) -> tuple[list[BaseTask], list[RecipeError]]: """Create preprocessor tasks.""" - tasks = [] - failed_tasks = [] + tasks: list[BaseTask] = [] + failed_tasks: list[RecipeError] = [] for variable_group, datasets in groupby( diagnostic["datasets"], key=lambda ds: ds.facets["variable_group"], @@ -1138,7 +1195,11 @@ def _create_preprocessor_tasks( diagnostic_name, variable_group, ) - task = ResumeTask(prev_preproc_dir, preproc_dir, 
task_name) + task: BaseTask = ResumeTask( + prev_preproc_dir, + preproc_dir, + task_name, + ) tasks.append(task) break else: @@ -1156,7 +1217,7 @@ def _create_preprocessor_tasks( return tasks, failed_tasks - def _create_tasks(self): + def _create_tasks(self) -> TaskSet: """Create tasks from the recipe.""" logger.info("Creating tasks from recipe") tasks = TaskSet() @@ -1208,7 +1269,7 @@ def _create_tasks(self): return tasks - def initialize_tasks(self): + def initialize_tasks(self) -> TaskSet: """Define tasks in recipe.""" tasks = self._create_tasks() tasks = tasks.flatten() @@ -1227,11 +1288,11 @@ def initialize_tasks(self): # Return smallest possible set of tasks return tasks.get_independent() - def __str__(self): + def __str__(self) -> str: """Get human readable summary.""" return "\n\n".join(str(task) for task in self.tasks) - def run(self): + def run(self) -> None: """Run all tasks in the recipe.""" if not self.tasks: msg = "No tasks to run!" @@ -1249,7 +1310,7 @@ def run(self): ) self.write_html_summary() - def get_output(self) -> dict: + def get_output(self) -> dict[str, Any]: """Return the paths to the output plots and data. Returns @@ -1257,7 +1318,7 @@ def get_output(self) -> dict: product_filenames : dict Lists of products/attributes grouped by task. """ - output = {} + output: dict[str, Any] = {} output["session"] = self.session output["recipe_filename"] = self._filename @@ -1275,7 +1336,7 @@ def get_output(self) -> dict: return output - def write_filled_recipe(self): + def write_filled_recipe(self) -> Path: """Write copy of recipe with filled wildcards.""" recipe = datasets_to_recipe(USED_DATASETS, self._raw_recipe) filename = self.session.run_dir / f"{self._filename.stem}_filled.yml" @@ -1287,7 +1348,7 @@ def write_filled_recipe(self): ) return filename - def write_html_summary(self): + def write_html_summary(self) -> None: """Write summary html file to the output dir.""" with warnings.catch_warnings(): # ignore import warnings diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index da5eea2f78..8732e11f83 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -223,14 +223,14 @@ """ # The order of initial and final steps cannot be configured -INITIAL_STEPS = DEFAULT_ORDER[ +INITIAL_STEPS: tuple[str, ...] = DEFAULT_ORDER[ : DEFAULT_ORDER.index("add_supplementary_variables") + 1 ] -FINAL_STEPS = DEFAULT_ORDER[ +FINAL_STEPS: tuple[str, ...] = DEFAULT_ORDER[ DEFAULT_ORDER.index("remove_supplementary_variables") : ] -MULTI_MODEL_FUNCTIONS = { +MULTI_MODEL_FUNCTIONS: set[str] = { "bias", "distance_metric", "ensemble_statistics", From 6793e0c6e71bfa95e281383eb2d9a8d42f1bb5ad Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 16:56:34 +0200 Subject: [PATCH 07/85] Added type hints for to_datasets.py --- esmvalcore/_recipe/to_datasets.py | 39 +++++++++++++++++-------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index ee332e8927..7a78ac439f 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -30,7 +30,7 @@ logger = logging.getLogger(__name__) -_ALIAS_INFO_KEYS = ( +_ALIAS_INFO_KEYS: tuple[str, ...] 
= ( "project", "activity", "driver", @@ -43,7 +43,7 @@ """List of keys to be used to compose the alias, ordered by priority.""" -def _facet_to_str(facet_value: FacetValue) -> str: +def _facet_to_str(facet_value: FacetValue | None) -> str: """Get a string representation of a facet value.""" if isinstance(facet_value, str): return facet_value @@ -52,7 +52,7 @@ def _facet_to_str(facet_value: FacetValue) -> str: return str(facet_value) -def _set_alias(variables): +def _set_alias(variables: list[list[Dataset]]) -> None: """Add unique alias for datasets. Generates a unique alias for each dataset that will be shared by all @@ -99,41 +99,46 @@ def _set_alias(variables): variables : list for each recipe variable, a list of datasets """ - datasets_info = set() + datasets_info: set[tuple[str, ...]] = set() for variable in variables: for dataset in variable: - alias = tuple( + alias_tuple = tuple( _facet_to_str(dataset.facets.get(key, None)) for key in _ALIAS_INFO_KEYS ) - datasets_info.add(alias) + datasets_info.add(alias_tuple) if "alias" not in dataset.facets: - dataset.facets["alias"] = alias + dataset.facets["alias"] = alias_tuple - alias = {} + alias: dict[tuple[str, ...], list[Any]] = {} for info in datasets_info: alias[info] = [] - datasets_info = list(datasets_info) - _get_next_alias(alias, datasets_info, 0) + datasets_info_list: list[tuple[str, ...]] = list(datasets_info) + _get_next_alias(alias, datasets_info_list, 0) - for info in datasets_info: - alias[info] = "_".join( + final_alias: dict[tuple[str, ...], str] = {} + for info in datasets_info_list: + final_alias[info] = "_".join( [str(value) for value in alias[info] if value is not None], ) - if not alias[info]: - alias[info] = info[_ALIAS_INFO_KEYS.index("dataset")] + if not final_alias[info]: + final_alias[info] = info[_ALIAS_INFO_KEYS.index("dataset")] for variable in variables: for dataset in variable: - dataset.facets["alias"] = alias.get( + dataset.facets["alias"] = final_alias.get( # type: ignore dataset.facets["alias"], dataset.facets["alias"], ) -def _get_next_alias(alias, datasets_info, i): +def _get_next_alias( + alias: dict[tuple[str, ...], list[Any]], + datasets_info: list[tuple[str, ...]], + i: int, +) -> None: if i >= len(_ALIAS_INFO_KEYS): return key_values = {info[i] for info in datasets_info} @@ -185,7 +190,7 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset): +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: """Automatically correct the wrong ensemble for CMIP5 fx variables.""" if ( dataset.facets.get("project") == "CMIP5" From 878e3104cf87c7dc4adde57ad2ade4ce92d2f291 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:08:40 +0200 Subject: [PATCH 08/85] Added type hints for dataset.py --- esmvalcore/_recipe/to_datasets.py | 4 +-- esmvalcore/dataset.py | 46 +++++++++++++++++++------------ 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7a78ac439f..7cd17bdbb0 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -164,9 +164,7 @@ def _check_supplementaries_valid(supplementaries: Iterable[Facets]) -> None: "'short_name' is required for supplementary_variables " f"entries, but missing in {facets}" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _merge_supplementary_dicts( diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 65169e1644..d471dde5d8 100644 --- a/esmvalcore/dataset.py +++ 
b/esmvalcore/dataset.py @@ -67,7 +67,7 @@ """ -def _augment(base: dict, update: dict): +def _augment(base: dict, update: dict) -> None: """Update dict `base` with values from dict `update`.""" for key in update: if key not in base: @@ -108,7 +108,7 @@ class Dataset: Facets describing the dataset. """ - _SUMMARY_FACETS = ( + _SUMMARY_FACETS: tuple[str, ...] = ( "short_name", "mip", "project", @@ -124,7 +124,7 @@ class Dataset: ) """Facets used to create a summary of a Dataset instance.""" - def __init__(self, **facets: FacetValue): + def __init__(self, **facets: FacetValue) -> None: self.facets: Facets = {} self.supplementaries: list[Dataset] = [] @@ -478,7 +478,7 @@ def copy(self, **facets: FacetValue) -> Dataset: return new - def __eq__(self, other) -> bool: + def __eq__(self, other: object) -> bool: """Compare with another dataset.""" return ( isinstance(other, self.__class__) @@ -575,15 +575,20 @@ def supplementary_summary(dataset): return txt - def __getitem__(self, key): + def __getitem__(self, key: Any) -> FacetValue: """Get a facet value.""" return self.facets[key] - def __setitem__(self, key, value): + def __setitem__(self, key: str, value: FacetValue) -> None: """Set a facet value.""" self.set_facet(key, value, persist=False) - def set_facet(self, key: str, value: FacetValue, persist: bool = True): + def set_facet( + self, + key: str, + value: FacetValue, + persist: bool = True, + ) -> None: """Set facet. Parameters @@ -665,9 +670,16 @@ def augment_facets(self) -> None: supplementary._augment_facets() # noqa: SLF001 @staticmethod - def _pattern_filter(patterns: Iterable[str], name: str) -> list[str]: + def _pattern_filter( + patterns: Iterable[FacetValue], + name: FacetValue, + ) -> list[str]: """Get the subset of the list `patterns` that `name` matches.""" - return [pat for pat in patterns if fnmatch.fnmatchcase(name, pat)] + return [ + str(pat) + for pat in patterns + if fnmatch.fnmatchcase(str(name), str(pat)) + ] def _get_extra_facets(self) -> dict[str, Any]: """Get extra facets of dataset.""" @@ -718,7 +730,7 @@ def _get_extra_facets(self) -> dict[str, Any]: return extra_facets - def _augment_facets(self): + def _augment_facets(self) -> None: extra_facets = self._get_extra_facets() _augment(self.facets, extra_facets) if "institute" not in self.facets: @@ -797,7 +809,7 @@ def files(self) -> list[File]: return self._files # type: ignore @files.setter - def files(self, value): + def files(self, value: Sequence[File]) -> None: self._files = value def load(self) -> Cube: @@ -939,15 +951,15 @@ def from_ranges(self) -> list[Dataset]: ] return datasets - def _expand_range(self, input_tag): + def _expand_range(self, input_tag: str) -> list[FacetValue]: """Expand ranges such as ensemble members or start dates. Expansion only supports ensembles defined as strings, not lists. """ - expanded = [] + expanded: list[FacetValue] = [] regex = re.compile(r"\(\d+:\d+\)") - def expand_range(input_range): + def expand_range(input_range) -> None: match = regex.search(input_range) if match: start, end = match.group(0)[1:-1].split(":") @@ -965,16 +977,14 @@ def expand_range(input_range): f"In {self}: {input_tag} expansion " f"cannot be combined with {input_tag} lists" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) expanded.append(tag) else: expand_range(tag) return expanded - def _update_timerange(self): + def _update_timerange(self) -> None: """Update wildcards in timerange with found datetime values. 
If the timerange is given as a year, it ensures it's formatted From be6e55d2e7b9577b7bd6b3ccd109357a836dd794 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:23:47 +0200 Subject: [PATCH 09/85] Add type hints to local.py --- esmvalcore/_recipe/recipe.py | 4 +++ esmvalcore/local.py | 62 ++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 0c61307c0b..e14d9201e2 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -412,10 +412,14 @@ def _get_common_attributes( if "timerange" not in product.attributes: continue timerange = product.attributes["timerange"] + start: int | str + end: int | str start, end = _parse_period(timerange) if "timerange" not in attributes: attributes["timerange"] = _dates_to_timerange(start, end) else: + start_date: int | str + end_date: int | str start_date, end_date = _parse_period(attributes["timerange"]) start_date, start = _truncate_dates(start_date, start) end_date, end = _truncate_dates(end_date, end) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index baeeac2757..df0a0df225 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -20,17 +20,25 @@ from .exceptions import RecipeError if TYPE_CHECKING: + from collections.abc import Iterable + from .esgf import ESGFFile from .typing import Facets, FacetValue logger = logging.getLogger(__name__) -def _get_from_pattern(pattern, date_range_pattern, stem, group): +def _get_from_pattern( + pattern: str, + date_range_pattern: str, + stem: str, + group: str, +) -> tuple[str | None, str | None]: """Get time, date or datetime from date range patterns in file names.""" # Next string allows to test that there is an allowed delimiter (or # string start or end) close to date range (or to single date) - start_point = end_point = None + start_point: str | None = None + end_point: str | None = None context = r"(?:^|[-_]|$)" # First check for a block of two potential dates @@ -170,9 +178,7 @@ def _get_start_end_date( f"File {file} datetimes do not match a recognized pattern and " f"time coordinate can not be read from the file" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # Remove potential '-' characters from datetimes start_date = start_date.replace("-", "") @@ -192,7 +198,7 @@ def _get_start_end_year( return (int(start_date[:4]), int(end_date[:4])) -def _dates_to_timerange(start_date, end_date): +def _dates_to_timerange(start_date: int | str, end_date: int | str) -> str: """Convert ``start_date`` and ``end_date`` to ``timerange``. Note @@ -203,9 +209,9 @@ def _dates_to_timerange(start_date, end_date): Parameters ---------- - start_date: int or str + start_date: Start date. - end_date: int or str + end_date: End date. Returns @@ -225,7 +231,7 @@ def _dates_to_timerange(start_date, end_date): return f"{start_date}/{end_date}" -def _replace_years_with_timerange(variable): +def _replace_years_with_timerange(variable: dict[str, Any]) -> None: """Set `timerange` tag from tags `start_year` and `end_year`.""" start_year = variable.get("start_year") end_year = variable.get("end_year") @@ -293,7 +299,7 @@ def _parse_period(timerange: FacetValue) -> tuple[str, str]: return start_date, end_date -def _truncate_dates(date, file_date): +def _truncate_dates(date: str, file_date: str) -> tuple[int, int]: """Truncate dates of different lengths and convert to integers. This allows to compare the dates chronologically. 
For example, this allows @@ -317,7 +323,10 @@ def _truncate_dates(date, file_date): return int(date), int(file_date) -def _select_files(filenames, timerange): +def _select_files( + filenames: list[LocalFile], + timerange: list, +) -> list[LocalFile]: """Select files containing data between a given timerange. If the timerange is given as a period, the file selection occurs @@ -333,6 +342,10 @@ def _select_files(filenames, timerange): selection = [] for filename in filenames: + start: int | str + end: int | str + start_date: int | str + end_date: int | str start_date, end_date = _parse_period(timerange) start, end = _get_start_end_date(filename) @@ -349,6 +362,7 @@ def _replace_tags( variable: Facets, ) -> list[Path]: """Replace tags in the config-developer's file with actual values.""" + pathset: Iterable[str] if isinstance(paths, str): pathset = {paths.strip("/")} else: @@ -386,10 +400,14 @@ def _replace_tags( return [Path(p) for p in pathset] -def _replace_tag(paths, tag, replacewith): +def _replace_tag( + paths: Iterable[str], + tag: str, + replacewith: FacetValue, +) -> list[str]: """Replace tag by replacewith in paths.""" _, lower, upper = _get_caps_options(tag) - result = [] + result: list[str] = [] if isinstance(replacewith, (list, tuple)): for item in replacewith: result.extend(_replace_tag(paths, tag, item)) @@ -399,7 +417,7 @@ def _replace_tag(paths, tag, replacewith): return list(set(result)) -def _get_caps_options(tag): +def _get_caps_options(tag: str) -> tuple[str, bool, bool]: lower = False upper = False if tag.endswith(".lower"): @@ -411,7 +429,7 @@ def _get_caps_options(tag): return tag, lower, upper -def _apply_caps(original, lower, upper): +def _apply_caps(original: str, lower: bool, upper: bool) -> str: if lower: return original.lower() if upper: @@ -433,9 +451,7 @@ def _select_drs(input_type: str, project: str, structure: str) -> list[str]: return value msg = f"drs {structure} for {project} project not specified in config-developer file" - raise KeyError( - msg, - ) + raise KeyError(msg) @dataclass(order=True) @@ -470,7 +486,7 @@ def find_files(self, **facets) -> list[LocalFile]: globs = self.get_glob_patterns(**facets) logger.debug("Looking for files matching %s", globs) - files = [] + files: list[LocalFile] = [] for glob_ in globs: for filename in glob(str(glob_)): file = LocalFile(filename) @@ -579,7 +595,7 @@ def _templates_to_regex(self) -> str: return pattern -_ROOTPATH_WARNED = set() +_ROOTPATH_WARNED: set[tuple[str, tuple[str]]] = set() def _get_data_sources(project: str) -> list[DataSource]: @@ -615,9 +631,7 @@ def _get_data_sources(project: str) -> list[DataSource]: f"No '{project}' or 'default' path specified under 'rootpath' in " "the configuration." 
) - raise KeyError( - msg, - ) + raise KeyError(msg) def _get_output_file(variable: dict[str, Any], preproc_dir: Path) -> Path: @@ -831,5 +845,5 @@ def facets(self) -> Facets: return self._facets @facets.setter - def facets(self, value: Facets): + def facets(self, value: Facets) -> None: self._facets = value From b1caf65eb63f75bd41bf4605b672863b69ffd2f5 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:42:37 +0200 Subject: [PATCH 10/85] Add type hints to preprocessor/__init__.py --- esmvalcore/preprocessor/__init__.py | 125 +++++++++++++++------------- 1 file changed, 68 insertions(+), 57 deletions(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 8732e11f83..bbc8313127 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -9,7 +9,7 @@ from pprint import pformat from typing import TYPE_CHECKING, Any -from iris.cube import Cube +from iris.cube import Cube, CubeList from esmvalcore._provenance import TrackedFile from esmvalcore._task import BaseTask @@ -95,10 +95,12 @@ from ._weighting import weighting_landsea_fraction if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Callable, Iterable from dask.delayed import Delayed + from esmvalcore.dataset import Dataset, File + logger = logging.getLogger(__name__) __all__ = [ @@ -202,7 +204,7 @@ "save", ] -TIME_PREPROCESSORS = [ +TIME_PREPROCESSORS: list[str] = [ "clip_timerange", "extract_time", "extract_season", @@ -217,7 +219,7 @@ "regrid_time", ] -DEFAULT_ORDER = tuple(__all__) +DEFAULT_ORDER: tuple[str, ...] = tuple(__all__) """ By default, preprocessor functions are applied in this order. """ @@ -240,13 +242,13 @@ } -def _get_itype(step): +def _get_itype(step: str) -> str: """Get the input type of a preprocessor function.""" function = globals()[step] return next(iter(inspect.signature(function).parameters)) -def check_preprocessor_settings(settings): +def check_preprocessor_settings(settings: dict[str, Any]) -> None: """Check preprocessor settings.""" for step in settings: if step not in DEFAULT_ORDER: @@ -254,9 +256,7 @@ def check_preprocessor_settings(settings): f"Unknown preprocessor function '{step}', choose from: " f"{', '.join(DEFAULT_ORDER)}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) function = globals()[step] @@ -292,9 +292,7 @@ def check_preprocessor_settings(settings): f"encountered for preprocessor function {step}. 
\n" f"Valid arguments are: [{', '.join(args)}]" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # Check for missing arguments defaults = [ @@ -309,9 +307,7 @@ def check_preprocessor_settings(settings): f"Missing required argument(s) {missing_args} for " f"preprocessor function {step}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # Final sanity check in case the above fails to catch a mistake try: @@ -324,7 +320,7 @@ def check_preprocessor_settings(settings): raise -def _check_multi_model_settings(products): +def _check_multi_model_settings(products: Iterable[PreprocessorFile]) -> None: """Check that multi dataset settings are identical for all products.""" multi_model_steps = ( step @@ -345,16 +341,17 @@ def _check_multi_model_settings(products): f"{reference.filename} and {product.filename}, " f"{reference.settings[step]} and {settings}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) -def _get_multi_model_settings(products, step): +def _get_multi_model_settings( + products: set[PreprocessorFile], + step: str, +) -> tuple[dict[str, Any], set[PreprocessorFile]]: """Select settings for multi model step.""" _check_multi_model_settings(products) settings = {} - exclude = set() + exclude: set[PreprocessorFile] = set() for product in products: if step in product.settings: settings = product.settings[step] @@ -363,7 +360,12 @@ def _get_multi_model_settings(products, step): return settings, exclude -def _run_preproc_function(function, items, kwargs, input_files=None): +def _run_preproc_function( + function: Callable, + items: Any, + kwargs: Any, + input_files: list[File] | None = None, +) -> Any: """Run preprocessor function.""" kwargs_str = ",\n".join( [f"{k} = {pformat(v)}" for (k, v) in kwargs.items()], @@ -424,13 +426,13 @@ def _run_preproc_function(function, items, kwargs, input_files=None): def preprocess( - items, - step, - input_files=None, - output_file=None, - debug=False, - **settings, -): + items: list[PreprocessorFile | Cube | str | Path], + step: str, + input_files: list[File] | None = None, + output_file: Path | None = None, + debug: bool = False, + **settings: Any, +) -> list[PreprocessorFile | Cube | str | Path]: """Run preprocessor.""" logger.debug("Running preprocessor step %s", step) function = globals()[step] @@ -480,15 +482,18 @@ def preprocess( return items -def get_step_blocks(steps, order): +def get_step_blocks( + steps: Iterable[str], + order: list[str], +) -> list[list[str]]: """Group steps into execution blocks.""" - blocks = [] + blocks: list[list[str]] = [] prev_step_type = None for step in order[len(INITIAL_STEPS) : -len(FINAL_STEPS)]: if step in steps: step_type = step in MULTI_MODEL_FUNCTIONS if step_type is not prev_step_type: - block = [] + block: list[str] = [] blocks.append(block) prev_step_type = step_type block.append(step) @@ -504,7 +509,7 @@ def __init__( attributes: dict[str, Any] | None = None, settings: dict[str, Any] | None = None, datasets: list | None = None, - ): + ) -> None: if datasets is not None: # Load data using a Dataset input_files = [] @@ -519,8 +524,8 @@ def __init__( input_files = [] ancestors = [] - self.datasets = datasets - self._cubes = None + self.datasets: list[Dataset] | None = datasets + self._cubes: CubeList | None = None self._input_files = input_files # Set some preprocessor settings (move all defaults here?) 
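The ``get_step_blocks`` hunk above is the core of the execution planning:
walking the configured order, consecutive steps that agree on being
multi-model (or not) are collected into one block, since multi-model steps
need all products at once while single-model steps can run product by
product. A minimal, self-contained sketch of that grouping idea (the step
names are illustrative only, and the real function additionally slices off
``INITIAL_STEPS`` and ``FINAL_STEPS`` from the order before grouping):

    # Illustrative stand-in for MULTI_MODEL_FUNCTIONS.
    MULTI_MODEL = {"ensemble_statistics", "multi_model_statistics"}

    def group_blocks(steps: list[str]) -> list[list[str]]:
        """Group consecutive steps by their multi-model property."""
        blocks: list[list[str]] = []
        prev_is_mm = None
        for step in steps:
            is_mm = step in MULTI_MODEL
            if is_mm is not prev_is_mm:  # property changed: start a new block
                blocks.append([])
                prev_is_mm = is_mm
            blocks[-1].append(step)
        return blocks

    print(group_blocks(
        ["regrid", "annual_statistics", "multi_model_statistics", "convert_units"],
    ))
    # [['regrid', 'annual_statistics'], ['multi_model_statistics'], ['convert_units']]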
@@ -542,17 +547,15 @@ def __init__( ancestors=ancestors, ) - def check(self): + def check(self) -> None: """Check preprocessor settings.""" check_preprocessor_settings(self.settings) - def apply(self, step: str, debug: bool = False): + def apply(self, step: str, debug: bool = False) -> None: """Apply preprocessor step to product.""" if step not in self.settings: msg = f"PreprocessorFile {self} has no settings for step {step}" - raise ValueError( - msg, - ) + raise ValueError(msg) self.cubes = preprocess( self.cubes, step, @@ -563,20 +566,20 @@ def apply(self, step: str, debug: bool = False): ) @property - def cubes(self): + def cubes(self) -> CubeList: """Cubes.""" if self._cubes is None: - self._cubes = [ds.load() for ds in self.datasets] + self._cubes = [ds.load() for ds in self.datasets] # type: ignore return self._cubes @cubes.setter - def cubes(self, value): + def cubes(self, value: CubeList) -> None: self._cubes = value def save(self) -> Delayed | None: """Save cubes to disk.""" return preprocess( - self._cubes, + self._cubes, # type: ignore "save", input_files=self._input_files, **self.settings["save"], @@ -592,7 +595,7 @@ def close(self) -> Delayed | None: self.save_provenance() return result - def _update_attributes(self): + def _update_attributes(self) -> None: """Update product attributes from cube metadata.""" if not self._cubes: return @@ -616,11 +619,11 @@ def _update_attributes(self): self.attributes["frequency"] = ref_cube.attributes["frequency"] @property - def is_closed(self): + def is_closed(self) -> bool: """Check if the file is closed.""" return self._cubes is None - def _initialize_entity(self): + def _initialize_entity(self) -> None: """Initialize the provenance entity representing the file.""" super()._initialize_entity() settings = { @@ -651,7 +654,11 @@ def group(self, keys: list) -> str: return "_".join(identifier) -def _apply_multimodel(products, step, debug): +def _apply_multimodel( + products: set[PreprocessorFile], + step: str, + debug: bool | None, +) -> set[PreprocessorFile]: """Apply multi model step to products.""" settings, exclude = _get_multi_model_settings(products, step) @@ -660,7 +667,11 @@ def _apply_multimodel(products, step, debug): step, "\n".join(str(p) for p in products - exclude), ) - result = preprocess(products - exclude, step, **settings) + result: list[PreprocessorFile] = preprocess( # type: ignore + products - exclude, # type: ignore + step, + **settings, + ) products = set(result) | exclude if debug: @@ -683,7 +694,7 @@ def __init__( order: Iterable[str] = DEFAULT_ORDER, debug: bool | None = None, write_ncl_interface: bool = False, - ): + ) -> None: """Initialize.""" _check_multi_model_settings(products) super().__init__(name=name, products=products) @@ -691,13 +702,13 @@ def __init__( self.debug = debug self.write_ncl_interface = write_ncl_interface - def _initialize_product_provenance(self): + def _initialize_product_provenance(self) -> None: """Initialize product provenance.""" self._initialize_products(self.products) self._initialize_multimodel_provenance() self._initialize_ensemble_provenance() - def _initialize_multiproduct_provenance(self, step): + def _initialize_multiproduct_provenance(self, step: str) -> None: input_products = self._get_input_products(step) if input_products: statistic_products = set() @@ -711,23 +722,23 @@ def _initialize_multiproduct_provenance(self, step): self._initialize_products(statistic_products) - def _initialize_multimodel_provenance(self): + def _initialize_multimodel_provenance(self) -> None: 
"""Initialize provenance for multi-model statistics.""" step = "multi_model_statistics" self._initialize_multiproduct_provenance(step) - def _initialize_ensemble_provenance(self): + def _initialize_ensemble_provenance(self) -> None: """Initialize provenance for ensemble statistics.""" step = "ensemble_statistics" self._initialize_multiproduct_provenance(step) - def _get_input_products(self, step): + def _get_input_products(self, step: str) -> list[PreprocessorFile]: """Get input products.""" return [ product for product in self.products if step in product.settings ] - def _initialize_products(self, products): + def _initialize_products(self, products: set[PreprocessorFile]) -> None: """Initialize products.""" for product in products: product.initialize_provenance(self.activity) @@ -792,7 +803,7 @@ def _run(self, _) -> list[str]: # noqa: C901,PLR0912 self.write_ncl_interface, ) - def __str__(self): + def __str__(self) -> str: """Get human readable description.""" order = [ step From 19dbff94c72349a31492b350fe68bd9c21c97e02 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:45:39 +0200 Subject: [PATCH 11/85] Add type hints to compare_with_refs.py --- esmvalcore/preprocessor/_compare_with_refs.py | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py index e29cda36d0..d60f700d0b 100644 --- a/esmvalcore/preprocessor/_compare_with_refs.py +++ b/esmvalcore/preprocessor/_compare_with_refs.py @@ -119,9 +119,7 @@ def bias( "A list of Cubes is given to this preprocessor; please " "specify a `reference`" ) - raise ValueError( - msg, - ) + raise ValueError(msg) (reference, ref_product) = _get_ref(products, "reference_for_bias") else: ref_product = None @@ -178,9 +176,7 @@ def _get_ref(products, ref_tag: str) -> tuple[Cube, PreprocessorFile]: f"Expected exactly 1 dataset with '{ref_tag}: true', found " f"{len(ref_products):d}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) ref_product = ref_products[0] # Extract reference cube @@ -210,9 +206,7 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube: f"Expected one of ['absolute', 'relative'] for bias_type, got " f"'{bias_type}'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) cube.metadata = cube_metadata cube.units = new_units @@ -345,9 +339,7 @@ def distance_metric( "A list of Cubes is given to this preprocessor; please " "specify a `reference`" ) - raise ValueError( - msg, - ) + raise ValueError(msg) reference, reference_product = _get_ref( products, "reference_for_metric", @@ -403,18 +395,14 @@ def _calculate_metric( f"distance metric calculation, got {cube.shape} and " f"{reference.shape}, respectively" ) - raise ValueError( - msg, - ) + raise ValueError(msg) try: cube + reference # dummy operation to check if cubes are compatible except Exception as exc: msg = ( "Cannot calculate distance metric between cube and reference cube " ) - raise ValueError( - msg, - ) from exc + raise ValueError(msg) from exc # Perform the actual calculation of the distance metric # Note: we work on arrays here instead of cube to stay as flexible as @@ -436,9 +424,7 @@ def _calculate_metric( msg = ( f"Expected one of {list(metrics_funcs)} for metric, got '{metric}'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) (res_data, res_metadata) = metrics_funcs[metric](cube, reference, coords) # Get result cube with correct dimensional metadata by using dummy @@ -589,7 +575,11 @@ 
def _calculate_emd( return (emd, metadata) -def _get_emd(arr, ref_arr, bin_centers): +def _get_emd( + arr: np.ndarray, + ref_arr: np.ndarray, + bin_centers: np.ndarray, +) -> np.ndarray: """Calculate Earth mover's distance (non-lazy).""" if np.ma.is_masked(arr) or np.ma.is_masked(ref_arr): return np.ma.masked # this is safe because PMFs will be masked arrays From d8ea7d9a9be3856b723e4a1502a1bb8960d2b488 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:46:13 +0200 Subject: [PATCH 12/85] Add type hints to _derive/__init__.py --- esmvalcore/preprocessor/_derive/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cbd181d2c8..cd209f88de 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -144,8 +144,6 @@ def derive( f"Units '{cube.units}' after executing derivation script of " f"'{short_name}' cannot be converted to target units '{units}'" ) - raise ValueError( - msg, - ) from exc + raise ValueError(msg) from exc return cube From 367bfe70672a99236bf1b01c5f849b5854275d9d Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:49:11 +0200 Subject: [PATCH 13/85] Add type hints to some derive functions --- esmvalcore/preprocessor/_derive/ohc.py | 9 ++++++--- esmvalcore/preprocessor/_derive/vegfrac.py | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index 47aa7b2fc8..87643d2d1b 100644 --- a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -3,6 +3,9 @@ import iris from cf_units import Unit from iris import Constraint +from iris.cube import Cube, CubeList + +from esmvalcore.typing import Facets, FacetValue from ._baseclass import DerivedVariableBase @@ -13,9 +16,9 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `ohc`.""" @staticmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" - volcello = {"short_name": "volcello", "mip": "fx"} + volcello: Facets = {"short_name": "volcello", "mip": "fx"} if project == "CMIP5": volcello["ensemble"] = "r0i0p0" elif project == "CMIP6": @@ -23,7 +26,7 @@ def required(project): return [{"short_name": "thetao"}, volcello] @staticmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """ Compute ocean heat content. 
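The typed ``required`` above keeps the project-dependent facet logic intact:
for CMIP5, the ``volcello`` facets gain the ``r0i0p0`` ensemble. Assuming an
environment with this branch installed, a quick sanity check (matching what
the integration test updated later in this series asserts) would be:

    from esmvalcore.preprocessor._derive import ohc

    # For CMIP5, the fx variable volcello needs the r0i0p0 ensemble.
    assert ohc.DerivedVariable.required("CMIP5") == [
        {"short_name": "thetao"},
        {"short_name": "volcello", "mip": "fx", "ensemble": "r0i0p0"},
    ]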
diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index c48e723f42..edd4dce75d 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -3,8 +3,10 @@ import dask.array as da import iris from iris import NameConstraint +from iris.cube import Cube, CubeList from esmvalcore.preprocessor._regrid import regrid +from esmvalcore.typing import Facets, FacetValue from ._baseclass import DerivedVariableBase @@ -13,9 +15,9 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `vegFrac`.""" @staticmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" - sftlf = {"short_name": "sftlf", "mip": "fx"} + sftlf: Facets = {"short_name": "sftlf", "mip": "fx"} if project == "CMIP5": sftlf["ensemble"] = "r0i0p0" return [ @@ -25,7 +27,7 @@ def required(project): ] @staticmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """Compute vegetation fraction from bare soil fraction.""" baresoilfrac_cube = cubes.extract_cube( NameConstraint(var_name="baresoilFrac"), From 5bbe6ce4b5a6f1584435da339c56840471b9d239 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:16:57 +0200 Subject: [PATCH 14/85] Add type hints to _regrid.py --- esmvalcore/preprocessor/_regrid.py | 216 ++++++++++++++--------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 0489ad881f..62b1b27e83 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -50,6 +50,9 @@ if TYPE_CHECKING: from collections.abc import Iterable + from iris.coords import Coord + from numpy.typing import ArrayLike + from esmvalcore.dataset import Dataset logger = logging.getLogger(__name__) @@ -117,7 +120,7 @@ } # Supported vertical interpolation schemes. -VERTICAL_SCHEMES = ( +VERTICAL_SCHEMES: tuple[str, ...] = ( "linear", "nearest", "linear_extrapolate", @@ -125,12 +128,12 @@ ) -def parse_cell_spec(spec): +def parse_cell_spec(spec: str) -> tuple[float, float]: """Parse an MxN cell specification string. Parameters ---------- - spec: str + spec: ``MxN`` degree cell-specification for the global grid. Returns @@ -171,16 +174,20 @@ def parse_cell_spec(spec): return dlon, dlat -def _generate_cube_from_dimcoords(latdata, londata, circular: bool = False): +def _generate_cube_from_dimcoords( + latdata: ArrayLike, + londata: ArrayLike, + circular: bool = False, +) -> Cube: """Generate cube from lat/lon points. Parameters ---------- - latdata : np.ndarray + latdata: List of latitudes. - londata : np.ndarray + londata: List of longitudes. - circular : bool + circular Wrap longitudes around the full great circle. Bounds will not be generated for circular coordinates. @@ -217,7 +224,11 @@ def _generate_cube_from_dimcoords(latdata, londata, circular: bool = False): @functools.lru_cache -def _global_stock_cube(spec, lat_offset=True, lon_offset=True): +def _global_stock_cube( + spec: str, + lat_offset: bool = True, + lon_offset: bool = True, +) -> Cube: """Create a stock cube. Create a global cube with M degree-east by N degree-north regular grid @@ -229,13 +240,13 @@ def _global_stock_cube(spec, lat_offset=True, lon_offset=True): Parameters ---------- - spec : str + spec Specifies the 'MxN' degree cell-specification for the global grid. 
- lat_offset : bool + lat_offset Offset the grid centers of the latitude coordinate w.r.t. the pole by half a grid step. This argument is ignored if `target_grid` is a cube or file. - lon_offset : bool + lon_offset Offset the grid centers of the longitude coordinate w.r.t. Greenwich meridian by half a grid step. This argument is ignored if `target_grid` is a cube or file. @@ -282,7 +293,7 @@ def _spec_to_latlonvals( start_longitude: float, end_longitude: float, step_longitude: float, -) -> tuple: +) -> tuple[np.ndarray, np.ndarray]: """Define lat/lon values from spec. Create a regional cube starting defined by the target specification. @@ -292,54 +303,48 @@ def _spec_to_latlonvals( Parameters ---------- - start_latitude : float + start_latitude: Latitude value of the first grid cell center (start point). The grid includes this value. - end_latitude : float + end_latitude: Latitude value of the last grid cell center (end point). The grid includes this value only if it falls on a grid point. Otherwise, it cuts off at the previous value. - step_latitude : float + step_latitude: Latitude distance between the centers of two neighbouring cells. - start_longitude : float + start_longitude: Latitude value of the first grid cell center (start point). The grid includes this value. - end_longitude : float + end_longitude: Longitude value of the last grid cell center (end point). The grid includes this value only if it falls on a grid point. Otherwise, it cuts off at the previous value. - step_longitude : float + step_longitude: Longitude distance between the centers of two neighbouring cells. Returns ------- - xvals : np.array + xvals: np.array List of longitudes - yvals : np.array + yvals: np.array List of latitudes """ if step_latitude == 0: msg = f"Latitude step cannot be 0, got step_latitude={step_latitude}." - raise ValueError( - msg, - ) + raise ValueError(msg) if step_longitude == 0: msg = ( f"Longitude step cannot be 0, got step_longitude={step_longitude}." ) - raise ValueError( - msg, - ) + raise ValueError(msg) if (start_latitude < _LAT_MIN) or (end_latitude > _LAT_MAX): msg = ( f"Latitude values must lie between {_LAT_MIN}:{_LAT_MAX}, " f"got start_latitude={start_latitude}:end_latitude={end_latitude}." ) - raise ValueError( - msg, - ) + raise ValueError(msg) def get_points(start, stop, step): """Calculate grid points.""" @@ -354,7 +359,7 @@ def get_points(start, stop, step): return latitudes, longitudes -def _regional_stock_cube(spec: dict): +def _regional_stock_cube(spec: dict[str, Any]) -> Cube: """Create a regional stock cube. Returns @@ -369,7 +374,7 @@ def _regional_stock_cube(spec: dict): circular=True, ) - def add_bounds_from_step(coord, step): + def add_bounds_from_step(coord: Coord, step: float) -> np.ndarray: """Calculate bounds from the given step.""" bound = step / 2 points = coord.points @@ -381,7 +386,7 @@ def add_bounds_from_step(coord, step): return cube -def extract_location(cube, location, scheme): +def extract_location(cube: Cube, location: str, scheme: str) -> Cube: """Extract a point using a location name, with interpolation. Extracts a single location point from a cube, according @@ -399,20 +404,19 @@ def extract_location(cube, location, scheme): Parameters ---------- - cube : cube + cube: The source cube to extract a point from. - - location : str + location: The reference location. Examples: 'mount everest', 'romania','new york, usa' - - scheme : str + scheme: The interpolation scheme. 'linear' or 'nearest'. No default. 
Returns ------- - Returns a cube with the extracted point, and with adjusted - latitude and longitude coordinates. + iris.cube.Cube + Returns a cube with the extracted point, and with adjusted latitude and + longitude coordinates. Raises ------ @@ -429,17 +433,13 @@ def extract_location(cube, location, scheme): " Examples: 'mount everest', 'romania'," " 'new york, usa'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) if scheme is None: msg = ( "Interpolation scheme needs to be specified." " Use either 'linear' or 'nearest'." ) - raise ValueError( - msg, - ) + raise ValueError(msg) try: # Try to use the default SSL context, see # https://github.com/ESMValGroup/ESMValCore/issues/2012 for more @@ -473,7 +473,12 @@ def extract_location(cube, location, scheme): ) -def extract_point(cube, latitude, longitude, scheme): +def extract_point( + cube: Cube, + latitude: ArrayLike, + longitude: ArrayLike, + scheme: str, +) -> Cube: """Extract a point, with interpolation. Extracts a single latitude/longitude point from a cube, according @@ -493,13 +498,13 @@ def extract_point(cube, latitude, longitude, scheme): Parameters ---------- - cube : cube + cube: The source cube to extract a point from. - - latitude, longitude : float, or array of float - The latitude and longitude of the point. - - scheme : str + latitude: + The latitude of the point. + longitude: + The longitude of the point. + scheme: The interpolation scheme. 'linear' or 'nearest'. No default. Returns @@ -544,15 +549,15 @@ def extract_point(cube, latitude, longitude, scheme): array([ 1, 5, 17, 21, 33, 37, 49, 53]) """ msg = f"Unknown interpolation scheme, got {scheme!r}." - scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) - if not scheme: + loaded_scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) + if not loaded_scheme: raise ValueError(msg) point = [("latitude", latitude), ("longitude", longitude)] - return cube.interpolate(point, scheme=scheme) + return cube.interpolate(point, scheme=loaded_scheme) -def is_dataset(dataset): +def is_dataset(dataset: Any) -> bool: """Test if something is an `esmvalcore.dataset.Dataset`.""" # Use this function to avoid circular imports return hasattr(dataset, "facets") @@ -624,9 +629,7 @@ def _load_scheme(src_cube: Cube, tgt_cube: Cube, scheme: str | dict): f"Regridding scheme '{scheme}' not available for {grid_type} " f"data, expected one of: {', '.join(schemes)}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) loaded_scheme = schemes[scheme] logger.debug("Loaded regridding scheme %s", loaded_scheme) @@ -642,9 +645,7 @@ def _load_generic_scheme(scheme: dict): object_ref = scheme.pop("reference") except KeyError as key_err: msg = "No reference specified for generic regridding." - raise ValueError( - msg, - ) from key_err + raise ValueError(msg) from key_err module_name, separator, scheme_name = object_ref.partition(":") try: obj: Any = importlib.import_module(module_name) @@ -654,9 +655,7 @@ def _load_generic_scheme(scheme: dict): f"'{module_name}'. Please double check spelling and that the " f"required module is installed." 
) - raise ValueError( - msg, - ) from import_err + raise ValueError(msg) from import_err if separator: for attr in scheme_name.split("."): obj = getattr(obj, attr) @@ -720,7 +719,7 @@ def _get_regridder( return regridder -def _get_coord_key(src_cube: Cube, tgt_cube: Cube) -> tuple: +def _get_coord_key(src_cube: Cube, tgt_cube: Cube) -> tuple[ArrayLike, ...]: """Get dict key from coordinates.""" src_lat = src_cube.coord("latitude") src_lon = src_cube.coord("longitude") @@ -733,7 +732,7 @@ def _get_name_and_shape_key( src_cube: Cube, tgt_cube: Cube, scheme: str | dict, -) -> tuple: +) -> tuple[str, tuple[int, ...]]: """Get dict key from scheme name and coordinate shapes.""" name = str(scheme) shapes = [c.shape for c in _get_coord_key(src_cube, tgt_cube)] @@ -984,7 +983,12 @@ def _horizontal_grid_is_close(cube1: Cube, cube2: Cube) -> bool: return True -def _create_cube(src_cube, data, src_levels, levels): +def _create_cube( + src_cube: Cube, + data: ArrayLike, + src_levels: ArrayLike, + levels: ArrayLike, +) -> Cube: """Generate a new cube with the interpolated data. The resultant cube is seeded with `src_cube` metadata and coordinates, @@ -995,14 +999,14 @@ def _create_cube(src_cube, data, src_levels, levels): Parameters ---------- - src_cube : cube + src_cube The source cube that was vertically interpolated. - data : array + data The payload resulting from interpolating the source cube over the specified levels. - src_levels : array + src_levels Vertical levels of the source data - levels : array + levels The vertical levels of interpolation. Returns @@ -1074,7 +1078,7 @@ def _create_cube(src_cube, data, src_levels, levels): # Collapse the z-dimension for the scalar case. if levels.size == 1: - slicer = [slice(None)] * result.ndim + slicer: list[slice | int] = [slice(None)] * result.ndim slicer[z_dim] = 0 result = result[tuple(slicer)] @@ -1082,12 +1086,12 @@ def _create_cube(src_cube, data, src_levels, levels): def _vertical_interpolate( - cube, - src_levels, - levels, - interpolation, - extrapolation, -): + cube: Cube, + src_levels: ArrayLike, + levels: ArrayLike, + interpolation: str, + extrapolation: str, +) -> Cube: """Perform vertical interpolation.""" # Determine the source levels and axis for vertical interpolation. (z_axis,) = cube.coord_dims(cube.coord(axis="z", dim_coords=True)) @@ -1177,12 +1181,12 @@ def _preserve_fx_vars(cube: iris.cube.Cube, result: iris.cube.Cube) -> None: add_ancillary_variable(result, ancillary_cube) -def parse_vertical_scheme(scheme): +def parse_vertical_scheme(scheme: str) -> tuple[str, str]: """Parse the scheme provided for level extraction. Parameters ---------- - scheme : str + scheme: The vertical interpolation scheme to use. Choose from 'linear', 'nearest', @@ -1191,7 +1195,7 @@ def parse_vertical_scheme(scheme): Returns ------- - (str, str) + tuple[str, str] A tuple containing the interpolation and extrapolation scheme. """ # Check if valid scheme is given @@ -1200,9 +1204,7 @@ def parse_vertical_scheme(scheme): f"Unknown vertical interpolation scheme, got '{scheme}', possible " f"schemes are {VERTICAL_SCHEMES}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # This allows us to put level 0. to load the ocean surface. extrap_scheme = "nan" @@ -1226,7 +1228,7 @@ def extract_levels( coordinate: str | None = None, rtol: float = 1e-7, atol: float | None = None, -): +) -> Cube: """Perform vertical interpolation. 
Parameters @@ -1344,19 +1346,19 @@ def extract_levels( return result -def get_cmor_levels(cmor_table, coordinate): +def get_cmor_levels(cmor_table: str, coordinate: str) -> list[float]: """Get level definition from a CMOR coordinate. Parameters ---------- - cmor_table: str + cmor_table: CMOR table name - coordinate: str + coordinate: CMOR coordinate name Returns ------- - list[int] + list[float] Raises ------ @@ -1366,15 +1368,11 @@ def get_cmor_levels(cmor_table, coordinate): """ if cmor_table not in CMOR_TABLES: msg = f"Level definition cmor_table '{cmor_table}' not available" - raise ValueError( - msg, - ) + raise ValueError(msg) if coordinate not in CMOR_TABLES[cmor_table].coords: msg = f"Coordinate {coordinate} not available for {cmor_table}" - raise ValueError( - msg, - ) + raise ValueError(msg) cmor = CMOR_TABLES[cmor_table].coords[coordinate] @@ -1387,17 +1385,15 @@ def get_cmor_levels(cmor_table, coordinate): f"Coordinate {coordinate} in {cmor_table} does not have requested " f"values" ) - raise ValueError( - msg, - ) + raise ValueError(msg) -def get_reference_levels(dataset): +def get_reference_levels(dataset: Dataset) -> list[float]: """Get level definition from a reference dataset. Parameters ---------- - dataset: esmvalcore.dataset.Dataset + dataset: Dataset containing the reference files. Returns @@ -1423,7 +1419,11 @@ def get_reference_levels(dataset): @preserve_float_dtype -def extract_coordinate_points(cube, definition, scheme): +def extract_coordinate_points( + cube: Cube, + definition: dict[str, ArrayLike], + scheme: str, +) -> Cube: """Extract points from any coordinate with interpolation. Multiple points can also be extracted, by supplying an array of @@ -1434,11 +1434,11 @@ def extract_coordinate_points(cube, definition, scheme): Parameters ---------- - cube : cube + cube: The source cube to extract a point from. - definition : dict(str, float or array of float) + definition: The coordinate - values pairs to extract - scheme : str + scheme: The interpolation scheme. 'linear' or 'nearest'. No default. Returns @@ -1455,7 +1455,7 @@ def extract_coordinate_points(cube, definition, scheme): If the interpolation scheme is not provided or is not recognised. """ msg = f"Unknown interpolation scheme, got {scheme!r}." 
- scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) - if not scheme: + loaded_scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) + if not loaded_scheme: raise ValueError(msg) - return cube.interpolate(definition.items(), scheme=scheme) + return cube.interpolate(definition.items(), scheme=loaded_scheme) From d10de1edcf2f4b3eab722cdda9f8283af00a4944 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:18:01 +0200 Subject: [PATCH 15/85] Make new dataset methods private --- esmvalcore/dataset.py | 10 +++++----- tests/unit/test_dataset.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index d471dde5d8..1391916637 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -136,7 +136,7 @@ def __init__(self, **facets: FacetValue) -> None: for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) - if not self.is_derived() and self.facets.get( + if not self._is_derived() and self.facets.get( "force_derivation", False, ): @@ -174,14 +174,14 @@ def from_recipe( return datasets_from_recipe(recipe, session) - def is_derived(self) -> bool: + def _is_derived(self) -> bool: """Return ``True`` for derived variables, ``False`` otherwise.""" return bool(self.facets.get("derive", False)) - def derivation_necessary(self) -> bool: + def _derivation_necessary(self) -> bool: """Return ``True`` if derivation is necessary, ``False`` otherwise.""" # If variable cannot be derived, derivation is not necessary - if not self.is_derived(): + if not self._is_derived(): return False # If forced derivation is requested, derivation is necessary @@ -651,7 +651,7 @@ def add_supplementary(self, **facets: FacetValue) -> None: **facets Facets describing the supplementary variable. 
""" - if self.is_derived(): + if self._is_derived(): facets.setdefault("derive", False) if self.facets.get("force_derivation", False): facets.setdefault("force_derivation", False) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 9624f697f6..5ae4b6d520 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2147,7 +2147,7 @@ def test_derivation_necessary_no_derivation(): type="sat", timerange="1980/2000", ) - assert not dataset.derivation_necessary() + assert not dataset._derivation_necessary() def test_derivation_necessary_no_force_derivation_no_files(): @@ -2161,7 +2161,7 @@ def test_derivation_necessary_no_force_derivation_no_files(): timerange="1980/2000", derive=True, ) - assert dataset.derivation_necessary() + assert dataset._derivation_necessary() def test_derivation_necessary_no_force_derivation(tmp_path, session): @@ -2182,7 +2182,7 @@ def test_derivation_necessary_no_force_derivation(tmp_path, session): input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", ) asr_file.touch() - assert not dataset.derivation_necessary() + assert not dataset._derivation_necessary() def test_derivation_necessary_force_derivation(tmp_path, session): @@ -2204,7 +2204,7 @@ def test_derivation_necessary_force_derivation(tmp_path, session): input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", ) asr_file.touch() - assert dataset.derivation_necessary() + assert dataset._derivation_necessary() def test_force_derivation_no_derived(): From 732386623abd9f38577f977255a02226300f071f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:27:31 +0200 Subject: [PATCH 16/85] Small fix --- esmvalcore/_recipe/recipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e14d9201e2..e34c7fc52b 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -97,9 +97,9 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str: ) ) raise RecipeError(msg) - dataset_name = str(facets[special_name]) + special_name = str(facets[special_name]) - return dataset_name + return special_name def _update_target_levels( From 3ab2cdfa1ee7e8f7177bce35643983ac92264697 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:36:54 +0200 Subject: [PATCH 17/85] Fix test --- tests/integration/preprocessor/_derive/test_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/preprocessor/_derive/test_interface.py b/tests/integration/preprocessor/_derive/test_interface.py index 4c11466fcf..8491c0eb2c 100644 --- a/tests/integration/preprocessor/_derive/test_interface.py +++ b/tests/integration/preprocessor/_derive/test_interface.py @@ -202,7 +202,7 @@ def test_get_required_with_fx(): reference = [ {"short_name": "thetao"}, - {"short_name": "volcello", "mip": "fx"}, + {"short_name": "volcello", "mip": "fx", "ensemble": "r0i0p0"}, ] assert variables == reference From 099349f5575177eee6220525af844791b5fe1504 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:39:02 +0200 Subject: [PATCH 18/85] Fix mock --- tests/integration/preprocessor/_derive/test_interface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/preprocessor/_derive/test_interface.py b/tests/integration/preprocessor/_derive/test_interface.py index 8491c0eb2c..502ef3ba75 100644 --- a/tests/integration/preprocessor/_derive/test_interface.py +++ 
b/tests/integration/preprocessor/_derive/test_interface.py @@ -22,7 +22,10 @@ def mock_cubes(): @pytest.fixture def patched_derive(mocker): """Fixture for mocked derivation scripts.""" - mocker.patch("iris.cube.CubeList", side_effect=lambda x: x) + mocker.patch( + "esmvalcore.preprocessor._derive.CubeList", + side_effect=lambda x: x, + ) mocker.patch.object(_derive, "ALL_DERIVED_VARIABLES", autospec=True) mocker.patch.object(_derive, "logger", autospec=True) From 86b308b58b798ea0fe4ab13a29060b738e3f9a38 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:47:48 +0200 Subject: [PATCH 19/85] 100% test coverage --- .../unit/preprocessor/_derive/test_vegfrac.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/unit/preprocessor/_derive/test_vegfrac.py diff --git a/tests/unit/preprocessor/_derive/test_vegfrac.py b/tests/unit/preprocessor/_derive/test_vegfrac.py new file mode 100644 index 0000000000..6c86037e23 --- /dev/null +++ b/tests/unit/preprocessor/_derive/test_vegfrac.py @@ -0,0 +1,23 @@ +"""Test derivation of `vegfrac`.""" + +from esmvalcore.preprocessor._derive import vegfrac + + +def test_vegfrac_required_cmip5(): + derived_var = vegfrac.DerivedVariable() + output = derived_var.required("CMIP5") + assert output == [ + {"short_name": "baresoilFrac"}, + {"short_name": "residualFrac"}, + {"short_name": "sftlf", "mip": "fx", "ensemble": "r0i0p0"}, + ] + + +def test_vegfrac_required_cmip6(): + derived_var = vegfrac.DerivedVariable() + output = derived_var.required("CMIP6") + assert output == [ + {"short_name": "baresoilFrac"}, + {"short_name": "residualFrac"}, + {"short_name": "sftlf", "mip": "fx"}, + ] From 369a8114699c6d06bb731a060025bedff7ea34cd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 09:11:57 +0200 Subject: [PATCH 20/85] Clean doc --- doc/develop/derivation.rst | 34 ++++++++++++++++++++++++---------- doc/quickstart/configure.rst | 6 +++--- doc/recipe/preprocessor.rst | 8 +------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/doc/develop/derivation.rst b/doc/develop/derivation.rst index 9d097ff843..c8516d8414 100644 --- a/doc/develop/derivation.rst +++ b/doc/develop/derivation.rst @@ -14,6 +14,11 @@ A typical example looks like this: .. code-block:: py """Derivation of variable `dummy`.""" + + from iris.cube import Cube, CubeList + + from esmvalcore.typing import Facets, FacetValue + from ._baseclass import DerivedVariableBase @@ -21,19 +26,19 @@ A typical example looks like this: """Derivation of variable `dummy`.""" @staticmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" - mip = 'fx' - if project == 'CMIP6': - mip = 'Ofx' + mip = "fx" + if project == "CMIP6": + mip = "Ofx" required = [ - {'short_name': 'var_a'}, - {'short_name': 'var_b', 'mip': mip, 'optional': True}, + {"short_name": "var_a"}, + {"short_name": "var_b", "mip": mip, "optional": True}, ] return required @staticmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """Compute `dummy`.""" # `cubes` is a CubeList containing all required variables. @@ -42,7 +47,7 @@ A typical example looks like this: # Return single cube at the end return cube -The static function ``required(project)`` returns a ``list`` of ``dict`` +The static function ``required(project)`` returns a :obj:`list` of :obj:`~esmvalcore.typing.Facets` containing all required variables for deriving the derived variable. 
Its only argument is the ``project`` of the specific dataset. In this particular example script, the derived variable ``dummy`` is derived from ``var_a`` and @@ -56,5 +61,14 @@ Otherwise, the tool will fail if not all required variables are available for all datasets. The actual derivation takes place in the static function ``calculate(cubes)`` -which returns a single ``cube`` containing the derived variable. Its only -argument ``cubes`` is a ``CubeList`` containing all required variables. +which returns a single :class:`~iris.cube.Cube` containing the derived +variable. Its only argument ``cubes`` is a :class:`~iris.cube.CubeList` +containing all required variables. + +If no MIP table entry for the derived variable exists for the given ``mip``, +the tool will also look in other ``mip`` tables for the same ``project`` to find +the definition of derived variables. To contribute a completely new derived +variable, it is necessary to define a name for it and to provide the +corresponding CMOR table. This is to guarantee the proper metadata definition +is attached to the derived data. Such custom CMOR tables are collected as part +of the `ESMValCore package `_. diff --git a/doc/quickstart/configure.rst b/doc/quickstart/configure.rst index 7a78f850b3..d20d67c8e2 100644 --- a/doc/quickstart/configure.rst +++ b/doc/quickstart/configure.rst @@ -1011,7 +1011,7 @@ related to CMOR table settings available: from the ``esmvalcore/cmor/tables/custom`` directory) and it is possible to use variables with a ``mip`` which is different from the MIP table in which they are defined. Note that this option is always enabled for - :ref:`derived ` variables. + :ref:`derived variables `. * ``cmor_path``: path to the CMOR table. Relative paths are with respect to `esmvalcore/cmor/tables`_. Defaults to the value provided in ``cmor_type`` written in lower case. @@ -1026,8 +1026,8 @@ Custom CMOR tables As mentioned in the previous section, the CMOR tables of projects that use ``cmor_strict: false`` will be extended with custom CMOR tables. -For derived variables (the ones with ``derive: true`` in the recipe), the -custom CMOR tables will always be considered. +For :ref:`derived variables ` (the ones with ``derive: +true`` in the recipe), the custom CMOR tables will always be considered. By default, these custom tables are loaded from `esmvalcore/cmor/tables/custom `_. However, by using the special project ``custom`` in the diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index bedec06cbe..a1d603f289 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -196,17 +196,11 @@ case of this operation is the evaluation of a variable which is only available in an observational dataset but not in the models. In this case a derivation function is provided by the ESMValCore in order to calculate the variable and perform the comparison. For example, several observational datasets deliver -total column ozone as observed variable (`toz`), but CMIP models only provide +total column ozone as observed variable (``toz``), but CMIP models only provide the ozone 3D field. In this case, a derivation function is provided to vertically integrate the ozone and obtain total column ozone for direct comparison with the observations. -The tool will also look in other ``mip`` tables for the same ``project`` to find -the definition of derived variables. To contribute a completely new derived -variable, it is necessary to define a name for it and to provide the -corresponding CMOR table. 
This is to guarantee the proper metadata definition -is attached to the derived data. Such custom CMOR tables are collected as part -of the `ESMValCore package `_. By default, the variable derivation will be applied only if the variable is not already available in the input data, but the derivation can be forced by setting the ``force_derivation`` flag. From c2a3d81e4b7a3ba86bc27d1936d37eebbcaae53d Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 10:12:43 +0200 Subject: [PATCH 21/85] 100% diff coverage --- .../preprocessor/_derive/test_interface.py | 6 ++ tests/integration/recipe/test_recipe.py | 86 +++++++++++++++++++ tests/integration/test_local.py | 16 +++- .../_multimodel/test_multimodel.py | 33 ++++++- 4 files changed, 138 insertions(+), 3 deletions(-) diff --git a/tests/integration/preprocessor/_derive/test_interface.py b/tests/integration/preprocessor/_derive/test_interface.py index 502ef3ba75..28f41f8693 100644 --- a/tests/integration/preprocessor/_derive/test_interface.py +++ b/tests/integration/preprocessor/_derive/test_interface.py @@ -211,6 +211,12 @@ def test_get_required_with_fx(): assert variables == reference +def test_get_required_invalid_var(): + msg = r"Cannot derive variable '_invalid_var_'" + with pytest.raises(NotImplementedError, match=msg): + get_required("_invalid_var_", "CMIP5") + + def test_derive_nonstandard_nofx(): """Test a specific derivation.""" short_name = "alb" diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index f45c547884..0ab6e12637 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1,3 +1,4 @@ +import inspect import os import re from collections import defaultdict @@ -1542,6 +1543,42 @@ def test_diagnostic_task_provenance( assert os.path.exists(prefix + ".xml") +def test_invalid_diagnostcic_ancestor( + tmp_path, + patched_datafinder, + session, +): + content = dedent( + """ + diagnostics: + diagnostic_name: + themes: + - phys + realms: + - atmos + variables: + tas: + project: CMIP5 + mip: Amon + exp: historical + timerange: 2000/2005 + ensemble: r1i1p1 + additional_datasets: + - dataset: CanESM2 + scripts: + script_name: + script: examples/diagnostic.py + script_name2: + script: examples/diagnostic.py + ancestors: [invalid_*] + """, + ) + + msg = r"Could not find any ancestors matching" + with pytest.raises(RecipeError, match=msg): + get_recipe(tmp_path, content, session) + + def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, PLR0912 content = dedent(""" diagnostics: @@ -2892,6 +2929,55 @@ def test_statistics_missing_operator_no_default_fail( get_recipe(tmp_path, content, session) +def test_check_preprocessor_settings_last_resort( + mocker, + tmp_path, + caplog, + patched_datafinder, + session, +): + # Create mock so that no errors during the regular preprocessor parameter + # checks are raised, but only during the last sanity check + def raise_exc(): + msg = "type error" + raise TypeError(msg) + + mock_args = mocker.Mock(name="args", kind=inspect.Parameter.VAR_POSITIONAL) + mock_bind = mocker.Mock(side_effect=raise_exc) + mock_signature = mocker.Mock( + parameters={"args": mock_args}, + bind=mock_bind, + ) + mocker.patch( + "inspect.signature", + autospec=True, + return_value=mock_signature, + ) + content = dedent(""" + diagnostics: + diagnostic_name: + variables: + chl_default: + short_name: chl + mip: Oyr + timerange: '2000/2010' + additional_datasets: + - project: CMIP5 + dataset: CanESM2 + exp: 
historical + ensemble: r1i1p1 + scripts: null + """) + with pytest.raises(TypeError): + get_recipe(tmp_path, content, session) + log_errors = [r.message for r in caplog.records if r.levelname == "ERROR"] + msg = ( + "Wrong preprocessor function arguments in function " + "'remove_supplementary_variables'" + ) + assert msg in log_errors + + @pytest.mark.parametrize( ("preproc", "option"), [ diff --git a/tests/integration/test_local.py b/tests/integration/test_local.py index 839c0d159b..e2dae85dff 100644 --- a/tests/integration/test_local.py +++ b/tests/integration/test_local.py @@ -8,7 +8,12 @@ import yaml from esmvalcore.config import CFG -from esmvalcore.local import LocalFile, _get_output_file, find_files +from esmvalcore.local import ( + LocalFile, + _get_output_file, + _select_drs, + find_files, +) # Load test configuration with open( @@ -124,3 +129,12 @@ def test_find_files_with_facets(monkeypatch, root): assert sorted([Path(f) for f in input_filelist]) == sorted(ref_files) assert isinstance(input_filelist[0], LocalFile) assert input_filelist[0].facets + + +def test_select_invalid_drs_structure(): + msg = ( + r"drs _INVALID_STRUCTURE_ for CMIP6 project not specified in " + r"config-developer file" + ) + with pytest.raises(KeyError, match=msg): + _select_drs("input_dir", "CMIP6", "_INVALID_STRUCTURE_") diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index 1fd510eb50..7e497f4a1c 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -15,7 +15,10 @@ import esmvalcore.preprocessor._multimodel as mm from esmvalcore.iris_helpers import date2num -from esmvalcore.preprocessor import multi_model_statistics +from esmvalcore.preprocessor import ( + _check_multi_model_settings, + multi_model_statistics, +) from esmvalcore.preprocessor._supplementary_vars import add_ancillary_variable SPAN_OPTIONS = ("overlap", "full") @@ -835,11 +838,21 @@ def test_unify_time_coordinates(): class PreprocessorFile: """Mockup to test output of multimodel.""" - def __init__(self, cube=None, attributes=None): + def __init__( + self, + cube=None, + attributes=None, + filename=None, + settings=None, + ): if cube: self.cubes = [cube] if attributes: self.attributes = attributes + if filename: + self.filename = filename + if settings: + self.settings = settings def wasderivedfrom(self, product): pass @@ -1698,3 +1711,19 @@ def test_get_operator_and_kwargs_operator_missing(statistic): def test_get_stat_identifier(statistic, output): """Test ``_get_stat_identifier``.""" assert mm._get_stat_identifier(statistic) == output + + +def test_differing_multi_model_settings(): + products = [ + PreprocessorFile( + filename="a", + settings={"multi_model_statistics": {"statistics": ["mean"]}}, + ), + PreprocessorFile( + filename="b", + settings={"multi_model_statistics": {"statistics": ["median"]}}, + ), + ] + msg = r"Unable to combine differing multi-dataset settings for a and b" + with pytest.raises(ValueError, match=msg): + _check_multi_model_settings(products) From a3dab123cfdd2f6bc2f04b6eec2e4c5fcac6b9bf Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 10:23:46 +0200 Subject: [PATCH 22/85] Try to please Codacy --- esmvalcore/dataset.py | 4 ++-- esmvalcore/exceptions.py | 4 ++-- esmvalcore/local.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 1391916637..9232f00d6c 100644 --- 
a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -498,7 +498,7 @@ def __repr__(self) -> str: "short_name", ) - def facets2str(facets): + def facets2str(facets: Facets) -> str: view = {k: facets[k] for k in first_keys if k in facets} for key, value in sorted(facets.items()): if key not in first_keys: @@ -557,7 +557,7 @@ def summary(self, shorten: bool = False) -> str: title = self.__class__.__name__ txt = f"{title}: " + self._get_joined_summary_facets(", ") - def supplementary_summary(dataset): + def supplementary_summary(dataset: Dataset) -> str: return ", ".join( str(dataset.facets[k]) for k in self._SUMMARY_FACETS diff --git a/esmvalcore/exceptions.py b/esmvalcore/exceptions.py index f57dcbaaa2..bdabb4f903 100644 --- a/esmvalcore/exceptions.py +++ b/esmvalcore/exceptions.py @@ -35,10 +35,10 @@ class InvalidConfigParameter(Error, SuppressedError): class RecipeError(Error): """Recipe contains an error.""" - def __init__(self, msg): + def __init__(self, msg: str) -> None: super().__init__(msg) self.message = msg - self.failed_tasks = [] + self.failed_tasks: list[RecipeError] = [] class InputFilesNotFound(RecipeError): diff --git a/esmvalcore/local.py b/esmvalcore/local.py index df0a0df225..9b30df924b 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -325,7 +325,7 @@ def _truncate_dates(date: str, file_date: str) -> tuple[int, int]: def _select_files( filenames: list[LocalFile], - timerange: list, + timerange: FacetValue, ) -> list[LocalFile]: """Select files containing data between a given timerange. @@ -335,6 +335,7 @@ def _select_files( Otherwise, the file selection occurs taking into account the time resolution of the file. """ + timerange = str(timerange) if "*" in timerange: # TODO: support * combined with a period return filenames From 001eafa3ddc02beeb7b2d959e9d3706d95a19f4a Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 10:30:49 +0200 Subject: [PATCH 23/85] Make tests work without ESMValTool installation --- tests/integration/recipe/test_recipe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 0ab6e12637..b127bcc3e7 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1548,8 +1548,10 @@ def test_invalid_diagnostcic_ancestor( patched_datafinder, session, ): + script = tmp_path / "diagnostic.py" + script.write_text("") content = dedent( - """ + f""" diagnostics: diagnostic_name: themes: @@ -1567,9 +1569,9 @@ def test_invalid_diagnostcic_ancestor( - dataset: CanESM2 scripts: script_name: - script: examples/diagnostic.py + script: {script} script_name2: - script: examples/diagnostic.py + script: {script} ancestors: [invalid_*] """, ) From debd589c229f31ab6a74c17fb71c423d3598a0e5 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 11:03:32 +0200 Subject: [PATCH 24/85] 100% diff coverage for real --- esmvalcore/preprocessor/__init__.py | 2 +- .../unit/preprocessor/test_preprocessor_file.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index bbc8313127..3f275fb439 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -554,7 +554,7 @@ def check(self) -> None: def apply(self, step: str, debug: bool = False) -> None: """Apply preprocessor step to product.""" if step not in self.settings: - msg = f"PreprocessorFile {self} has 
no settings for step {step}" + msg = f"{self} has no settings for step {step}" raise ValueError(msg) self.cubes = preprocess( self.cubes, diff --git a/tests/unit/preprocessor/test_preprocessor_file.py b/tests/unit/preprocessor/test_preprocessor_file.py index 82fbdf7522..9d9b1f1a60 100644 --- a/tests/unit/preprocessor/test_preprocessor_file.py +++ b/tests/unit/preprocessor/test_preprocessor_file.py @@ -169,3 +169,20 @@ def test_save(mock_preprocess): ), mock.call().__getitem__(0), ] + + +def test_apply_invalid_settings(): + product = PreprocessorFile(filename=Path("test"), settings={}) + msg = r"PreprocessorFile: test has no settings for step invalid_step" + with pytest.raises(ValueError, match=msg): + product.apply("invalid_step") + + +@pytest.mark.parametrize( + ("cubes", "output"), + [(None, True), (CubeList([]), False)], +) +def test_is_closed(cubes, output): + product = PreprocessorFile(filename=Path("test")) + product.cubes = cubes + assert product.is_closed is output From c3df13eb3a5d2e1cb00992aaf6ce4ea967ae08f3 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 09:49:15 +0200 Subject: [PATCH 25/85] Added Dataset.input_datasets --- esmvalcore/dataset.py | 47 ++++++++++ tests/unit/test_dataset.py | 173 ++++++++++++++++++++++--------------- 2 files changed, 149 insertions(+), 71 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 9232f00d6c..90a470a376 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -33,6 +33,7 @@ _get_start_end_date, ) from esmvalcore.preprocessor import preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -132,6 +133,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[File] | None = None self._file_globs: Sequence[Path] | None = None + self._input_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -194,6 +196,51 @@ def _derivation_necessary(self) -> bool: ds_copy.supplementaries = [] return not ds_copy.files + def _get_input_datasets(self) -> list[Dataset]: + """Get input datasets.""" + input_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], + self.facets["project"], + ) + + for required_facets in required_vars_facets: + input_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + input_dataset.facets = { + k: v for k, v in input_dataset.facets.items() if k in keep + } + input_dataset.facets.update(required_facets) + input_dataset.augment_facets() + input_datasets.append(input_dataset) + + return input_datasets + + @property + def input_datasets(self) -> list[Dataset]: + """Get input datasets. + + For non-derived variables (i.e., those with facet ``derive=False``), + this will simply return the dataset itself in a list. + + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. 
+ + """ + if self._input_datasets is not None: + return self._input_datasets + + if not self._derivation_necessary(): + input_datasets = [self] + else: + input_datasets = self._get_input_datasets() + + self._input_datasets = input_datasets + return input_datasets + def _file_to_dataset( self, file: esgf.ESGFFile | local.LocalFile, diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5ae4b6d520..a3c2f35359 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2137,74 +2137,57 @@ def test_get_extra_facets_native6(): } +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + def test_derivation_necessary_no_derivation(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="tas", - tier=2, - type="sat", - timerange="1980/2000", - ) - assert not dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + assert dataset._derivation_necessary() is False def test_derivation_necessary_no_force_derivation_no_files(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) - assert dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._derivation_necessary() is True def test_derivation_necessary_no_force_derivation(tmp_path, session): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert not dataset._derivation_necessary() + lwcre.touch() + + assert dataset._derivation_necessary() is False def test_derivation_necessary_force_derivation(tmp_path, session): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", - exp="historical", - grid="gn", - ensemble="r1i1p1f1", derive=True, force_derivation=True, ) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert dataset._derivation_necessary() + lwcre_file.touch() + + assert dataset._derivation_necessary() is True def test_force_derivation_no_derived(): @@ -2214,19 +2197,11 @@ def test_force_derivation_no_derived(): ) with pytest.raises(ValueError, match=msg): - Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="tas", - force_derivation=True, - ) + Dataset(**OBS6_SAT_FACETS, short_name="tas", force_derivation=True) with pytest.raises(ValueError, match=msg): Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="tas", derive=False, force_derivation=True, @@ -2235,21 +2210,17 @@ def test_force_derivation_no_derived(): def test_add_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - 
dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) - dataset.add_supplementary(short_name="areacella", mip="fx") + dataset.add_supplementary(short_name="pr") expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="fx", - short_name="areacella", + **OBS6_SAT_FACETS, + short_name="pr", derive=False, force_derivation=False, ) @@ -2258,26 +2229,86 @@ def test_add_supplementary_to_derived(): def test_add_derived_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) dataset.add_supplementary( - short_name="asr", + short_name="swcre", derive=True, force_derivation=True, ) expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="asr", + **OBS6_SAT_FACETS, + short_name="swcre", derive=True, force_derivation=True, ) assert dataset.supplementaries[0] == expected_supplementary + + +def test_input_datasets_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + + expected_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_dataset in expected_datasets: + expected_dataset.session = dataset.session + + assert dataset.input_datasets == expected_datasets + + +def test_input_datasets_no_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + dataset.add_supplementary(short_name="pr") + + assert dataset.input_datasets == [dataset] + + +def test_input_datasets_no_force_derivation(tmp_path, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + + assert dataset.input_datasets == [dataset] + + +def test_input_datasets_no_derivation_available(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) + + msg = r"Cannot derive variable 'tas': no derivation script available" + with pytest.raises(NotImplementedError, match=msg): + dataset.input_datasets # noqa: B018 From e7948173aa83ef1a55256264769855d81bc89f25 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 09:50:24 +0200 Subject: [PATCH 26/85] Shorter code --- tests/unit/test_dataset.py | 109 +++++++++++++------------------------ 1 file changed, 38 insertions(+), 71 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5ae4b6d520..149bdb11bc 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2137,74 +2137,57 @@ def test_get_extra_facets_native6(): } +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + def 
test_derivation_necessary_no_derivation(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="tas", - tier=2, - type="sat", - timerange="1980/2000", - ) - assert not dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + assert dataset._derivation_necessary() is False def test_derivation_necessary_no_force_derivation_no_files(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) - assert dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._derivation_necessary() is True def test_derivation_necessary_no_force_derivation(tmp_path, session): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert not dataset._derivation_necessary() + lwcre.touch() + + assert dataset._derivation_necessary() is False def test_derivation_necessary_force_derivation(tmp_path, session): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", - exp="historical", - grid="gn", - ensemble="r1i1p1f1", derive=True, force_derivation=True, ) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert dataset._derivation_necessary() + lwcre_file.touch() + + assert dataset._derivation_necessary() is True def test_force_derivation_no_derived(): @@ -2214,19 +2197,11 @@ def test_force_derivation_no_derived(): ) with pytest.raises(ValueError, match=msg): - Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="tas", - force_derivation=True, - ) + Dataset(**OBS6_SAT_FACETS, short_name="tas", force_derivation=True) with pytest.raises(ValueError, match=msg): Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="tas", derive=False, force_derivation=True, @@ -2235,21 +2210,17 @@ def test_force_derivation_no_derived(): def test_add_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) - dataset.add_supplementary(short_name="areacella", mip="fx") + dataset.add_supplementary(short_name="pr") expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="fx", - short_name="areacella", + **OBS6_SAT_FACETS, + short_name="pr", derive=False, force_derivation=False, ) @@ -2258,25 +2229,21 @@ def test_add_supplementary_to_derived(): def test_add_derived_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) dataset.add_supplementary( - short_name="asr", 
+ short_name="swcre", derive=True, force_derivation=True, ) expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="asr", + **OBS6_SAT_FACETS, + short_name="swcre", derive=True, force_derivation=True, ) From b971d50ce19646988a482dc58e066e272ebe8d3e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 10:12:14 +0200 Subject: [PATCH 27/85] Dataset.set_version can handle derived variables now --- esmvalcore/_recipe/recipe.py | 19 +------- esmvalcore/dataset.py | 20 +++++++-- tests/integration/recipe/test_recipe.py | 5 +++ tests/unit/recipe/test_recipe.py | 22 --------- tests/unit/test_dataset.py | 59 ++++++++++++++++++++----- 5 files changed, 71 insertions(+), 54 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e34c7fc52b..397be02596 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -598,23 +598,6 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() - - def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], @@ -649,7 +632,7 @@ def _get_preprocessor_products( else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) _schedule_for_download(input_datasets) _log_input_files(input_datasets) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 90a470a376..5da0ca8eb8 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -657,15 +657,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for input_dataset in self.input_datasets: + version = self._get_version(input_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index b127bcc3e7..8cf9384b39 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -131,6 +131,11 @@ def get_required(short_name, _): "get_required", get_required, ) + monkeypatch.setattr( + esmvalcore.dataset, + 
"get_required", + get_required, + ) DEFAULT_DOCUMENTATION = dedent(""" diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 06299781ad..640661d089 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -912,28 +912,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a3c2f35359..0cea37248d 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1609,7 +1609,7 @@ def test_find_files_non_esgf_projects(mocker, project, monkeypatch): assert tas._file_globs == mock.sentinel.file_globs -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.local.LocalFile("/path/to/v1/tas.nc") @@ -1625,6 +1625,53 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + +def test_set_version_derive_var(monkeypatch): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + areacella_file = esmvalcore.local.LocalFile("/path/to/areacella.nc") + areacella_file.facets["version"] = "v4" + dataset.supplementaries[0].files = [areacella_file] + + def _get_input_datasets(): + rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc") + rlut_file.facets["version"] = "v1" + rlut_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + ) + rlut_dataset.files = [rlut_file] + rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc") + rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc") + rlutcs_file_1.facets["version"] = "v2" + rlutcs_file_2.facets["version"] = "v3" + rlutcs_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + ) + rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2] + return [rlut_dataset, rlutcs_dataset] + + monkeypatch.setattr(dataset, "_get_input_datasets", _get_input_datasets) + + dataset.set_version() + + assert dataset.facets["version"] == ["v1", "v2", "v3"] + assert dataset.supplementaries[0].facets["version"] == "v4" + + @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"]) def test_update_timerange_from_esgf(mocker, timerange): esgf_files = [ @@ -2137,16 +2184,6 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - "timerange": "1980/2000", -} - - def test_derivation_necessary_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._derivation_necessary() 
is False


From f6b6d22e1c35cf368de64684c3a94e8b9b371b9d Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 10:47:42 +0200
Subject: [PATCH 28/85] Dataset._input_datasets is always list[Dataset]

---
 esmvalcore/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 5da0ca8eb8..659cc390cf 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -133,7 +133,7 @@ def __init__(self, **facets: FacetValue) -> None:
         self._session: Session | None = None
         self._files: Sequence[File] | None = None
         self._file_globs: Sequence[Path] | None = None
-        self._input_datasets: list[Dataset] | None = None
+        self._input_datasets: list[Dataset] = []
 
         for key, value in facets.items():
             self.set_facet(key, deepcopy(value), persist=True)
@@ -230,7 +230,7 @@ def input_datasets(self) -> list[Dataset]:
         or no files for the dataset itself are available.
 
         """
-        if self._input_datasets is not None:
+        if self._input_datasets:
             return self._input_datasets
 
         if not self._derivation_necessary():

From 1f4de86795f7c585f789ca6abb3d32a51409989c Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 11:01:23 +0200
Subject: [PATCH 29/85] Make changes fully backwards-compatible

---
 esmvalcore/dataset.py      | 20 ++++++-------
 tests/unit/test_dataset.py | 57 ++++++++++++++++++++------------------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 9232f00d6c..af86eb7534 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -136,16 +136,6 @@ def __init__(self, **facets: FacetValue) -> None:
         for key, value in facets.items():
             self.set_facet(key, deepcopy(value), persist=True)
 
-        if not self._is_derived() and self.facets.get(
-            "force_derivation",
-            False,
-        ):
-            msg = (
-                "Facet `force_derivation=True` can only be used for derived "
-                "variables (i.e., with facet `derive=True`)"
-            )
-            raise ValueError(msg)
-
     @staticmethod
     def from_recipe(
@@ -178,6 +168,12 @@ def _is_derived(self) -> bool:
         """Return ``True`` for derived variables, ``False`` otherwise."""
         return bool(self.facets.get("derive", False))
 
+    def _is_force_derived(self) -> bool:
+        """Return ``True`` for force-derived variables, ``False`` otherwise."""
+        return self._is_derived() and bool(
+            self.facets.get("force_derivation", False),
+        )
+
     def _derivation_necessary(self) -> bool:
         """Return ``True`` if derivation is necessary, ``False`` otherwise."""
         # If variable cannot be derived, derivation is not necessary
@@ -185,7 +181,7 @@ def _derivation_necessary(self) -> bool:
             return False
 
         # If forced derivation is requested, derivation is necessary
-        if self.facets.get("force_derivation", False):
+        if self._is_force_derived():
             return True
 
         # Otherwise, derivation is necessary if no files for the self dataset
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index 149bdb11bc..3e383f3463 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -2147,6 +2147,45 @@ def test_get_extra_facets_native6():
 }
 
 
+def test_is_derived_no_derivation():
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas")
+    assert 
dataset._is_derived() is False + + +def test_is_derived_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._is_derived() is True + + +def test_is_force_derived_no_derivation_no_force(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + assert dataset._is_force_derived() is False + + +def test_is_force_derived_no_derivation_force(): + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + force_derivation=True, + ) + assert dataset._is_force_derived() is False + + +def test_is_force_derived_derivation_no_force(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._is_force_derived() is False + + +def test_is_force_derived_derivation_force(): + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + assert dataset._is_force_derived() is True + + def test_derivation_necessary_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._derivation_necessary() is False @@ -2190,24 +2229,6 @@ def test_derivation_necessary_force_derivation(tmp_path, session): assert dataset._derivation_necessary() is True -def test_force_derivation_no_derived(): - msg = ( - r"Facet `force_derivation=True` can only be used for derived " - r"variables" - ) - - with pytest.raises(ValueError, match=msg): - Dataset(**OBS6_SAT_FACETS, short_name="tas", force_derivation=True) - - with pytest.raises(ValueError, match=msg): - Dataset( - **OBS6_SAT_FACETS, - short_name="tas", - derive=False, - force_derivation=True, - ) - - def test_add_supplementary_to_derived(): dataset = Dataset( **OBS6_SAT_FACETS, From c6d303bc57517d7199711fc989188339e628d7c2 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 11:30:10 +0200 Subject: [PATCH 30/85] Make Dataset.from_files work with derived variables (no globs yet) --- esmvalcore/dataset.py | 87 +++++++++++-- tests/unit/test_dataset.py | 254 ++++++++++++++++++++++++++++++++++++- 2 files changed, 324 insertions(+), 17 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 9a21d339de..c293eac3e8 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -273,12 +273,73 @@ def _file_to_dataset( return dataset - def _get_available_datasets(self) -> Iterator[Dataset]: + def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. + + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. 
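+
+        For example (an illustrative sketch, mirroring the tests added on
+        this branch): for a dataset with ``short_name='lwcre'``,
+        ``derive=True``, and the glob facet ``type='*'``, a ``type`` value
+        is only yielded if either ``lwcre`` files exist for it directly or
+        files for *all* variables required for derivation (here ``rlut``
+        and ``rlutcs``) are available for it.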
+ """ - dataset_template = self.copy() + datasets_found = False + + # First, if no forced derivation is requested, search for datasets + # based on files from self + if not self._is_force_derived(): + for dataset in self._get_available_datasets(self): + datasets_found = True + yield dataset + + # If forced derivation is requested or no datasets based on files from + # self have been found, search for datasets based on files from input + # datasets + if self._is_force_derived() or not datasets_found: + all_datasets: list[list[tuple[dict, Dataset]]] = [] + for input_dataset in self._get_input_datasets(): + all_datasets.append([]) + for expanded_ds in self._get_available_datasets( + input_dataset, + ): + updated_facets = {} + for key, value in self.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + new_ds = self.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries + + all_datasets[-1].append((updated_facets, new_ds)) + + # Only consider those datasets that contain all input variables + # necessary for derivation + for updated_facets, new_ds in all_datasets[0]: + other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] + if all(updated_facets in facets for facets in other_facets): + yield new_ds + else: + logger.debug( + "Not all necessary input variables to derive '%s' are " + "available for dataset %s", + self["short_name"], + updated_facets, + ) + + def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: + """Yield datasets based on the available files. + + This function requires that self.facets['mip'] is not a glob pattern. + + Does not take variable derivation into account, i.e., datasets + potentially available through variable derivation are ignored. To + consider derived variables properly, use the function + :func:`_get_all_available_datasets`. + + """ + dataset_template = dataset.copy() dataset_template.supplementaries = [] if _isglob(dataset_template.facets.get("timerange")): # Remove wildcard `timerange` facet, because data finding cannot @@ -289,31 +350,31 @@ def _get_available_datasets(self) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - dataset = self._file_to_dataset(file) + new_dataset = self._file_to_dataset(file) # Filter out identical datasets facetset = frozenset( (f, frozenset(v) if isinstance(v, list) else v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() ) if facetset not in seen: seen.add(facetset) if any( _isglob(v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() if f != "timerange" ): - partially_defined.append((dataset, file)) + partially_defined.append((new_dataset, file)) else: - dataset._update_timerange() # noqa: SLF001 - dataset._supplementaries_from_files() # noqa: SLF001 + new_dataset._update_timerange() # noqa: SLF001 + new_dataset._supplementaries_from_files() # noqa: SLF001 expanded = True - yield dataset + yield new_dataset # Only yield datasets with globs if there is no better alternative - for dataset, file in partially_defined: + for new_dataset, file in partially_defined: msg = ( - f"{dataset} with unexpanded wildcards, created from file " + f"{new_dataset} with unexpanded wildcards, created from file " f"{file} with facets {file.facets}. Are the missing facets " "in the path to the file?" 
if isinstance(file, local.LocalFile) @@ -327,7 +388,7 @@ def _get_available_datasets(self) -> Iterator[Dataset]: "because it still contains wildcards.", msg, ) - yield dataset + yield new_dataset def from_files(self) -> Iterator[Dataset]: """Create datasets based on the available files. @@ -378,7 +439,7 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 + for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 expanded = True yield dataset diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 3c2df18c4c..f28c0dbe79 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1181,6 +1181,252 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +@pytest.fixture +def lwcre_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def rlut_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs + + +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_input_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_input_dataset.session = session + + assert dataset.input_datasets == [expected_input_dataset] + assert expected_input_dataset.files == [lwcre_file] + + +def test_from_files_with_derived(rlut_file, rlutcs_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + 
assert datasets[0].supplementaries[0].files == [] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert dataset.input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + +def test_from_files_with_derived_no_force_derivation( + lwcre_file, + rlut_file, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_input_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_input_dataset.session = session + + assert dataset.input_datasets == [expected_input_dataset] + assert expected_input_dataset.files == [lwcre_file] + + +def test_from_files_with_derived_force_derivation( + lwcre_file, + rlut_file, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + 
standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert dataset.input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -2239,10 +2485,10 @@ def test_derivation_necessary_no_force_derivation(tmp_path, session): input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - lwcre = esmvalcore.local.LocalFile( + lwcre_file = esmvalcore.local.LocalFile( input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - lwcre.touch() + lwcre_file.touch() assert dataset._derivation_necessary() is False @@ -2356,10 +2602,10 @@ def test_input_datasets_no_force_derivation(tmp_path, session): input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - lwcre = esmvalcore.local.LocalFile( + lwcre_file = esmvalcore.local.LocalFile( input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - lwcre.touch() + lwcre_file.touch() assert dataset.input_datasets == [dataset] From 40147fb76267f3279c531034367d0dff1219c410 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 12:16:03 +0200 Subject: [PATCH 31/85] Added test for derived variable with glob --- tests/unit/test_dataset.py | 121 +++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index f28c0dbe79..08465f648b 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1193,6 +1193,18 @@ def lwcre_file(tmp_path): return lwcre +@pytest.fixture +def lwcre_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + @pytest.fixture def rlut_file(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" @@ -1217,6 +1229,18 @@ def rlutcs_file(tmp_path): return rlutcs +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr + + def test_from_files_with_derived_no_derivation(lwcre_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) @@ -1259,10 +1283,89 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): ] expected_input_dataset.session = session - assert dataset.input_datasets == [expected_input_dataset] + assert datasets[0].input_datasets == [expected_input_dataset] assert expected_input_dataset.files == [lwcre_file] +def test_from_files_with_derived_no_derivation_glob( + lwcre_file, + lwcre_file_ground, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS_GLOB, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + 
expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[1].files == [lwcre_file] + + expected_input_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_input_datasets, + strict=True, + ): + assert dataset.input_datasets == [expected] + assert expected_input_datasets[0].files == [lwcre_file_ground] + assert expected_input_datasets[1].files == [lwcre_file] + + def test_from_files_with_derived(rlut_file, rlutcs_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) @@ -1306,7 +1409,7 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): for expected_ds in expected_input_datasets: expected_ds.session = session - assert dataset.input_datasets == expected_input_datasets + assert datasets[0].input_datasets == expected_input_datasets assert expected_input_datasets[0].files == [rlut_file] assert expected_input_datasets[1].files == [rlutcs_file] @@ -1358,7 +1461,7 @@ def test_from_files_with_derived_no_force_derivation( ] expected_input_dataset.session = session - assert dataset.input_datasets == [expected_input_dataset] + assert datasets[0].input_datasets == [expected_input_dataset] assert expected_input_dataset.files == [lwcre_file] @@ -1422,7 +1525,7 @@ def test_from_files_with_derived_force_derivation( for expected_ds in expected_input_datasets: expected_ds.session = session - assert dataset.input_datasets == expected_input_datasets + assert datasets[0].input_datasets == expected_input_datasets assert expected_input_datasets[0].files == [rlut_file] assert expected_input_datasets[1].files == [rlutcs_file] @@ -1881,6 +1984,16 @@ def test_set_version_non_derived_var(): } +OBS6_SAT_FACETS_GLOB = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "*", + "timerange": "1980/2000", +} + + def test_set_version_derive_var(monkeypatch): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="areacella") From 6ec04fc41c7c17fd85ec7d7f05bcdb33e743bbf6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 12:16:39 +0200 Subject: [PATCH 32/85] Better var name --- tests/unit/test_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 3e383f3463..68e8ceed05 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2202,10 +2202,10 @@ def test_derivation_necessary_no_force_derivation(tmp_path, 
session): input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - lwcre = esmvalcore.local.LocalFile( + lwcre_file = esmvalcore.local.LocalFile( input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - lwcre.touch() + lwcre_file.touch() assert dataset._derivation_necessary() is False From ea3386e4b062fcb7ce75ee5fe8bbd2324b4aa97d Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:12:54 +0200 Subject: [PATCH 33/85] Update esmvalcore/dataset.py Co-authored-by: Bouwe Andela --- esmvalcore/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index af86eb7534..64dd892096 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -571,7 +571,7 @@ def supplementary_summary(dataset: Dataset) -> str: return txt - def __getitem__(self, key: Any) -> FacetValue: + def __getitem__(self, key: str) -> FacetValue: """Get a facet value.""" return self.facets[key] From efa2ac1b59ee7d44dddbe4083e3147f35a2a2928 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 13:41:16 +0200 Subject: [PATCH 34/85] Add further tests for Dataset.from_files with globs --- esmvalcore/dataset.py | 3 +- tests/unit/test_dataset.py | 265 +++++++++++++++++++++++++++++++++++-- 2 files changed, 256 insertions(+), 12 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index f9934d0f23..b53445bbf0 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -323,8 +323,9 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 else: logger.debug( "Not all necessary input variables to derive '%s' are " - "available for dataset %s", + "available for %s with facets %s", self["short_name"], + new_ds.summary(shorten=True), updated_facets, ) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 08465f648b..a3467bdb6a 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,3 +1,4 @@ +import logging import textwrap from collections import defaultdict from pathlib import Path @@ -1217,6 +1218,18 @@ def rlut_file(tmp_path): return rlut +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + @pytest.fixture def rlutcs_file(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" @@ -1294,7 +1307,11 @@ def test_from_files_with_derived_no_derivation_glob( session, ): """Test `from_files` with derived variable and supplementary.""" - dataset = Dataset(**OBS6_SAT_FACETS_GLOB, short_name="lwcre", derive=True) + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) dataset.add_supplementary(short_name="pr") dataset.session = session @@ -1314,7 +1331,9 @@ def test_from_files_with_derived_no_derivation_glob( assert datasets == expected_datasets assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] expected_input_datasets = [ Dataset( @@ -1414,6 +1433,74 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): assert expected_input_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_glob( + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, 
+ caplog, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + with caplog.at_level(logging.DEBUG): + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = ( + "Not all necessary input variables to derive 'lwcre' are available " + "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " + "{'type': 'ground'}" + ) + assert msg in log_debugs + + def test_from_files_with_derived_no_force_derivation( lwcre_file, rlut_file, @@ -1465,6 +1552,94 @@ def test_from_files_with_derived_no_force_derivation( assert expected_input_dataset.files == [lwcre_file] +def test_from_files_with_derived_no_force_derivation_glob( + lwcre_file, + lwcre_file_ground, + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] + assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in 
expected_input_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_input_datasets, + strict=True, + ): + assert dataset.input_datasets == [expected] + assert expected_input_datasets[0].files == [lwcre_file_ground] + assert expected_input_datasets[1].files == [lwcre_file] + + def test_from_files_with_derived_force_derivation( lwcre_file, rlut_file, @@ -1530,6 +1705,84 @@ def test_from_files_with_derived_force_derivation( assert expected_input_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 + lwcre_file, + lwcre_file_ground, + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, + caplog, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + with caplog.at_level(logging.DEBUG): + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = ( + "Not all necessary input variables to derive 'lwcre' are available " + "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " + "{'type': 'ground'}" + ) + assert msg in log_debugs + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -1984,16 +2237,6 @@ def test_set_version_non_derived_var(): } -OBS6_SAT_FACETS_GLOB = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "*", - "timerange": "1980/2000", -} - - def test_set_version_derive_var(monkeypatch): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="areacella") From f9c47a940540abe0b691d30e3927164ce5523c3f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 14:09:38 +0200 Subject: [PATCH 35/85] Update _dataset_from_files to new Dataset.from_files --- esmvalcore/_recipe/recipe.py | 3 +- 
esmvalcore/_recipe/to_datasets.py | 92 +++++++++---------------------- 2 files changed, 27 insertions(+), 68 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 397be02596..8d3af3ce7d 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -51,7 +51,6 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, _get_input_datasets, _representative_datasets, ) @@ -231,7 +230,7 @@ def _get_default_settings(dataset: Dataset) -> dict[str, Any]: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7cd17bdbb0..458db07498 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -428,9 +428,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -441,53 +439,32 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue + for expanded_ds in dataset.from_files(): + updated_facets = {} + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + else: + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + unexpanded_globs, + ) + errors.append(msg) + continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds = dataset.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] - if all(updated_facets in facets for facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, - ) + result.append(new_ds) if errors: raise RecipeError("\n".join(errors)) @@ -538,27 +515,10 @@ def _report_unexpanded_globs( return msg -def 
_derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files - - def _get_input_datasets(dataset: Dataset) -> list[Dataset]: """Determine the input datasets needed for deriving `dataset`.""" facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) + if not dataset._derivation_necessary(): # noqa: SLF001 return [dataset] # Configure input datasets needed to derive variable From 3de7bc85ca1ffe0fe0fc5dfe157f971e72beda0b Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 14:10:18 +0200 Subject: [PATCH 36/85] Move _fix_cmip5_fx_ensemble to _get_preprocessor_products --- esmvalcore/_recipe/recipe.py | 23 +++++++++++++++++++++++ esmvalcore/_recipe/to_datasets.py | 23 ----------------------- tests/unit/recipe/test_recipe.py | 20 ++++++++++++++++++++ tests/unit/recipe/test_to_datasets.py | 20 -------------------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 8d3af3ce7d..700e205e81 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -597,6 +597,28 @@ def _allow_skipping(dataset: Dataset) -> bool: ) +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() + + def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], @@ -620,6 +642,7 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 458db07498..f6bf57fbe2 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -188,28 +188,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], 
- dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -534,7 +512,6 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: } input_dataset.facets.update(input_facets) input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) if input_facets.get("optional") and not input_dataset.files: logger.info( "Skipping: no data found for %s which is marked as 'optional'", diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 640661d089..d7c6c90178 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -965,3 +965,23 @@ def test_update_extract_shape_rel_shapefile(shapefile, session, tmp_path): / "ar6.shp" ) assert settings["extract_shape"]["shapefile"] == ar6_file + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 20439a1d07..f9d01881ca 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -347,26 +347,6 @@ def from_files(_): to_datasets._dataset_from_files(dataset) -def test_fix_cmip5_fx_ensemble(monkeypatch): - def find_files(self): - if self.facets["ensemble"] == "r0i0p0": - self._files = ["file1.nc"] - - monkeypatch.setattr(Dataset, "find_files", find_files) - - dataset = Dataset( - dataset="dataset1", - short_name="orog", - mip="fx", - project="CMIP5", - ensemble="r1i1p1", - ) - - to_datasets._fix_cmip5_fx_ensemble(dataset) - - assert dataset["ensemble"] == "r0i0p0" - - def test_get_supplementary_short_names(monkeypatch): def _update_cmor_facets(facets): facets["modeling_realm"] = "atmos" From 77fd1e85eb03422817ec87046bfaa63d3dc17209 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 15:30:12 +0200 Subject: [PATCH 37/85] Make _derivation_necessary work with timerange globs --- esmvalcore/dataset.py | 6 ++++++ tests/unit/test_dataset.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index b53445bbf0..ea35d040e0 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -190,6 +190,12 @@ def _derivation_necessary(self) -> bool: # are found ds_copy = self.copy() ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + return not ds_copy.files def _get_input_datasets(self) -> list[Dataset]: diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a3467bdb6a..23a43336fa 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2835,6 +2835,15 @@ def test_derivation_necessary_no_force_derivation_no_files(): assert dataset._derivation_necessary() is True +def test_derivation_necessary_no_force_derivation_no_files_glob(): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + assert dataset._derivation_necessary() is True + + def test_derivation_necessary_no_force_derivation(tmp_path, session): dataset = Dataset(**OBS6_SAT_FACETS, 
short_name="lwcre", derive=True) dataset.session = session From 312fafa35935b39bce38717afd6537358ca22dca Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 16:24:41 +0200 Subject: [PATCH 38/85] Fix bug for non-derived variables --- esmvalcore/dataset.py | 8 ++++++-- tests/unit/test_dataset.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index ea35d040e0..39e2d84494 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -290,13 +290,17 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 """ datasets_found = False - # First, if no forced derivation is requested, search for datasets - # based on files from self + # If no forced derivation is requested, search for datasets based on + # files from self if not self._is_force_derived(): for dataset in self._get_available_datasets(self): datasets_found = True yield dataset + # For variables that cannot be derived, we are done here + if not self._is_derived(): + return + # If forced derivation is requested or no datasets based on files from # self have been found, search for datasets based on files from input # datasets diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 23a43336fa..6ad6d78d97 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1182,6 +1182,22 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +def test_from_files_no_files_glob(session): + dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas") + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + +def test_from_files_derived_no_files_glob(session): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + @pytest.fixture def lwcre_file(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" From e8c7bf2a56f3bd9d706ef0667379654069c6c7c1 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 16:25:28 +0200 Subject: [PATCH 39/85] Use new Dataset.from_files in code --- esmvalcore/_recipe/check.py | 21 ++++++++++------ esmvalcore/_recipe/to_datasets.py | 33 ++++++------------------- tests/integration/recipe/test_check.py | 32 +++++++++++------------- tests/integration/recipe/test_recipe.py | 20 +++++++-------- 4 files changed, 46 insertions(+), 60 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index e79db4f2ee..fdebefb6a1 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,7 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets, FacetValue + from esmvalcore.typing import FacetValue logger = logging.getLogger(__name__) @@ -467,15 +467,22 @@ def valid_time_selection(timerange: str) -> None: def differing_timeranges( - timeranges: set[FacetValue], - required_vars: list[Facets], + var_to_derive: FacetValue, + input_datasets: list[Dataset], ) -> None: - """Log error if required variables have differing timeranges.""" + """Log error if input datasets have differing timeranges.""" + timeranges: set[FacetValue] = set() + for input_dataset in input_datasets: + if "timerange" in input_dataset.facets: + timeranges.add(input_dataset.facets["timerange"]) if len(timeranges) > 1: + input_datasets_str = "; ".join( + d.summary(shorten=True) for d in input_datasets + 
) msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." + f"Differing timeranges with values {timeranges} found for " + f"datasets {input_datasets_str} necessary to derive " + f"'{var_to_derive}'. Set `timerange` to a common value." ) raise ValueError(msg) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index f6bf57fbe2..a5bf3d6427 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.esgf.facets import FACETS from esmvalcore.exceptions import RecipeError from esmvalcore.local import LocalFile, _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -495,39 +494,23 @@ def _report_unexpanded_globs( def _get_input_datasets(dataset: Dataset) -> list[Dataset]: """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets if not dataset._derivation_necessary(): # noqa: SLF001 - return [dataset] + return dataset.input_datasets - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + input_datasets: list[Dataset] = [] + for input_dataset in dataset.input_datasets: + if input_dataset.facets.get("optional") and not input_dataset.files: logger.info( "Skipping: no data found for %s which is marked as 'optional'", input_dataset, ) else: - datasets.append(input_dataset) + input_datasets.append(input_dataset) - # Check timeranges of available input data. 
- timeranges: set[FacetValue] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) - check.differing_timeranges(timeranges, required_vars) + check.differing_timeranges(dataset.facets["short_name"], input_datasets) - return datasets + return input_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 551603a446..bfa2097c30 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -274,25 +274,23 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, +def test_differing_timeranges(): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + } + input_datasets = [ + Dataset(**facets, short_name="rlut", timerange="1950/1952"), + Dataset(**facets, short_name="rlutcs", timerange="1951/1953"), + Dataset(**facets, short_name="rlut"), ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." - ) - assert expected_log in str(exc.value) + msg = r"Differing timeranges with values {'1950/1952', '1951/1953'}" + with pytest.raises(ValueError, match=msg): + check.differing_timeranges("lwcre", input_datasets) def test_data_availability_nonexistent(tmp_path): diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 8cf9384b39..7ab85581cb 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -126,11 +126,6 @@ def get_required(short_name, _): {"short_name": "areacella", "mip": "fx", "optional": True}, ] - monkeypatch.setattr( - esmvalcore._recipe.to_datasets, - "get_required", - get_required, - ) monkeypatch.setattr( esmvalcore.dataset, "get_required", @@ -2543,9 +2538,7 @@ def test_representative_dataset_derived_var( expected_facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2555,6 +2548,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2609,9 +2605,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2624,6 +2618,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + 
"force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2633,9 +2630,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2648,6 +2643,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session From 6cdd7141d9a9865a3dc3b1561abc7877d8fca6aa Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 16:39:13 +0200 Subject: [PATCH 40/85] Added test to check differing timeranges --- tests/integration/recipe/test_check.py | 2 +- tests/unit/recipe/test_to_datasets.py | 52 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index bfa2097c30..2c2f1ea745 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -288,7 +288,7 @@ def test_differing_timeranges(): Dataset(**facets, short_name="rlut"), ] - msg = r"Differing timeranges with values {'1950/1952', '1951/1953'}" + msg = r"Differing timeranges with values" with pytest.raises(ValueError, match=msg): check.differing_timeranges("lwcre", input_datasets) diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index f9d01881ca..2e560765f1 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,3 +1,4 @@ +import logging import textwrap from pathlib import Path @@ -302,6 +303,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def test_get_input_datasets_derive_optional(caplog, tmp_path, session): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "SImon", + "short_name": "siextent", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", + "derive": True, + } + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + sic_file = LocalFile( + input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + sic_file.touch() + + dataset = Dataset(**facets) + dataset.files = [] + dataset.session = session + + with caplog.at_level(logging.INFO): + datasets = to_datasets._get_input_datasets(dataset) + + expected = Dataset( + dataset="SAT", + project="OBS6", + mip="SImon", + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + optional="true", + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + tier=2, + timerange="1980/2000", + type="sat", + units="%", + ) + expected.session = session + + assert datasets == [expected] + + logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"] + assert "which is marked as 'optional'" in logger_infos[-1] + + def test_max_years(session): recipe_txt = textwrap.dedent(""" diagnostics: From 9057cf9d115e3978b949a9b740f046628db3f3e9 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 17:35:59 +0200 Subject: [PATCH 41/85] Make everything work with glob in timerange --- esmvalcore/dataset.py | 29 +++++++++++------------ tests/unit/test_dataset.py | 47 
+++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 39e2d84494..fc99e9c9e3 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -243,41 +243,42 @@ def input_datasets(self) -> list[Dataset]: self._input_datasets = input_datasets return input_datasets + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: esgf.ESGFFile | local.LocalFile, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. - unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = {"project", "mip", "short_name", "dataset"} if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) - return dataset + return new_dataset def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 """Yield datasets based on the available files. 
@@ -320,7 +321,7 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 updated_facets[key] = expanded_ds.facets[key] new_ds = self.copy() new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds.supplementaries = self.supplementaries all_datasets[-1].append((updated_facets, new_ds)) @@ -361,7 +362,7 @@ def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - new_dataset = self._file_to_dataset(file) + new_dataset = self._file_to_dataset(dataset, file) # Filter out identical datasets facetset = frozenset( @@ -378,7 +379,6 @@ def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: partially_defined.append((new_dataset, file)) else: new_dataset._update_timerange() # noqa: SLF001 - new_dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield new_dataset @@ -451,6 +451,7 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 + dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 6ad6d78d97..08e24414a2 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1188,9 +1188,10 @@ def test_from_files_no_files_glob(session): assert datasets == [dataset] -def test_from_files_derived_no_files_glob(session): +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_files_glob(timerange, session): dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1316,7 +1317,9 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): assert expected_input_dataset.files == [lwcre_file] +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) def test_from_files_with_derived_no_derivation_glob( + timerange, lwcre_file, lwcre_file_ground, pr_file, @@ -1324,7 +1327,7 @@ def test_from_files_with_derived_no_derivation_glob( ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1449,7 +1452,9 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): assert expected_input_datasets[1].files == [rlutcs_file] +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) def test_from_files_with_derived_glob( + timerange, rlut_file, rlut_file_ground, rlutcs_file, @@ -1459,7 +1464,7 @@ def test_from_files_with_derived_glob( ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1509,12 +1514,12 @@ def test_from_files_with_derived_glob( assert expected_input_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = ( - "Not all necessary input variables to derive 'lwcre' are available " - "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " - "{'type': 'ground'}" - ) - assert msg in log_debugs + msg = "Not all necessary input variables to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in 
log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") def test_from_files_with_derived_no_force_derivation( @@ -1568,7 +1573,9 @@ def test_from_files_with_derived_no_force_derivation( assert expected_input_dataset.files == [lwcre_file] -def test_from_files_with_derived_no_force_derivation_glob( +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 + timerange, lwcre_file, lwcre_file_ground, rlut_file, @@ -1579,7 +1586,7 @@ def test_from_files_with_derived_no_force_derivation_glob( ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1721,7 +1728,9 @@ def test_from_files_with_derived_force_derivation( assert expected_input_datasets[1].files == [rlutcs_file] +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 + timerange, lwcre_file, lwcre_file_ground, rlut_file, @@ -1733,7 +1742,7 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, force_derivation=True, @@ -1791,12 +1800,12 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert expected_input_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = ( - "Not all necessary input variables to derive 'lwcre' are available " - "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " - "{'type': 'ground'}" - ) - assert msg in log_debugs + msg = "Not all necessary input variables to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") def test_match(): From ebc82ba42f1e83e4800ebb0b7d6ebcf6f8ac094e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 17:44:36 +0200 Subject: [PATCH 42/85] Differing timeranges are caught by _get_all_available_datasets --- esmvalcore/_recipe/check.py | 22 ---------------------- esmvalcore/_recipe/to_datasets.py | 2 -- tests/integration/recipe/test_check.py | 19 ------------------- 3 files changed, 43 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index fdebefb6a1..aafd4a0e3a 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import FacetValue logger = logging.getLogger(__name__) @@ -466,27 +465,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(date, timerange_list) -def differing_timeranges( - var_to_derive: FacetValue, - input_datasets: list[Dataset], -) -> None: - """Log error if input datasets have differing timeranges.""" - timeranges: set[FacetValue] = set() - for input_dataset in input_datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) - if len(timeranges) > 1: - input_datasets_str = "; ".join( - d.summary(shorten=True) for d in input_datasets - ) - msg = ( - f"Differing timeranges with values {timeranges} found for " - f"datasets {input_datasets_str} necessary to 
derive " - f"'{var_to_derive}'. Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index a5bf3d6427..7619044107 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -508,8 +508,6 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: else: input_datasets.append(input_dataset) - check.differing_timeranges(dataset.facets["short_name"], input_datasets) - return input_datasets diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 2c2f1ea745..6aec456f80 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -274,25 +274,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(): - facets = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - } - input_datasets = [ - Dataset(**facets, short_name="rlut", timerange="1950/1952"), - Dataset(**facets, short_name="rlutcs", timerange="1951/1953"), - Dataset(**facets, short_name="rlut"), - ] - - msg = r"Differing timeranges with values" - with pytest.raises(ValueError, match=msg): - check.differing_timeranges("lwcre", input_datasets) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", From 58dd66645e33d1cbba99672b4eba1cff98a757e7 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:54:25 +0200 Subject: [PATCH 43/85] Use ABCs and other type hint suggestions from @bouweandela Co-authored-by: Bouwe Andela --- doc/develop/derivation.rst | 2 +- esmvalcore/_recipe/recipe.py | 22 +++++++++---------- esmvalcore/_recipe/to_datasets.py | 4 ++-- esmvalcore/local.py | 2 +- esmvalcore/preprocessor/__init__.py | 6 ++--- esmvalcore/preprocessor/_derive/__init__.py | 2 +- esmvalcore/preprocessor/_derive/_baseclass.py | 2 +- esmvalcore/preprocessor/_derive/ohc.py | 2 +- esmvalcore/preprocessor/_derive/vegfrac.py | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/develop/derivation.rst b/doc/develop/derivation.rst index c8516d8414..44ff9b8e2b 100644 --- a/doc/develop/derivation.rst +++ b/doc/develop/derivation.rst @@ -26,7 +26,7 @@ A typical example looks like this: """Derivation of variable `dummy`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" mip = "fx" if project == "CMIP6": diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e34c7fc52b..2992c7156e 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -104,7 +104,7 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str: def _update_target_levels( dataset: Dataset, - datasets: list[Dataset], + datasets: Sequence[Dataset], settings: dict[str, Any], ) -> None: """Replace the target levels dataset name with a filename if needed.""" @@ -142,7 +142,7 @@ def _update_target_levels( def _update_target_grid( dataset: Dataset, - datasets: list[Dataset], + datasets: Sequence[Dataset], settings: dict[str, Any], ) -> None: """Replace the target grid dataset name with a filename if needed.""" @@ -178,7 +178,7 @@ def _update_regrid_time(dataset: Dataset, settings: dict) -> None: settings["regrid_time"]["frequency"] = dataset.facets["frequency"] 
-def _select_dataset(dataset_name: str, datasets: list[Dataset]) -> Dataset: +def _select_dataset(dataset_name: str, datasets: Sequence[Dataset]) -> Dataset: for dataset in datasets: if dataset.facets["dataset"] == dataset_name: return dataset @@ -192,7 +192,7 @@ def _select_dataset(dataset_name: str, datasets: list[Dataset]) -> Dataset: def _limit_datasets( - datasets: list[Dataset], + datasets: Sequence[Dataset], profile: dict[str, Any], ) -> list[Dataset]: """Try to limit the number of datasets to max_datasets.""" @@ -321,7 +321,7 @@ def _add_to_download_list(dataset: Dataset) -> None: dataset.files[i] = file.local_file(dataset.session["download_dir"]) -def _schedule_for_download(datasets: list[Dataset]) -> None: +def _schedule_for_download(datasets: Iterable[Dataset]) -> None: """Schedule files for download.""" for dataset in datasets: _add_to_download_list(dataset) @@ -618,7 +618,7 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], - order: tuple[str, ...], + order: Sequence[str, ...], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. @@ -935,7 +935,7 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: ) @staticmethod - def _need_ncl(raw_diagnostics: dict[str, Any]) -> bool: + def _need_ncl(raw_diagnostics: dict[str, dict[str, Any]]) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -989,8 +989,8 @@ def _initialize_diagnostics( def _initialize_scripts( self, diagnostic_name: str, - raw_scripts: dict[str, Any], - variable_names: tuple[str, Any], + raw_scripts: dict[str, dict[str, Any]], + variable_names: Sequence[str], ) -> dict[str, Any]: """Define script in diagnostic.""" if not raw_scripts: @@ -1042,7 +1042,7 @@ def _initialize_scripts( def _resolve_diagnostic_ancestors( self, - tasks: Iterable[PreprocessingTask], + tasks: Iterable[BaseTask], ) -> None: """Resolve diagnostic ancestors.""" tasks = {t.name: t for t in tasks} @@ -1117,7 +1117,7 @@ def _update_with_ancestors(self, tasknames_to_run: set[str]) -> bool: def _create_diagnostic_tasks( self, diagnostic_name: str, - diagnostic: dict[str, Any], + diagnostic: Diagnostic, tasknames_to_run: set[str], ) -> list[BaseTask]: """Create diagnostic tasks.""" diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7cd17bdbb0..c25d4026ad 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -52,7 +52,7 @@ def _facet_to_str(facet_value: FacetValue | None) -> str: return str(facet_value) -def _set_alias(variables: list[list[Dataset]]) -> None: +def _set_alias(variables: Sequence[Sequence[Dataset]]) -> None: """Add unique alias for datasets. Generates a unique alias for each dataset that will be shared by all @@ -584,7 +584,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: datasets.append(input_dataset) # Check timeranges of available input data. 
- timeranges: set[FacetValue] = set() + timeranges: set[str] = set() for input_dataset in datasets: if "timerange" in input_dataset.facets: timeranges.add(input_dataset.facets["timerange"]) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 9b30df924b..d58d68a9b1 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -324,7 +324,7 @@ def _truncate_dates(date: str, file_date: str) -> tuple[int, int]: def _select_files( - filenames: list[LocalFile], + filenames: Iterable[LocalFile], timerange: FacetValue, ) -> list[LocalFile]: """Select files containing data between a given timerange. diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 3f275fb439..3147639ae0 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -364,7 +364,7 @@ def _run_preproc_function( function: Callable, items: Any, kwargs: Any, - input_files: list[File] | None = None, + input_files: Sequence[File] | None = None, ) -> Any: """Run preprocessor function.""" kwargs_str = ",\n".join( @@ -426,7 +426,7 @@ def _run_preproc_function( def preprocess( - items: list[PreprocessorFile | Cube | str | Path], + items: Sequence[PreprocessorFile | Cube | str | Path], step: str, input_files: list[File] | None = None, output_file: Path | None = None, @@ -484,7 +484,7 @@ def preprocess( def get_step_blocks( steps: Iterable[str], - order: list[str], + order: Sequence[str], ) -> list[list[str]]: """Group steps into execution blocks.""" blocks: list[list[str]] = [] diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cd209f88de..a27a809a21 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -41,7 +41,7 @@ def _get_all_derived_variables() -> dict[str, type[DerivedVariableBase]]: __all__ = list(ALL_DERIVED_VARIABLES) -def get_required(short_name: FacetValue, project: FacetValue) -> list[Facets]: +def get_required(short_name: str, project: str) -> list[Facets]: """Return all required variables for derivation. Get all information (at least ``short_name``) required for derivation. diff --git a/esmvalcore/preprocessor/_derive/_baseclass.py b/esmvalcore/preprocessor/_derive/_baseclass.py index 4e71f66dd6..b050921801 100644 --- a/esmvalcore/preprocessor/_derive/_baseclass.py +++ b/esmvalcore/preprocessor/_derive/_baseclass.py @@ -12,7 +12,7 @@ class DerivedVariableBase: @staticmethod @abstractmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Return required variables for derivation. 
This method needs to be overridden in the child class belonging to the diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index 87643d2d1b..1bd58c337f 100644 --- a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -16,7 +16,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `ohc`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" volcello: Facets = {"short_name": "volcello", "mip": "fx"} if project == "CMIP5": diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index edd4dce75d..007ab406f9 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -15,7 +15,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `vegFrac`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" sftlf: Facets = {"short_name": "sftlf", "mip": "fx"} if project == "CMIP5": From b1c66fdbf5ce3e2cbbc52489cf74ac7830f268b8 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:00:18 +0200 Subject: [PATCH 44/85] Ruff fixes --- esmvalcore/_recipe/recipe.py | 4 +- esmvalcore/_recipe/to_datasets.py | 2 +- esmvalcore/preprocessor/__init__.py | 44 ++++++++++--------- esmvalcore/preprocessor/_derive/__init__.py | 2 +- esmvalcore/preprocessor/_derive/_baseclass.py | 2 +- esmvalcore/preprocessor/_derive/ohc.py | 2 +- esmvalcore/preprocessor/_derive/vegfrac.py | 2 +- 7 files changed, 31 insertions(+), 27 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 2992c7156e..09cb8d850c 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -57,7 +57,7 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from esmvalcore.config import Session from esmvalcore.typing import Facets @@ -66,6 +66,8 @@ PreprocessorSettings = dict[str, Any] +Diagnostic = dict[str, Any] + DOWNLOAD_FILES = set() """Use a global variable to keep track of files that need to be downloaded.""" diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index c25d4026ad..d37b5a271f 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging -from collections.abc import Iterable, Iterator +from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy from numbers import Number from typing import TYPE_CHECKING, Any diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 3147639ae0..40b835b3ed 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -15,8 +15,7 @@ from esmvalcore._task import BaseTask from esmvalcore.cmor.check import cmor_check_data, cmor_check_metadata from esmvalcore.cmor.fix import fix_data, fix_file, fix_metadata - -from ._area import ( +from esmvalcore.preprocessor._area import ( area_statistics, extract_named_regions, extract_region, @@ -24,20 +23,20 @@ meridional_statistics, zonal_statistics, ) -from ._compare_with_refs import bias, distance_metric -from ._concatenate import concatenate -from ._cycles import amplitude -from ._dask_progress import 
_compute_with_progress -from ._derive import derive -from ._detrend import detrend -from ._io import ( +from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric +from esmvalcore.preprocessor._concatenate import concatenate +from esmvalcore.preprocessor._cycles import amplitude +from esmvalcore.preprocessor._dask_progress import _compute_with_progress +from esmvalcore.preprocessor._derive import derive +from esmvalcore.preprocessor._detrend import detrend +from esmvalcore.preprocessor._io import ( _get_debug_filename, _sort_products, load, save, write_metadata, ) -from ._mask import ( +from esmvalcore.preprocessor._mask import ( mask_above_threshold, mask_below_threshold, mask_fillvalues, @@ -48,21 +47,24 @@ mask_multimodel, mask_outside_range, ) -from ._multimodel import ensemble_statistics, multi_model_statistics -from ._other import clip, cumulative_sum, histogram -from ._regrid import ( +from esmvalcore.preprocessor._multimodel import ( + ensemble_statistics, + multi_model_statistics, +) +from esmvalcore.preprocessor._other import clip, cumulative_sum, histogram +from esmvalcore.preprocessor._regrid import ( extract_coordinate_points, extract_levels, extract_location, extract_point, regrid, ) -from ._rolling_window import rolling_window_statistics -from ._supplementary_vars import ( +from esmvalcore.preprocessor._rolling_window import rolling_window_statistics +from esmvalcore.preprocessor._supplementary_vars import ( add_supplementary_variables, remove_supplementary_variables, ) -from ._time import ( +from esmvalcore.preprocessor._time import ( annual_statistics, anomalies, climate_statistics, @@ -81,9 +83,9 @@ seasonal_statistics, timeseries_filter, ) -from ._trend import linear_trend, linear_trend_stderr -from ._units import accumulate_coordinate, convert_units -from ._volume import ( +from esmvalcore.preprocessor._trend import linear_trend, linear_trend_stderr +from esmvalcore.preprocessor._units import accumulate_coordinate, convert_units +from esmvalcore.preprocessor._volume import ( axis_statistics, depth_integration, extract_surface_from_atm, @@ -92,10 +94,10 @@ extract_volume, volume_statistics, ) -from ._weighting import weighting_landsea_fraction +from esmvalcore.preprocessor._weighting import weighting_landsea_fraction if TYPE_CHECKING: - from collections.abc import Callable, Iterable + from collections.abc import Callable, Iterable, Sequence from dask.delayed import Delayed diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index a27a809a21..d22eeb3a20 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -10,7 +10,7 @@ from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase from esmvalcore.preprocessor._units import convert_units -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets logger = logging.getLogger(__name__) diff --git a/esmvalcore/preprocessor/_derive/_baseclass.py b/esmvalcore/preprocessor/_derive/_baseclass.py index b050921801..b8d8bc27da 100644 --- a/esmvalcore/preprocessor/_derive/_baseclass.py +++ b/esmvalcore/preprocessor/_derive/_baseclass.py @@ -4,7 +4,7 @@ from iris.cube import Cube, CubeList -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets class DerivedVariableBase: diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index 1bd58c337f..d9105ffe52 100644 --- 
a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -5,7 +5,7 @@ from iris import Constraint from iris.cube import Cube, CubeList -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets from ._baseclass import DerivedVariableBase diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index 007ab406f9..419ae3878c 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -6,7 +6,7 @@ from iris.cube import Cube, CubeList from esmvalcore.preprocessor._regrid import regrid -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets from ._baseclass import DerivedVariableBase From 6be3169358f63a354a88b56b10fb82207ada2621 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:05:17 +0200 Subject: [PATCH 45/85] Use type aliases --- esmvalcore/_recipe/recipe.py | 64 ++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 09cb8d850c..f1c99dfe6e 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -64,9 +64,12 @@ logger = logging.getLogger(__name__) +Diagnostic = dict[str, Any] + PreprocessorSettings = dict[str, Any] -Diagnostic = dict[str, Any] +PreprocessorProfile = dict[str, dict[str, Any]] + DOWNLOAD_FILES = set() """Use a global variable to keep track of files that need to be downloaded.""" @@ -107,7 +110,7 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str: def _update_target_levels( dataset: Dataset, datasets: Sequence[Dataset], - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Replace the target levels dataset name with a filename if needed.""" levels = settings.get("extract_levels", {}).get("levels") @@ -145,7 +148,7 @@ def _update_target_levels( def _update_target_grid( dataset: Dataset, datasets: Sequence[Dataset], - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Replace the target grid dataset name with a filename if needed.""" grid = settings.get("regrid", {}).get("target_grid") @@ -195,7 +198,7 @@ def _select_dataset(dataset_name: str, datasets: Sequence[Dataset]) -> Dataset: def _limit_datasets( datasets: Sequence[Dataset], - profile: dict[str, Any], + profile: PreprocessorProfile, ) -> list[Dataset]: """Try to limit the number of datasets to max_datasets.""" max_datasets = datasets[0].session["max_datasets"] @@ -226,7 +229,7 @@ def _limit_datasets( return limited -def _get_default_settings(dataset: Dataset) -> dict[str, Any]: +def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: """Get default preprocessor settings.""" session = dataset.session facets = dataset.facets @@ -257,7 +260,7 @@ def _get_default_settings(dataset: Dataset) -> dict[str, Any]: def _add_dataset_specific_settings( dataset: Dataset, - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Add dataset-specific settings.""" project = dataset.facets["project"] @@ -287,7 +290,7 @@ def _add_dataset_specific_settings( def _exclude_dataset( - settings: dict[str, Any], + settings: PreprocessorSettings, facets: Facets, step: str, ) -> None: @@ -306,7 +309,7 @@ def _exclude_dataset( def _update_weighting_settings( - settings: dict[str, Any], + settings: PreprocessorSettings, facets: Facets, ) -> None: """Update settings for the weighting preprocessors.""" @@ -377,8 +380,8 @@ 
def _check_input_files(input_datasets: Iterable[Dataset]) -> set[str]: def _apply_preprocessor_profile( - settings: dict[str, Any], - profile_settings: dict[str, Any], + settings: PreprocessorSettings, + profile_settings: PreprocessorProfile, ) -> None: """Apply settings from preprocessor profile.""" profile_settings = deepcopy(profile_settings) @@ -396,7 +399,7 @@ def _apply_preprocessor_profile( def _get_common_attributes( products: set[PreprocessorFile], - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> dict[str, Any]: """Get common attributes for the output products.""" attributes: dict[str, Any] = {} @@ -455,7 +458,7 @@ def _get_downstream_settings( step: str, order: tuple[str, ...], products: set[PreprocessorFile], -) -> dict[str, Any]: +) -> PreprocessorSettings: """Get downstream preprocessor settings shared between products.""" settings = {} remaining_steps = order[order.index(step) + 1 :] @@ -471,7 +474,7 @@ def _get_downstream_settings( def _update_multi_dataset_settings( facets: Facets, - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Configure multi dataset statistics.""" for step in MULTI_MODEL_FUNCTIONS: @@ -500,7 +503,7 @@ def _update_multiproduct( order: tuple[str, ...], preproc_dir: Path, step: str, -) -> tuple[set[PreprocessorFile], dict[str, Any]]: +) -> tuple[set[PreprocessorFile], PreprocessorSettings]: """Return new products that are aggregated over multiple datasets. These new products will replace the original products at runtime. @@ -526,7 +529,7 @@ def _update_multiproduct( downstream_settings = _get_downstream_settings(step, order, multiproducts) - relevant_settings: dict[str, Any] = { + relevant_settings: PreprocessorSettings = { "output_products": defaultdict(dict), } # pass to ancestors @@ -570,7 +573,7 @@ def _update_multiproduct( def update_ancestors( ancestors: set[PreprocessorFile], step: str, - downstream_settings: dict[str, Any], + downstream_settings: PreprocessorSettings, ) -> None: """Retroactively add settings to ancestor products.""" for product in ancestors: @@ -580,7 +583,10 @@ def update_ancestors( settings[key] = value -def _update_extract_shape(settings: dict[str, Any], session: Session) -> None: +def _update_extract_shape( + settings: PreprocessorSettings, + session: Session, +) -> None: if "extract_shape" in settings: shapefile = settings["extract_shape"].get("shapefile") if shapefile: @@ -619,8 +625,8 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: def _get_preprocessor_products( datasets: list[Dataset], - profile: dict[str, Any], - order: Sequence[str, ...], + profile: PreprocessorProfile, + order: Sequence[str], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. 
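# --- Editor's note (illustration only, not part of this patch) ----------
# The aliases introduced above (`Diagnostic`, `PreprocessorSettings`,
# `PreprocessorProfile`) all expand to plain dicts, so they change
# readability, not runtime behaviour. A minimal sketch of the pattern,
# using hypothetical names:
#
#     from typing import Any
#
#     PreprocessorSettings = dict[str, Any]  # step name -> keyword args
#
#     def apply_settings(settings: PreprocessorSettings) -> None:
#         """The signature now states intent, not just dict[str, Any]."""
#         for step, kwargs in settings.items():
#             print(step, kwargs)
#
# A single alias definition also gives one place to tighten the type
# later (e.g. to a TypedDict) without touching every signature.
# -------------------------------------------------------------------------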
@@ -764,7 +770,7 @@ def _set_start_end_year(product: PreprocessorFile) -> None: def _update_preproc_functions( - settings: dict[str, Any], + settings: PreprocessorSettings, dataset: Dataset, datasets: list[Dataset], missing_vars: set[str], @@ -800,7 +806,7 @@ def _update_preproc_functions( def _get_preprocessor_task( datasets: list[Dataset], - profiles: dict[str, Any], + profiles: PreprocessorProfile, task_name: str, ) -> PreprocessingTask: """Create preprocessor task(s) for a set of datasets.""" @@ -852,7 +858,9 @@ def _get_preprocessor_task( return task -def _extract_preprocessor_order(profile: dict[str, Any]) -> tuple[str, ...]: +def _extract_preprocessor_order( + profile: PreprocessorProfile, +) -> tuple[str, ...]: """Extract the order of the preprocessing steps from the profile.""" custom_order = profile.pop("custom_order", False) if not custom_order: @@ -937,7 +945,7 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: ) @staticmethod - def _need_ncl(raw_diagnostics: dict[str, dict[str, Any]]) -> bool: + def _need_ncl(raw_diagnostics: Diagnostic) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -960,8 +968,8 @@ def _initialize_provenance(self, raw_documentation: dict[str, Any]): def _initialize_diagnostics( self, - raw_diagnostics: dict[str, Any], - ) -> dict[str, Any]: + raw_diagnostics: Diagnostic, + ) -> Diagnostic: """Define diagnostics in recipe.""" logger.debug("Retrieving diagnostics from recipe") check.diagnostics(raw_diagnostics) @@ -969,7 +977,7 @@ def _initialize_diagnostics( diagnostics = {} for name, raw_diagnostic in raw_diagnostics.items(): - diagnostic: dict[str, Any] = {} + diagnostic: Diagnostic = {} diagnostic["name"] = name diagnostic["datasets"] = [ ds for ds in self.datasets if ds.facets["diagnostic"] == name @@ -1155,7 +1163,7 @@ def _create_diagnostic_tasks( def _create_preprocessor_tasks( self, diagnostic_name: str, - diagnostic: dict[str, Any], + diagnostic: Diagnostic, tasknames_to_run: set[str], any_diag_script_is_run: bool, ) -> tuple[list[BaseTask], list[RecipeError]]: @@ -1321,7 +1329,7 @@ def get_output(self) -> dict[str, Any]: Returns ------- - product_filenames : dict + dict Lists of products/attributes grouped by task. 
""" output: dict[str, Any] = {} From 5744b0d9b4a1774f6203881263a925253f742e66 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 19:09:19 +0200 Subject: [PATCH 46/85] Do not change minimal facets Co-authored-by: Bouwe Andela --- esmvalcore/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 64dd892096..dc211b6668 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -577,7 +577,7 @@ def __getitem__(self, key: str) -> FacetValue: def __setitem__(self, key: str, value: FacetValue) -> None: """Set a facet value.""" - self.set_facet(key, value, persist=False) + self.facets[key] = value def set_facet( self, From cbcf37b4c8e4d07fa26fd3e588783c22f99a9086 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:16:53 +0200 Subject: [PATCH 47/85] Used more type aliases --- esmvalcore/preprocessor/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 40b835b3ed..5ded4bbbd9 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -364,10 +364,10 @@ def _get_multi_model_settings( def _run_preproc_function( function: Callable, - items: Any, + items: PreprocessorItem | Iterable[PreprocessorItem], kwargs: Any, input_files: Sequence[File] | None = None, -) -> Any: +) -> PreprocessorItem | Iterable[PreprocessorItem]: """Run preprocessor function.""" kwargs_str = ",\n".join( [f"{k} = {pformat(v)}" for (k, v) in kwargs.items()], @@ -428,13 +428,13 @@ def _run_preproc_function( def preprocess( - items: Sequence[PreprocessorFile | Cube | str | Path], + items: Sequence[PreprocessorItem], step: str, input_files: list[File] | None = None, output_file: Path | None = None, debug: bool = False, **settings: Any, -) -> list[PreprocessorFile | Cube | str | Path]: +) -> list[PreprocessorItem]: """Run preprocessor.""" logger.debug("Running preprocessor step %s", step) function = globals()[step] @@ -656,6 +656,9 @@ def group(self, keys: list) -> str: return "_".join(identifier) +PreprocessorItem = PreprocessorFile | Cube | str | Path + + def _apply_multimodel( products: set[PreprocessorFile], step: str, From 14e8b5ed602dab8996f79c995a779ef103254fd6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 19:19:37 +0200 Subject: [PATCH 48/85] Fix typo in func name Co-authored-by: Bouwe Andela --- tests/integration/recipe/test_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index b127bcc3e7..5bd6ad47dc 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1543,7 +1543,7 @@ def test_diagnostic_task_provenance( assert os.path.exists(prefix + ".xml") -def test_invalid_diagnostcic_ancestor( +def test_invalid_diagnostic_ancestor( tmp_path, patched_datafinder, session, From ecbecc6e3fec487238daeae25e3d65c09f0e37fd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:32:05 +0200 Subject: [PATCH 49/85] Make mypy happy --- esmvalcore/_recipe/check.py | 4 ++-- esmvalcore/_recipe/recipe.py | 4 ++-- esmvalcore/_recipe/to_datasets.py | 4 ++-- esmvalcore/local.py | 4 ++-- esmvalcore/preprocessor/__init__.py | 8 ++++---- esmvalcore/preprocessor/_derive/__init__.py | 1 - 
esmvalcore/preprocessor/_derive/qep.py | 4 ++-- 7 files changed, 14 insertions(+), 15 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index e79db4f2ee..a33868da74 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,7 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets, FacetValue + from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -467,7 +467,7 @@ def valid_time_selection(timerange: str) -> None: def differing_timeranges( - timeranges: set[FacetValue], + timeranges: set[str], required_vars: list[Facets], ) -> None: """Log error if required variables have differing timeranges.""" diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index f1c99dfe6e..392b130708 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -203,7 +203,7 @@ def _limit_datasets( """Try to limit the number of datasets to max_datasets.""" max_datasets = datasets[0].session["max_datasets"] if not max_datasets: - return datasets + return list(datasets) logger.info("Limiting the number of datasets to %s", max_datasets) @@ -626,7 +626,7 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: def _get_preprocessor_products( datasets: list[Dataset], profile: PreprocessorProfile, - order: Sequence[str], + order: tuple[str, ...], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index d37b5a271f..7aab83719b 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -563,7 +563,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: # Configure input datasets needed to derive variable datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) + required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore # idea: add option to specify facets in list of dicts that is value of # 'derive' in the recipe and use that instead of get_required? 
for input_facets in required_vars: @@ -587,7 +587,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: timeranges: set[str] = set() for input_dataset in datasets: if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) + timeranges.add(input_dataset.facets["timerange"]) # type: ignore check.differing_timeranges(timeranges, required_vars) return datasets diff --git a/esmvalcore/local.py b/esmvalcore/local.py index d58d68a9b1..221e2796f5 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -338,9 +338,9 @@ def _select_files( timerange = str(timerange) if "*" in timerange: # TODO: support * combined with a period - return filenames + return list(filenames) - selection = [] + selection: list[LocalFile] = [] for filename in filenames: start: int | str diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 5ded4bbbd9..f043b7ef91 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -7,7 +7,7 @@ import logging from pathlib import Path from pprint import pformat -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypeAlias from iris.cube import Cube, CubeList @@ -364,10 +364,10 @@ def _get_multi_model_settings( def _run_preproc_function( function: Callable, - items: PreprocessorItem | Iterable[PreprocessorItem], + items: PreprocessorItem | Sequence[PreprocessorItem], kwargs: Any, input_files: Sequence[File] | None = None, -) -> PreprocessorItem | Iterable[PreprocessorItem]: +) -> PreprocessorItem | Sequence[PreprocessorItem]: """Run preprocessor function.""" kwargs_str = ",\n".join( [f"{k} = {pformat(v)}" for (k, v) in kwargs.items()], @@ -656,7 +656,7 @@ def group(self, keys: list) -> str: return "_".join(identifier) -PreprocessorItem = PreprocessorFile | Cube | str | Path +PreprocessorItem: TypeAlias = PreprocessorFile | Cube | str | Path def _apply_multimodel( diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index d22eeb3a20..cbf138e2d7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -59,7 +59,6 @@ def get_required(short_name: str, project: str) -> list[Facets]: List of facets (including at least the key ``short_name``). 
""" - short_name = str(short_name) if short_name.lower() not in ALL_DERIVED_VARIABLES: msg = ( f"Cannot derive variable '{short_name}': no derivation script " diff --git a/esmvalcore/preprocessor/_derive/qep.py b/esmvalcore/preprocessor/_derive/qep.py index 19d677f618..9f684cde5d 100644 --- a/esmvalcore/preprocessor/_derive/qep.py +++ b/esmvalcore/preprocessor/_derive/qep.py @@ -3,7 +3,7 @@ from iris import Constraint from iris.cube import Cube, CubeList -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets from ._baseclass import DerivedVariableBase @@ -12,7 +12,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `qep`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" return [ {"short_name": "evspsbl"}, From d7c73aa172b3a91d172080d60f55bb5467263531 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:42:51 +0200 Subject: [PATCH 50/85] Use type aliases in regrid.py --- esmvalcore/preprocessor/_regrid.py | 52 ++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 62b1b27e83..84ac2a4f4e 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -12,7 +12,7 @@ from copy import deepcopy from decimal import Decimal from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import dask.array as da import iris @@ -79,12 +79,23 @@ _LON_MAX = 360.0 _LON_RANGE = _LON_MAX - _LON_MIN +NamedPointInterpolationScheme = Literal[ + "linear", + "nearest", +] + # Supported point interpolation schemes. POINT_INTERPOLATION_SCHEMES = { "linear": Linear(extrapolation_mode="mask"), "nearest": Nearest(extrapolation_mode="mask"), } +NamedHorizontalScheme = Literal[ + "area_weighted", + "linear", + "nearest", +] + # Supported horizontal regridding schemes for regular grids (= rectilinear # grids; i.e., grids that can be described with 1D latitude and 1D longitude # coordinates orthogonal to each other) @@ -119,8 +130,15 @@ "nearest": UnstructuredNearest(), } +NamedVerticalScheme = Literal[ + "linear", + "nearest", + "linear_extrapolate", + "nearest_extrapolate", +] + # Supported vertical interpolation schemes. -VERTICAL_SCHEMES: tuple[str, ...] = ( +VERTICAL_SCHEMES: tuple[NamedVerticalScheme, ...] = ( "linear", "nearest", "linear_extrapolate", @@ -386,7 +404,11 @@ def add_bounds_from_step(coord: Coord, step: float) -> np.ndarray: return cube -def extract_location(cube: Cube, location: str, scheme: str) -> Cube: +def extract_location( + cube: Cube, + location: str, + scheme: NamedPointInterpolationScheme, +) -> Cube: """Extract a point using a location name, with interpolation. Extracts a single location point from a cube, according @@ -477,7 +499,7 @@ def extract_point( cube: Cube, latitude: ArrayLike, longitude: ArrayLike, - scheme: str, + scheme: NamedPointInterpolationScheme, ) -> Cube: """Extract a point, with interpolation. 
@@ -604,7 +626,11 @@ def _get_target_grid_cube( return target_grid_cube -def _load_scheme(src_cube: Cube, tgt_cube: Cube, scheme: str | dict): +def _load_scheme( + src_cube: Cube, + tgt_cube: Cube, + scheme: NamedHorizontalScheme | dict[str, Any], +): """Return scheme that can be used in :meth:`iris.cube.Cube.regrid`.""" loaded_scheme: Any = None @@ -637,7 +663,7 @@ def _load_scheme(src_cube: Cube, tgt_cube: Cube, scheme: str | dict): return loaded_scheme -def _load_generic_scheme(scheme: dict): +def _load_generic_scheme(scheme: dict[str, Any]): """Load generic regridding scheme.""" scheme = dict(scheme) # do not overwrite original scheme @@ -677,7 +703,7 @@ def _load_generic_scheme(scheme: dict): def _get_regridder( src_cube: Cube, tgt_cube: Cube, - scheme: str | dict, + scheme: NamedHorizontalScheme | dict, cache_weights: bool, ): """Get regridder to actually perform regridding. @@ -731,7 +757,7 @@ def _get_coord_key(src_cube: Cube, tgt_cube: Cube) -> tuple[ArrayLike, ...]: def _get_name_and_shape_key( src_cube: Cube, tgt_cube: Cube, - scheme: str | dict, + scheme: NamedHorizontalScheme | dict, ) -> tuple[str, tuple[int, ...]]: """Get dict key from scheme name and coordinate shapes.""" name = str(scheme) @@ -743,7 +769,7 @@ def _get_name_and_shape_key( def regrid( cube: Cube, target_grid: Cube | Dataset | Path | str | dict, - scheme: str | dict, + scheme: NamedHorizontalScheme | dict, lat_offset: bool = True, lon_offset: bool = True, cache_weights: bool = False, @@ -888,7 +914,7 @@ def regrid( # Load scheme and reuse existing regridder if possible if isinstance(scheme, str): - scheme = scheme.lower() + scheme = scheme.lower() # type: ignore regridder = _get_regridder(cube, target_grid_cube, scheme, cache_weights) # Rechunk and actually perform the regridding @@ -1181,7 +1207,7 @@ def _preserve_fx_vars(cube: iris.cube.Cube, result: iris.cube.Cube) -> None: add_ancillary_variable(result, ancillary_cube) -def parse_vertical_scheme(scheme: str) -> tuple[str, str]: +def parse_vertical_scheme(scheme: NamedVerticalScheme) -> tuple[str, str]: """Parse the scheme provided for level extraction. Parameters @@ -1224,7 +1250,7 @@ def parse_vertical_scheme(scheme: str) -> tuple[str, str]: def extract_levels( cube: iris.cube.Cube, levels: np.typing.ArrayLike | da.Array, - scheme: str, + scheme: NamedVerticalScheme, coordinate: str | None = None, rtol: float = 1e-7, atol: float | None = None, @@ -1422,7 +1448,7 @@ def get_reference_levels(dataset: Dataset) -> list[float]: def extract_coordinate_points( cube: Cube, definition: dict[str, ArrayLike], - scheme: str, + scheme: NamedPointInterpolationScheme, ) -> Cube: """Extract points from any coordinate with interpolation. From 69e05029bca9d467e4f6b401ef8e506e652df23e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:45:25 +0200 Subject: [PATCH 51/85] Valid return type in docstring --- esmvalcore/preprocessor/_regrid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 84ac2a4f4e..556949810d 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -342,10 +342,9 @@ def _spec_to_latlonvals( Returns ------- - xvals: np.array - List of longitudes - yvals: np.array - List of latitudes + tuple[np.ndarray, np.ndarray] + Longitudes, Latitudes. + """ if step_latitude == 0: msg = f"Latitude step cannot be 0, got step_latitude={step_latitude}." 
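Editor's note: the two patches above lean on `typing.Literal` aliases for
scheme names. The following self-contained sketch (the alias and function
names are invented for illustration, not ESMValCore API) shows what this
buys: mypy rejects misspelled scheme names at type-check time, while
generic schemes can still be passed as plain dicts via a union such as
`NamedHorizontalScheme | dict[str, Any]`.

    from typing import Literal

    # Invented alias mirroring the NamedVerticalScheme pattern above.
    VerticalScheme = Literal[
        "linear",
        "nearest",
        "linear_extrapolate",
        "nearest_extrapolate",
    ]


    def describe_scheme(scheme: VerticalScheme) -> str:
        """Return a human-readable description of a vertical scheme."""
        # mypy narrows `scheme` to the four allowed strings, so this
        # string handling is known to be exhaustive.
        extrapolate = scheme.endswith("_extrapolate")
        base = scheme.removesuffix("_extrapolate")
        suffix = "with" if extrapolate else "without"
        return f"{base} interpolation ({suffix} extrapolation)"


    describe_scheme("linear")  # OK
    # describe_scheme("liner")  # rejected by mypy: invalid literal

At runtime nothing is enforced (a `Literal` is erased like any other
annotation), which is why the patches keep runtime handling such as the
`scheme.lower()  # type: ignore` cast and the dict-based generic scheme
path.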
From 6eedca2cef8e2dfb157ffff3e63b51c6251043cb Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:46:20 +0200 Subject: [PATCH 52/85] Avoid Coord --- esmvalcore/preprocessor/_regrid.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 556949810d..262283fb1f 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -50,7 +50,6 @@ if TYPE_CHECKING: from collections.abc import Iterable - from iris.coords import Coord from numpy.typing import ArrayLike from esmvalcore.dataset import Dataset @@ -391,7 +390,10 @@ def _regional_stock_cube(spec: dict[str, Any]) -> Cube: circular=True, ) - def add_bounds_from_step(coord: Coord, step: float) -> np.ndarray: + def add_bounds_from_step( + coord: iris.coords.DimCoord | iris.coords.AuxCoord, + step: float, + ) -> np.ndarray: """Calculate bounds from the given step.""" bound = step / 2 points = coord.points From 62c1996a634373a158bcacb0d84c394a5073f557 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:49:44 +0200 Subject: [PATCH 53/85] Correct type hint --- esmvalcore/preprocessor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index f043b7ef91..86552814a5 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -568,7 +568,7 @@ def apply(self, step: str, debug: bool = False) -> None: ) @property - def cubes(self) -> CubeList: + def cubes(self) -> list[Cube]: """Cubes.""" if self._cubes is None: self._cubes = [ds.load() for ds in self.datasets] # type: ignore From 8f2f1795c1a577ee5399ceb894371eee4b3a3662 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:53:35 +0200 Subject: [PATCH 54/85] Assign new variable for new type --- esmvalcore/local.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 221e2796f5..cf71097439 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -343,16 +343,12 @@ def _select_files( selection: list[LocalFile] = [] for filename in filenames: - start: int | str - end: int | str - start_date: int | str - end_date: int | str start_date, end_date = _parse_period(timerange) start, end = _get_start_end_date(filename) - start_date, end = _truncate_dates(start_date, end) - end_date, start = _truncate_dates(end_date, start) - if start <= end_date and end >= start_date: + start_date_int, end_int = _truncate_dates(start_date, end) + end_date_int, start_int = _truncate_dates(end_date, start) + if start_int <= end_date_int and end_int >= start_date_int: selection.append(filename) return selection From 7bc1bee8c02cea833954b01ba96c28cab0cdc3dc Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 19:54:46 +0200 Subject: [PATCH 55/85] Raise error for invalid type Co-authored-by: Bouwe Andela --- esmvalcore/local.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index cf71097439..967caa47b8 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -252,7 +252,9 @@ def _parse_period(timerange: FacetValue) -> tuple[str, str]: reference point in order to compute the start and end dates needed for file selection. 
""" - timerange = str(timerange) + if not isinstance(timerange, str): + msg = f"`timerange` should be a `str`, got '{type(timerange)}'" + raise TypeError(msg) start_date: str | None = None end_date: str | None = None time_format = None From 62067fca3242c2d16410d9e8d406d6c09ae4ba2f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:58:47 +0200 Subject: [PATCH 56/85] Fail if invalid types given --- esmvalcore/local.py | 6 ++++-- tests/integration/test_local.py | 7 +++++++ tests/unit/local/test_select_files.py | 6 ++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 967caa47b8..60d56d6a89 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -253,7 +253,7 @@ def _parse_period(timerange: FacetValue) -> tuple[str, str]: for file selection. """ if not isinstance(timerange, str): - msg = f"`timerange` should be a `str`, got '{type(timerange)}'" + msg = f"`timerange` should be a `str`, got {type(timerange)}" raise TypeError(msg) start_date: str | None = None end_date: str | None = None @@ -337,7 +337,9 @@ def _select_files( Otherwise, the file selection occurs taking into account the time resolution of the file. """ - timerange = str(timerange) + if not isinstance(timerange, str): + msg = f"`timerange` should be a `str`, got {type(timerange)}" + raise TypeError(msg) if "*" in timerange: # TODO: support * combined with a period return list(filenames) diff --git a/tests/integration/test_local.py b/tests/integration/test_local.py index e2dae85dff..633d7b45da 100644 --- a/tests/integration/test_local.py +++ b/tests/integration/test_local.py @@ -11,6 +11,7 @@ from esmvalcore.local import ( LocalFile, _get_output_file, + _parse_period, _select_drs, find_files, ) @@ -138,3 +139,9 @@ def test_select_invalid_drs_structure(): ) with pytest.raises(KeyError, match=msg): _select_drs("input_dir", "CMIP6", "_INVALID_STRUCTURE_") + + +def test_parse_period_invalid_timerange_type(): + msg = r"`timerange` should be a `str`, got " + with pytest.raises(TypeError, match=msg): + _parse_period(1) diff --git a/tests/unit/local/test_select_files.py b/tests/unit/local/test_select_files.py index 377d05421c..7ecc571ab2 100644 --- a/tests/unit/local/test_select_files.py +++ b/tests/unit/local/test_select_files.py @@ -170,3 +170,9 @@ def test_select_files_varying_format(): assert result_yearly == files assert result_monthly == files[0:2] assert result_daily == [files[0]] + + +def test_select_files_invalid_timerange_type(): + msg = r"`timerange` should be a `str`, got " + with pytest.raises(TypeError, match=msg): + _select_files([], 1) From b12df84927837228dd0acf482898d4f9cbb4c903 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 20:01:49 +0200 Subject: [PATCH 57/85] Restore _pattern_filter --- esmvalcore/dataset.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index dc211b6668..bc5998d58a 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -666,16 +666,9 @@ def augment_facets(self) -> None: supplementary._augment_facets() # noqa: SLF001 @staticmethod - def _pattern_filter( - patterns: Iterable[FacetValue], - name: FacetValue, - ) -> list[str]: + def _pattern_filter(patterns: Iterable[str], name) -> list[str]: """Get the subset of the list `patterns` that `name` matches.""" - return [ - str(pat) - for pat in patterns - if fnmatch.fnmatchcase(str(name), str(pat)) - ] + return [pat for pat in patterns if 
fnmatch.fnmatchcase(name, pat)]
 
     def _get_extra_facets(self) -> dict[str, Any]:
         """Get extra facets of dataset."""
 
From 22ab6e72a7404f697d0574d7471193fca7ba678e Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 20:12:47 +0200
Subject: [PATCH 58/85] Better _special_name_to_dataset

---
 esmvalcore/_recipe/recipe.py     | 10 ++++++++++
 tests/unit/recipe/test_recipe.py | 16 ++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
index 392b130708..027b2472dd 100644
--- a/esmvalcore/_recipe/recipe.py
+++ b/esmvalcore/_recipe/recipe.py
@@ -102,6 +102,16 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str:
             )
         )
         raise RecipeError(msg)
+
+    if not isinstance(facets[special_name], str):
+        msg = (
+            f"Preprocessor '{facets['preprocessor']}' uses "
+            f"'{special_name}', but '{special_name}' is not a `str` for "
+            f"variable '{facets['variable_group']}' of diagnostic "
+            f"'{facets['diagnostic']}', got '{facets[special_name]}' "
+            f"({type(facets[special_name])})"
+        )
+        raise RecipeError(msg)
     special_name = str(facets[special_name])
 
     return special_name

diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py
index 06299781ad..a3678924e8 100644
--- a/tests/unit/recipe/test_recipe.py
+++ b/tests/unit/recipe/test_recipe.py
@@ -987,3 +987,19 @@ def test_update_extract_shape_rel_shapefile(shapefile, session, tmp_path):
         / "ar6.shp"
     )
     assert settings["extract_shape"]["shapefile"] == ar6_file
+
+
+def test_special_name_to_dataset_invalid_special_name_type():
+    facets = {
+        "preprocessor": "preproc",
+        "variable_group": "var",
+        "diagnostic": "diag",
+        "reference_dataset": 1,
+    }
+    msg = (
+        r"Preprocessor 'preproc' uses 'reference_dataset', but "
+        r"'reference_dataset' is not a `str` for variable 'var' of diagnostic "
+        r"'diag', got '1' \(<class 'int'>\)"
+    )
+    with pytest.raises(RecipeError, match=msg):
+        _recipe._special_name_to_dataset(facets, "reference_dataset")

From 36724efd0f77a503329e90fbeb352ac0db4bc5b3 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 20:14:18 +0200
Subject: [PATCH 59/85] Do not cast to str

---
 esmvalcore/_recipe/recipe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
index 027b2472dd..64b3c1c5bb 100644
--- a/esmvalcore/_recipe/recipe.py
+++ b/esmvalcore/_recipe/recipe.py
@@ -233,7 +233,7 @@ def _limit_datasets(
 
     logger.info(
         "Only considering %s",
-        ", ".join(str(d.facets["alias"]) for d in limited),
+        ", ".join(d.facets["alias"] for d in limited),  # type: ignore
     )
 
     return limited

From 6ad2fefaae5db5e30cf61ddd22901ac98fb386da Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 20:18:05 +0200
Subject: [PATCH 60/85] Use int variables

---
 esmvalcore/_recipe/recipe.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
index 64b3c1c5bb..c749f4aff1 100644
--- a/esmvalcore/_recipe/recipe.py
+++ b/esmvalcore/_recipe/recipe.py
@@ -427,32 +427,31 @@ def _get_common_attributes(
         if "timerange" not in product.attributes:
             continue
         timerange = product.attributes["timerange"]
-        start: int | str
-        end: int | str
         start, end = _parse_period(timerange)
         if "timerange" not in attributes:
             attributes["timerange"] = _dates_to_timerange(start, end)
         else:
-            start_date: int | str
-            end_date: int | str
             start_date, end_date = _parse_period(attributes["timerange"])
-            start_date, start = 
_truncate_dates(start_date, start) - end_date, end = _truncate_dates(end_date, end) + start_date_int, start_int = _truncate_dates(start_date, start) + end_date_int, end_int = _truncate_dates(end_date, end) # If "span=overlap", always use the latest start_date and the # earliest end_date if span == "overlap": - start_date = max([start, start_date]) - end_date = min([end, end_date]) + start_date_int = max([start_int, start_date_int]) + end_date_int = min([end_int, end_date_int]) # If "span=full", always use the earliest start_date and the latest # end_date. Note: span can only take the values "overlap" or "full" # (this is checked earlier). else: - start_date = min([start, start_date]) - end_date = max([end, end_date]) + start_date_int = min([start_int, start_date_int]) + end_date_int = max([end_int, end_date_int]) - attributes["timerange"] = _dates_to_timerange(start_date, end_date) + attributes["timerange"] = _dates_to_timerange( + start_date_int, + end_date_int, + ) # Ensure that attributes start_year and end_year are always available if at # least one of the input datasets defines it From 74983d52ea3d6c59a3dfc81283c2939aa2f66d7f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 17:48:06 +0200 Subject: [PATCH 61/85] Add doc --- esmvalcore/dataset.py | 9 ++++-- esmvalcore/preprocessor/_derive/__init__.py | 36 +++++++++++++++++++-- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 6fd81d9147..c8969c0436 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -103,10 +103,12 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. + input_datasets: list[Dataset] + Input datasets. """ _SUMMARY_FACETS: tuple[str, ...] = ( @@ -406,7 +408,6 @@ def from_files(self) -> Iterator[Dataset]: The facet values for local files are retrieved from the directory tree where the directories represent the facets values. - Reading facet values from file names is not yet supported. See :ref:`CMOR-DRS` for more information on this kind of file organization. @@ -424,6 +425,10 @@ def from_files(self) -> Iterator[Dataset]: Supplementary datasets will in inherit the facet values from the main dataset for those facets listed in :obj:`INHERITED_FACETS`. + This also works for :ref:`derived variables `. The + input datasets that are can be used for derivation are available via + :attr:`Dataset.input_datasets`. + Examples -------- See :ref:`/notebooks/discovering-data.ipynb` for example use cases. diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cbf138e2d7..290e29c84e 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -2,6 +2,7 @@ import importlib import logging +from collections.abc import Sequence from copy import deepcopy from pathlib import Path @@ -70,7 +71,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -81,8 +82,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -96,6 +96,36 @@ def derive( ------- iris.cube.Cube The new derived variable. 
+ + Examples + -------- + Input variables for derivation can be obtained via + :attr:`Dataset.input_datasets`. + + To derive the longwave cloud radiative effect (LWCRE) for the model CESM2, + you can use: + + >>> from esmvalcore.dataset import Dataset + from esmv>>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.input_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... ) # doctest: +SKIP + """ if short_name == cubes[0].var_name: return cubes[0] From acaf9fdb75bf734f32dc51f56555418a8255db65 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:08:41 +0200 Subject: [PATCH 62/85] Expand notebook --- notebooks/discovering-data.ipynb | 243 ++++++++++++++++++++++++++++++- 1 file changed, 236 insertions(+), 7 deletions(-) diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..9078c2523e 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 727 datasets, showing the first 10:\n" ] }, { @@ -168,20 +168,20 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'AWI'},\n", " Dataset:\n", " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", - " 'ensemble': 'r2i1p1f1',\n", + " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", " 'institute': 'BCC'}]" @@ -253,7 +253,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]" ] }, "execution_count": 6, @@ -282,7 +282,7 @@ { "data": { "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" ] }, "execution_count": 7, @@ -312,6 +312,235 @@ "source": [ "download(dataset.files, CFG[\"download_dir\"])" ] + }, + { + "cell_type": "markdown", + "id": "d3006d90", + "metadata": {}, + "source": [ + "`Dataset.from_files` can also handle derived variables properly:" + ] + }, + { + 
"cell_type": "code", + "execution_count": 9, + "id": "b75314e3", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 36 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'TaiESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AS-RCEC'},\n", + " Dataset:\n", + " {'dataset': 'AWI-CM-1-1-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAMS-CSM1-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAMS'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 
'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'IITM-ESM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CCCR-IITM'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets will be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "rlutcs\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + ] + } + ], + "source": [ + "for d in dataset.input_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" + ] } ], "metadata": { From f6e531b867e9cae9b9d8a1d467b366f99de7412e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:13:36 +0200 Subject: [PATCH 63/85] Fix doc build --- esmvalcore/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index c8969c0436..291a0b16df 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -107,8 +107,6 @@ class Dataset: List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. - input_datasets: list[Dataset] - Input datasets. """ _SUMMARY_FACETS: tuple[str, ...] = ( From 30b6f537881190381161fdc98f5708adfd82b9b2 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:24:32 +0200 Subject: [PATCH 64/85] Update doc --- esmvalcore/dataset.py | 4 +++- notebooks/discovering-data.ipynb | 18 +++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 291a0b16df..5fc24adb53 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -231,6 +231,8 @@ def input_datasets(self) -> list[Dataset]: Derivation is necessary if the facet ``force_derivation=True`` is set or no files for the dataset itself are available. + See also :func:`esmvalcore.preprocessor.derive` for an example usage. 
+ """ if self._input_datasets: return self._input_datasets @@ -424,7 +426,7 @@ def from_files(self) -> Iterator[Dataset]: dataset for those facets listed in :obj:`INHERITED_FACETS`. This also works for :ref:`derived variables `. The - input datasets that are can be used for derivation are available via + input datasets that are necessary for derivation can be accessed via :attr:`Dataset.input_datasets`. Examples diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index 9078c2523e..581e8ca249 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -342,14 +342,6 @@ ")" ] }, - { - "cell_type": "markdown", - "id": "18e3a0b7", - "metadata": {}, - "source": [ - "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." - ] - }, { "cell_type": "code", "execution_count": 10, @@ -489,12 +481,20 @@ "datasets[:10]" ] }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, { "cell_type": "markdown", "id": "f00a886f", "metadata": {}, "source": [ - "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets will be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" ] }, { From 1cdfef2f9e04882ea76c8c032aef4bce2e300e61 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:27:14 +0200 Subject: [PATCH 65/85] Better derivation example --- esmvalcore/preprocessor/_derive/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 290e29c84e..5c14367dd6 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -100,13 +100,13 @@ def derive( Examples -------- Input variables for derivation can be obtained via - :attr:`Dataset.input_datasets`. + :attr:`esmvalcore.dataset.Dataset.input_datasets`. - To derive the longwave cloud radiative effect (LWCRE) for the model CESM2, - you can use: + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: >>> from esmvalcore.dataset import Dataset - from esmv>>> from esmvalcore.preprocessor import derive + >>> from esmvalcore.preprocessor import derive >>> dataset = Dataset( ... project="CMIP6", ... dataset="CESM2", @@ -124,7 +124,9 @@ def derive( ... short_name="lwcre", ... long_name="TOA Longwave Cloud Radiative Effect", ... units="W m-2", - ... ) # doctest: +SKIP + ... 
) + >>> print(cube.var_name) + lwcre # doctest: +SKIP """ if short_name == cubes[0].var_name: From 7ec32814f5fffc070f47f63005b5042d7aa2b076 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 09:59:39 +0100 Subject: [PATCH 66/85] Load default data sources in global session fixture and fix first tests --- tests/conftest.py | 40 +++++++++++++++++++++++++++++- tests/unit/test_dataset.py | 50 +++----------------------------------- 2 files changed, 43 insertions(+), 47 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 46cabf58f9..3c19e4c4df 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import warnings from copy import deepcopy from functools import lru_cache @@ -7,6 +8,7 @@ import numpy as np import pytest +import yaml from cf_units import Unit from iris.coords import ( AncillaryVariable, @@ -17,6 +19,7 @@ ) from iris.cube import Cube +import esmvalcore from esmvalcore.config import CFG, Config if TYPE_CHECKING: @@ -55,6 +58,33 @@ def ignore_existing_user_config( monkeypatch.setattr(CFG, "_mapping", cfg_default._mapping) +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + @pytest.fixture def session( tmp_path: Path, @@ -63,7 +93,15 @@ def session( ) -> Session: """Session object with default settings.""" monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") - return CFG.start_session("recipe_test") + session = CFG.start_session("recipe_test") + projects = _load_default_data_sources()["projects"] + for project in projects: + print(project) + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session @pytest.fixture diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 071480d1fb..b6aa3a8bab 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,16 +1,13 @@ from __future__ import annotations -import importlib.resources import logging import textwrap from collections import defaultdict -from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING import pyesgf import pytest -import yaml import esmvalcore.dataset import esmvalcore.io.esgf @@ -25,45 +22,6 @@ from esmvalcore.typing import Facets -@lru_cache -def _load_default_data_sources() -> dict[ - str, - dict[str, dict[str, dict[str, dict[str, str]]]], -]: - """Load default data sources for local users.""" - cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { - "projects": {}, - } - for file in ( - "data-local.yml", - "data-local-esmvaltool.yml", - "data-native-cesm.yml", - "data-native-emac.yml", - "data-native-icon.yml", - "data-native-ipslcm.yml", - ): - with importlib.resources.as_file( - importlib.resources.files(esmvalcore.config) - / 
"configurations" - / file, - ) as config_file: - content = config_file.read_text(encoding="utf-8") - cfg["projects"].update(yaml.safe_load(content)["projects"]) - return cfg - - -@pytest.fixture -def session(tmp_path: Path, session: Session) -> Session: - """Session fixture with default local data sources.""" - projects = _load_default_data_sources()["projects"] - for project in projects: - data_sources = projects[project]["data"] - for data_source in data_sources.values(): - data_source["rootpath"] = str(tmp_path) - session["projects"][project]["data"] = data_sources - return session - - def test_repr(): ds = Dataset(short_name="tas", dataset="dataset1") @@ -2267,7 +2225,7 @@ def test_set_version_non_derived_var(): assert dataset.supplementaries[0].facets["version"] == "v3" -def test_set_version_derived_var(monkeypatch): +def test_set_version_derived_var(monkeypatch, session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="areacella") dataset.files = [] @@ -2868,7 +2826,7 @@ def test_derivation_necessary_no_force_derivation_no_files( assert dataset._derivation_necessary() is True -def test_derivation_necessary_no_force_derivation_no_files_glob(): +def test_derivation_necessary_no_force_derivation_no_files_glob(session): dataset = Dataset( **{**OBS6_SAT_FACETS, "timerange": "*"}, short_name="lwcre", @@ -2952,7 +2910,7 @@ def test_add_derived_supplementary_to_derived(): assert dataset.supplementaries[0] == expected_supplementary -def test_input_datasets_derivation(): +def test_input_datasets_derivation(session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="pr") @@ -3008,7 +2966,7 @@ def test_input_datasets_no_force_derivation(tmp_path, session): assert dataset.input_datasets == [dataset] -def test_input_datasets_no_derivation_available(): +def test_input_datasets_no_derivation_available(session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) msg = r"Cannot derive variable 'tas': no derivation script available" From 2bfc1fa5a5b5a028e1af908e8ee0ecf65b37b9d6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 11:08:40 +0100 Subject: [PATCH 67/85] Fixed recipe test --- tests/integration/recipe/test_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 5d12e15219..7ebddfdfde 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1707,7 +1707,7 @@ def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, assert dataset["alias"] == "CORDEX_ICHEC-EC-EARTH" else: assert dataset["alias"] == "CORDEX_MIROC-MIROC5" - elif dataset["version"] == 1: + elif dataset["version"] == "1": assert dataset["alias"] == "OBS_1" else: assert dataset["alias"] == "OBS_2" From f0f2b6e61ca90f49a035fdc21c193077be03362f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 12:47:03 +0100 Subject: [PATCH 68/85] We don't need to raise an error if no files are found when updating time ranges --- esmvalcore/dataset.py | 5 +++-- tests/unit/test_dataset.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index f147e54922..8ebee6aac8 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -1112,8 +1112,9 @@ def _update_timerange(self) -> None: dataset = self.copy() dataset.facets.pop("timerange") 
dataset.supplementaries = [] - check.data_availability(dataset) - if all("timerange" in f.facets for f in dataset.files): + if dataset.files and all( + "timerange" in f.facets for f in dataset.files + ): # "timerange" can only be reliably computed when all DataElements # provide it. intervals = [ diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index b6aa3a8bab..524f966076 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2333,9 +2333,8 @@ def test_update_timerange_no_files(session, search_data): } dataset = Dataset(**variable) dataset.files = [] - msg = r"Missing data for Dataset: tas, Amon, CMIP6, HadGEM3-GC31-LL.*" - with pytest.raises(InputFilesNotFound, match=msg): - dataset._update_timerange() + dataset._update_timerange() + assert "timerange" not in dataset.facets def test_update_timerange_typeerror(): From cffdeeac1be3ddb2e2e0a58af03cf74a838f7f32 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 15:08:34 +0100 Subject: [PATCH 69/85] Fixed existing tests and add one for data with unavailable years --- esmvalcore/dataset.py | 2 +- tests/unit/test_dataset.py | 95 ++++++++++++++++++++++++++++++++------ 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 8ebee6aac8..f2fd2e7b23 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -325,7 +325,7 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 all_datasets[-1].append((updated_facets, new_ds)) # Only consider those datasets that contain all input variables - # necessary for derivation + # necessary for derivation with the same facets (e.g., skip those where provided timeranges are different) for updated_facets, new_ds in all_datasets[0]: other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] if all(updated_facets in facets for facets in other_facets): diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 524f966076..5a14f38d5f 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1253,6 +1253,18 @@ def rlut_file(tmp_path): return rlut +@pytest.fixture +def rlut_file_future(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc", + ) + rlut.touch() + return rlut + + @pytest.fixture def rlut_file_ground(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" @@ -1332,7 +1344,7 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): expected_input_dataset.session = session assert datasets[0].input_datasets == [expected_input_dataset] - assert expected_input_dataset.files == [lwcre_file] + assert datasets[0].input_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1418,8 +1430,8 @@ def test_from_files_with_derived_no_derivation_glob( strict=True, ): assert dataset.input_datasets == [expected] - assert expected_input_datasets[0].files == [lwcre_file_ground] - assert expected_input_datasets[1].files == [lwcre_file] + assert datasets[0].input_datasets[0].files == [lwcre_file_ground] + assert datasets[1].input_datasets[0].files == [lwcre_file] def test_from_files_with_derived(rlut_file, rlutcs_file, session): @@ -1466,8 +1478,65 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): expected_ds.session = session assert datasets[0].input_datasets == expected_input_datasets - assert expected_input_datasets[0].files == [rlut_file] - 
assert expected_input_datasets[1].files == [rlutcs_file]
+    assert dataset.input_datasets[0].files == [rlut_file]
+    assert dataset.input_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_unavailable_years(
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and unavailable years."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_input_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_input_datasets:
+        expected_ds.session = session
+
+    assert datasets[0].input_datasets == expected_input_datasets
+    assert dataset.input_datasets[0].files == []
+    assert dataset.input_datasets[1].files == []
 
 
 @pytest.mark.parametrize("timerange", ["1980/2000", "*"])
@@ -1528,8 +1597,8 @@ def test_from_files_with_derived_glob(
         expected_ds.session = session
 
     assert datasets[0].input_datasets == expected_input_datasets
-    assert expected_input_datasets[0].files == [rlut_file]
-    assert expected_input_datasets[1].files == [rlutcs_file]
+    assert datasets[0].input_datasets[0].files == [rlut_file]
+    assert datasets[0].input_datasets[1].files == [rlutcs_file]
 
     log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"]
     msg = "Not all necessary input variables to derive 'lwcre' are available"
@@ -1677,8 +1746,8 @@ def test_from_files_with_derived_no_force_derivation_glob(  # noqa: PLR0913
         strict=True,
    ):
         assert dataset.input_datasets == [expected]
-    assert expected_input_datasets[0].files == [lwcre_file_ground]
-    assert expected_input_datasets[1].files == [lwcre_file]
+    assert datasets[0].input_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].input_datasets[0].files == [lwcre_file]
 
 
 def test_from_files_with_derived_force_derivation(
@@ -1742,8 +1811,8 @@ def test_from_files_with_derived_force_derivation(
         expected_ds.session = session
 
     assert datasets[0].input_datasets == expected_input_datasets
-    assert expected_input_datasets[0].files == [rlut_file]
-    assert expected_input_datasets[1].files == [rlutcs_file]
+    assert dataset.input_datasets[0].files == [rlut_file]
+    assert dataset.input_datasets[1].files == [rlutcs_file]
 
 
 @pytest.mark.parametrize("timerange", ["1980/2000", "*"])
@@ -1814,8 +1883,8 @@ def test_from_files_with_derived_force_derivation_glob(  # noqa: PLR0913
         expected_ds.session = session
 
     assert datasets[0].input_datasets == expected_input_datasets
-    assert expected_input_datasets[0].files == [rlut_file]
-    assert expected_input_datasets[1].files == [rlutcs_file]
+    assert datasets[0].input_datasets[0].files == [rlut_file]
+    assert datasets[0].input_datasets[1].files == [rlutcs_file]
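[Note on the `dataset.files and all(...)` guard in the dataset.py hunk of PATCH 69 above: `all()` over an empty iterable is vacuously true, so without the extra check an empty file list would look as if it provided a complete timerange. A minimal self-contained sketch of the same logic; `FakeFile` is a hypothetical stand-in, not ESMValCore's real DataElement class:

from dataclasses import dataclass, field


@dataclass
class FakeFile:
    facets: dict = field(default_factory=dict)  # only the facets mapping matters here


def can_compute_timerange(files: list) -> bool:
    # The non-emptiness check must come first: all() over [] is True.
    return bool(files) and all("timerange" in f.facets for f in files)


assert not can_compute_timerange([])                                  # no files at all
assert not can_compute_timerange([FakeFile()])                        # file without timerange
assert can_compute_timerange([FakeFile({"timerange": "1980/2000"})])  # complete information
]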
log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] msg = "Not all necessary input variables to derive 'lwcre' are available" From dec25bc6b530bc3409eb3d887357840ecd7c6441 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 16:28:00 +0100 Subject: [PATCH 70/85] Use static methods to make sure that original Dataset instance is not overwritten --- esmvalcore/dataset.py | 59 +++++++++++++++++++++----------------- tests/unit/test_dataset.py | 2 +- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index f2fd2e7b23..3b8ef0092b 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -235,12 +235,11 @@ def input_datasets(self) -> list[Dataset]: return self._input_datasets if not self._derivation_necessary(): - input_datasets = [self] + self._input_datasets = [self] else: - input_datasets = self._get_input_datasets() + self._input_datasets = self._get_input_datasets() - self._input_datasets = input_datasets - return input_datasets + return self._input_datasets @staticmethod def _file_to_dataset( @@ -279,10 +278,12 @@ def _file_to_dataset( return new_dataset - def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: C901 """Yield datasets based on the available files. - This function requires that self.facets['mip'] is not a glob pattern. + This function requires that dataset.facets['mip'] is not a glob + pattern. Does take variable derivation into account, i.e., datasets available through variable derivation are returned. @@ -291,41 +292,42 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 datasets_found = False # If no forced derivation is requested, search for datasets based on - # files from self - if not self._is_force_derived(): - for dataset in self._get_available_datasets(self): + # files from dataset + if not dataset._is_force_derived(): + for available_ds in Dataset._get_available_datasets(dataset): datasets_found = True - yield dataset + yield available_ds # For variables that cannot be derived, we are done here - if not self._is_derived(): + if not dataset._is_derived(): return # If forced derivation is requested or no datasets based on files from - # self have been found, search for datasets based on files from input - # datasets - if self._is_force_derived() or not datasets_found: + # dataset have been found, search for datasets based on files from + # input datasets + if dataset._is_force_derived() or not datasets_found: all_datasets: list[list[tuple[dict, Dataset]]] = [] - for input_dataset in self._get_input_datasets(): + for input_dataset in dataset._get_input_datasets(): all_datasets.append([]) - for expanded_ds in self._get_available_datasets( + for expanded_ds in Dataset._get_available_datasets( input_dataset, ): updated_facets = {} - for key, value in self.facets.items(): + for key, value in dataset.facets.items(): if _isglob(value): if key in expanded_ds.facets and not _isglob( expanded_ds[key], ): updated_facets[key] = expanded_ds.facets[key] - new_ds = self.copy() + new_ds = dataset.copy() new_ds.facets.update(updated_facets) - new_ds.supplementaries = self.supplementaries + new_ds.supplementaries = dataset.supplementaries all_datasets[-1].append((updated_facets, new_ds)) - # Only consider those datasets that contain all input variables - # necessary for derivation with the same facets (e.g., skip those where provided timeranges 
are different) + # Only consider those datasets that contain all required variables + # with identical facets (e.g., skip those with different + # timeranges) for updated_facets, new_ds in all_datasets[0]: other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] if all(updated_facets in facets for facets in other_facets): @@ -334,12 +336,13 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 logger.debug( "Not all necessary input variables to derive '%s' are " "available for %s with facets %s", - self["short_name"], + dataset["short_name"], new_ds.summary(shorten=True), updated_facets, ) - def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: + @staticmethod + def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. @@ -357,13 +360,13 @@ def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - new_dataset = self._file_to_dataset(dataset, file) + new_dataset = Dataset._file_to_dataset(dataset, file) # Do not use the timerange facet from the file because there may be # multiple files per dataset. new_dataset.facets.pop("timerange", None) # Restore the original timerange facet if it was specified. - if "timerange" in self.facets: - new_dataset.facets["timerange"] = self.facets["timerange"] + if "timerange" in dataset.facets: + new_dataset.facets["timerange"] = dataset.facets["timerange"] # Filter out identical datasets facetset = frozenset( @@ -455,7 +458,9 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 + for dataset in self._get_all_available_datasets( + dataset_template, + ): dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5a14f38d5f..17ecd93535 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1657,7 +1657,7 @@ def test_from_files_with_derived_no_force_derivation( expected_input_dataset.session = session assert datasets[0].input_datasets == [expected_input_dataset] - assert expected_input_dataset.files == [lwcre_file] + assert datasets[0].input_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) From ff0cdd532d577a794e33647db7e5b299b776eb20 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 17:22:29 +0100 Subject: [PATCH 71/85] input_datasets -> required_datasets --- esmvalcore/_recipe/recipe.py | 12 +- esmvalcore/_recipe/to_datasets.py | 23 ++-- esmvalcore/dataset.py | 52 ++++---- esmvalcore/preprocessor/_derive/__init__.py | 6 +- tests/integration/recipe/test_recipe.py | 10 +- tests/unit/recipe/test_to_datasets.py | 8 +- tests/unit/test_dataset.py | 129 +++++++++++--------- 7 files changed, 127 insertions(+), 113 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index a3b14c99f8..d54d35fba5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -52,7 +52,7 @@ from . 
import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, ) @@ -670,8 +670,8 @@ def _get_preprocessor_products( _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) - input_datasets = _get_input_datasets(dataset) - missing = _check_input_files(input_datasets) + required_datasets = _get_required_datasets(dataset) + missing = _check_input_files(required_datasets) if missing: if _allow_skipping(dataset): logger.info("Skipping: %s", missing) @@ -680,15 +680,15 @@ def _get_preprocessor_products( continue dataset.set_version() USED_DATASETS.append(dataset) - _schedule_for_download(input_datasets) - _log_input_files(input_datasets) + _schedule_for_download(required_datasets) + _log_input_files(required_datasets) logger.info("Found input files for %s", dataset.summary(shorten=True)) filename = _get_preprocessor_filename(dataset) product = PreprocessorFile( filename=filename, attributes=dataset.facets, settings=settings, - datasets=input_datasets, + datasets=required_datasets, ) products.add(product) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index b01e04707a..e992c767f8 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -489,30 +489,33 @@ def _report_unexpanded_globs( return msg -def _get_input_datasets(dataset: Dataset) -> list[Dataset]: - """Determine the input datasets needed for deriving `dataset`.""" +def _get_required_datasets(dataset: Dataset) -> list[Dataset]: + """Determine the datasets required for deriving `dataset`.""" if not dataset._derivation_necessary(): # noqa: SLF001 - return dataset.input_datasets + return dataset.required_datasets # Skip optional datasets if no data is available - input_datasets: list[Dataset] = [] - for input_dataset in dataset.input_datasets: - if input_dataset.facets.get("optional") and not input_dataset.files: + required_datasets: list[Dataset] = [] + for required_dataset in dataset.required_datasets: + if ( + required_dataset.facets.get("optional") + and not required_dataset.files + ): logger.info( "Skipping: no data found for %s which is marked as 'optional'", - input_dataset, + required_dataset, ) else: - input_datasets.append(input_dataset) + required_datasets.append(required_dataset) - return input_datasets + return required_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: """Find representative datasets for all input variables.""" copy = dataset.copy() copy.supplementaries = [] - representative_datasets = _get_input_datasets(copy) + representative_datasets = _get_required_datasets(copy) for representative_dataset in representative_datasets: representative_dataset.supplementaries = dataset.supplementaries return representative_datasets diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 3b8ef0092b..9e03c3ddbf 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -130,7 +130,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[DataElement] | None = None self._used_data_sources: Sequence[DataSource] = [] - self._input_datasets: list[Dataset] = [] + self._required_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -195,29 +195,29 @@ def _derivation_necessary(self) -> bool: 
return not ds_copy.files - def _get_input_datasets(self) -> list[Dataset]: - """Get input datasets.""" - input_datasets: list[Dataset] = [] + def _get_required_datasets(self) -> list[Dataset]: + """Get required datasets for derivation.""" + required_datasets: list[Dataset] = [] required_vars_facets = get_required( self.facets["short_name"], # type: ignore self.facets["project"], # type: ignore ) for required_facets in required_vars_facets: - input_dataset = self._copy(derive=False, force_derivation=False) + required_dataset = self._copy(derive=False, force_derivation=False) keep = {"alias", "recipe_dataset_index", *self.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep + required_dataset.facets = { + k: v for k, v in required_dataset.facets.items() if k in keep } - input_dataset.facets.update(required_facets) - input_dataset.augment_facets() - input_datasets.append(input_dataset) + required_dataset.facets.update(required_facets) + required_dataset.augment_facets() + required_datasets.append(required_dataset) - return input_datasets + return required_datasets @property - def input_datasets(self) -> list[Dataset]: - """Get input datasets. + def required_datasets(self) -> list[Dataset]: + """Get required datasets. For non-derived variables (i.e., those with facet ``derive=False``), this will simply return the dataset itself in a list. @@ -231,15 +231,15 @@ def input_datasets(self) -> list[Dataset]: See also :func:`esmvalcore.preprocessor.derive` for an example usage. """ - if self._input_datasets: - return self._input_datasets + if self._required_datasets is not None: + return self._required_datasets if not self._derivation_necessary(): - self._input_datasets = [self] + self._required_datasets = [self] else: - self._input_datasets = self._get_input_datasets() + self._required_datasets = self._get_required_datasets() - return self._input_datasets + return self._required_datasets @staticmethod def _file_to_dataset( @@ -304,13 +304,13 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: # If forced derivation is requested or no datasets based on files from # dataset have been found, search for datasets based on files from - # input datasets + # required datasets if dataset._is_force_derived() or not datasets_found: all_datasets: list[list[tuple[dict, Dataset]]] = [] - for input_dataset in dataset._get_input_datasets(): + for required_dataset in dataset.required_datasets: all_datasets.append([]) for expanded_ds in Dataset._get_available_datasets( - input_dataset, + required_dataset, ): updated_facets = {} for key, value in dataset.facets.items(): @@ -334,7 +334,7 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: yield new_ds else: logger.debug( - "Not all necessary input variables to derive '%s' are " + "Not all variables required to derive '%s' are " "available for %s with facets %s", dataset["short_name"], new_ds.summary(shorten=True), @@ -429,8 +429,8 @@ def from_files(self) -> Iterator[Dataset]: dataset for those facets listed in :obj:`INHERITED_FACETS`. This also works for :ref:`derived variables `. The - input datasets that are necessary for derivation can be accessed via - :attr:`Dataset.input_datasets`. + datasets required for derivation can be accessed via + :attr:`Dataset.required_datasets`. 
Examples -------- @@ -748,8 +748,8 @@ def _get_version(dataset: Dataset) -> str | list[str]: def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for input_dataset in self.input_datasets: - version = self._get_version(input_dataset) + for required_dataset in self.required_datasets: + version = self._get_version(required_dataset) if version: if isinstance(version, list): versions.update(version) diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 3aa3b74e2c..3817b86bc7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -107,8 +107,8 @@ def derive( Examples -------- - Input variables for derivation can be obtained via - :attr:`esmvalcore.dataset.Dataset.input_datasets`. + Required variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.required_datasets`. For example, to derive the longwave cloud radiative effect (LWCRE) for the model CESM2, you can use: @@ -126,7 +126,7 @@ def derive( ... mip="Amon", ... derive=True, ... ) - >>> cubes = [d.load() for d in dataset.input_datasets] + >>> cubes = [d.load() for d in dataset.required_datasets] >>> cube = derive( ... cubes, ... short_name="lwcre", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 7ebddfdfde..0a8fbc4f79 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -22,7 +22,7 @@ import esmvalcore.io.esgf import esmvalcore.io.local from esmvalcore._recipe.recipe import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, read_recipe_file, ) @@ -2609,7 +2609,7 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", - # Added/changed by Dataset._get_input_datasets() + # Added/changed by Dataset._get_required_datasets() "derive": False, "force_derivation": False, } @@ -2679,7 +2679,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", - # Added/changed by Dataset._get_input_datasets() + # Added/changed by Dataset._get_required_datasets() "derive": False, "force_derivation": False, } @@ -2704,14 +2704,14 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", - # Added/changed by Dataset._get_input_datasets() + # Added/changed by Dataset._get_required_datasets() "derive": False, "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session - alb_derive_input = _get_input_datasets(alb) + alb_derive_input = _get_required_datasets(alb) assert alb_derive_input == [rsdscs, rsuscs] diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 1c279eafa6..443ec9b80a 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -285,7 +285,7 @@ def test_merge_supplementaries_missing_short_name_fails(session): Dataset.from_recipe(recipe_txt, session) -def test_get_input_datasets_derive(session): +def test_get_required_datasets_derive(session): dataset = Dataset( dataset="ERA5", project="native6", @@ -300,7 +300,7 @@ def test_get_input_datasets_derive(session): type="reanaly", version="v1", ) - rlds, rlns = to_datasets._get_input_datasets(dataset) + rlds, rlns = 
to_datasets._get_required_datasets(dataset) assert rlds["short_name"] == "rlds" assert rlds["long_name"] == "Surface Downwelling Longwave Radiation" assert rlds["frequency"] == "1hr" @@ -309,7 +309,7 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" -def test_get_input_datasets_derive_optional(caplog, tmp_path, session): +def test_get_required_datasets_optional(caplog, tmp_path, session): facets = { "project": "OBS6", "dataset": "SAT", @@ -333,7 +333,7 @@ def test_get_input_datasets_derive_optional(caplog, tmp_path, session): dataset.session = session with caplog.at_level(logging.INFO): - datasets = to_datasets._get_input_datasets(dataset) + datasets = to_datasets._get_required_datasets(dataset) expected = Dataset( dataset="SAT", diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 17ecd93535..e8cd1ca67a 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1317,7 +1317,7 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [] - expected_input_dataset = Dataset( + expected_required_dataset = Dataset( **OBS6_SAT_FACETS, short_name="lwcre", derive=True, @@ -1328,7 +1328,7 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): standard_name="", units="W m-2", ) - expected_input_dataset.supplementaries = [ + expected_required_dataset.supplementaries = [ Dataset( **OBS6_SAT_FACETS, short_name="pr", @@ -1341,10 +1341,11 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): units="kg m-2 s-1", ), ] - expected_input_dataset.session = session + expected_required_dataset.session = session - assert datasets[0].input_datasets == [expected_input_dataset] - assert datasets[0].input_datasets[0].files == [lwcre_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == [expected_required_dataset] + assert required_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1384,7 +1385,7 @@ def test_from_files_with_derived_no_derivation_glob( assert datasets[1].files == [lwcre_file] assert datasets[1].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **{**OBS6_SAT_FACETS, "type": "ground"}, short_name="lwcre", @@ -1408,7 +1409,7 @@ def test_from_files_with_derived_no_derivation_glob( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.supplementaries = [ Dataset( **OBS6_SAT_FACETS, @@ -1426,12 +1427,12 @@ def test_from_files_with_derived_no_derivation_glob( for dataset, expected in zip( datasets, - expected_input_datasets, + expected_required_datasets, strict=True, ): - assert dataset.input_datasets == [expected] - assert datasets[0].input_datasets[0].files == [lwcre_file_ground] - assert datasets[1].input_datasets[0].files == [lwcre_file] + assert dataset.required_datasets == [expected] + assert datasets[0].required_datasets[0].files == [lwcre_file_ground] + assert datasets[1].required_datasets[0].files == [lwcre_file] def test_from_files_with_derived(rlut_file, rlutcs_file, session): @@ -1450,7 +1451,7 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): assert datasets[0].files == [] assert datasets[0].supplementaries[0].files == [] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1474,12 +1475,13 @@ def 
test_from_files_with_derived(rlut_file, rlutcs_file, session): units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert dataset.input_datasets[0].files == [rlut_file] - assert dataset.input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] def test_from_files_with_derived_unavailable_years( @@ -1507,7 +1509,7 @@ def test_from_files_with_derived_unavailable_years( assert datasets == [expected] assert datasets[0].files == [] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **{**OBS6_SAT_FACETS, "timerange": "2010/2015"}, short_name="rlut", @@ -1531,12 +1533,13 @@ def test_from_files_with_derived_unavailable_years( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert dataset.input_datasets[0].files == [] - assert dataset.input_datasets[1].files == [] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1569,7 +1572,7 @@ def test_from_files_with_derived_glob( assert datasets[0].files == [] assert datasets[0].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1593,15 +1596,16 @@ def test_from_files_with_derived_glob( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert datasets[0].input_datasets[0].files == [rlut_file] - assert datasets[0].input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = "Not all necessary input variables to derive 'lwcre' are available" + msg = "Not all variables required to derive 'lwcre' are available" for log_debug in log_debugs: if msg in log_debug: break @@ -1630,7 +1634,7 @@ def test_from_files_with_derived_no_force_derivation( assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [] - expected_input_dataset = Dataset( + expected_required_dataset = Dataset( **OBS6_SAT_FACETS, short_name="lwcre", derive=True, @@ -1641,7 +1645,7 @@ def test_from_files_with_derived_no_force_derivation( standard_name="", units="W m-2", ) - expected_input_dataset.supplementaries = [ + expected_required_dataset.supplementaries = [ Dataset( **OBS6_SAT_FACETS, short_name="pr", @@ -1654,10 +1658,11 @@ def test_from_files_with_derived_no_force_derivation( units="kg m-2 s-1", ), ] - expected_input_dataset.session = session + expected_required_dataset.session = session - assert datasets[0].input_datasets == [expected_input_dataset] - assert datasets[0].input_datasets[0].files == [lwcre_file] 
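[Note on the caching change in this patch: `_required_datasets` now defaults to `None` instead of an empty list (see the `__init__` and property hunks above), so that an empty result can also be cached. A minimal sketch of that sentinel pattern; the class and method names below are illustrative only, not ESMValCore code:

from __future__ import annotations


class Derivable:
    """Illustrative stand-in for the caching behaviour of Dataset."""

    def __init__(self) -> None:
        self._required: list[str] | None = None  # None means "not computed yet"
        self.n_computations = 0

    def _compute_required(self) -> list[str]:
        self.n_computations += 1
        return []  # an empty list is a legitimate, cacheable result

    @property
    def required_datasets(self) -> list[str]:
        if self._required is None:  # sentinel check, not truthiness
            self._required = self._compute_required()
        return self._required


obj = Derivable()
assert obj.required_datasets == []
assert obj.required_datasets == []
assert obj.n_computations == 1  # with `if not self._required:` this would be 2
]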
+ required_datasets = datasets[0].required_datasets + assert required_datasets == [expected_required_dataset] + assert required_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1700,7 +1705,7 @@ def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 assert datasets[1].files == [lwcre_file] assert datasets[1].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **{**OBS6_SAT_FACETS, "type": "ground"}, short_name="lwcre", @@ -1724,7 +1729,7 @@ def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.supplementaries = [ Dataset( **OBS6_SAT_FACETS, @@ -1742,12 +1747,12 @@ def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 for dataset, expected in zip( datasets, - expected_input_datasets, + expected_required_datasets, strict=True, ): - assert dataset.input_datasets == [expected] - assert datasets[0].input_datasets[0].files == [lwcre_file_ground] - assert datasets[1].input_datasets[0].files == [lwcre_file] + assert dataset.required_datasets == [expected] + assert datasets[0].required_datasets[0].files == [lwcre_file_ground] + assert datasets[1].required_datasets[0].files == [lwcre_file] def test_from_files_with_derived_force_derivation( @@ -1781,7 +1786,7 @@ def test_from_files_with_derived_force_derivation( assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1807,12 +1812,13 @@ def test_from_files_with_derived_force_derivation( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert dataset.input_datasets[0].files == [rlut_file] - assert dataset.input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1853,7 +1859,7 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1879,15 +1885,16 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert datasets[0].input_datasets[0].files == [rlut_file] - assert datasets[0].input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = "Not all necessary input variables to derive 'lwcre' are available" + msg = "Not all variables required to derive 'lwcre' are available" for 
log_debug in log_debugs:
         if msg in log_debug:
             break
     else:
         pytest.fail(f"No debug message '{msg}'")

From de27a4b4c13c50b2fd7037db4e7d0d25e551730d Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 13 Jan 2026 16:26:49 +0100
Subject: [PATCH 72/85] Use bools for facet values where appropriate

---
 esmvalcore/preprocessor/_derive/siextent.py      | 4 ++--
 tests/unit/preprocessor/_derive/test_siextent.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/esmvalcore/preprocessor/_derive/siextent.py b/esmvalcore/preprocessor/_derive/siextent.py
index 27aee25aec..d0beff2cbe 100644
--- a/esmvalcore/preprocessor/_derive/siextent.py
+++ b/esmvalcore/preprocessor/_derive/siextent.py
@@ -20,8 +20,8 @@ class DerivedVariable(DerivedVariableBase):
     def required(project):  # noqa: ARG004
         """Declare the variables needed for derivation."""
         return [
-            {"short_name": "sic", "optional": "true"},
-            {"short_name": "siconca", "optional": "true"},
+            {"short_name": "sic", "optional": True},
+            {"short_name": "siconca", "optional": True},
         ]
 
     @staticmethod
diff --git a/tests/unit/preprocessor/_derive/test_siextent.py b/tests/unit/preprocessor/_derive/test_siextent.py
index 
ae9f5d1c8f..416c9ac17b 100644 --- a/tests/unit/preprocessor/_derive/test_siextent.py +++ b/tests/unit/preprocessor/_derive/test_siextent.py @@ -113,6 +113,6 @@ def test_siextent_required(): derived_var = siextent.DerivedVariable() output = derived_var.required(None) assert output == [ - {"short_name": "sic", "optional": "true"}, - {"short_name": "siconca", "optional": "true"}, + {"short_name": "sic", "optional": True}, + {"short_name": "siconca", "optional": True}, ] From d8f5d08a690fc5a7692ecb7a4e774fa9b852234e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 13 Jan 2026 16:39:08 +0100 Subject: [PATCH 73/85] Simplify _get_all_available_datasets --- esmvalcore/dataset.py | 91 +++++++++++++++++++++++--------------- tests/unit/test_dataset.py | 16 ------- 2 files changed, 56 insertions(+), 51 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 9e03c3ddbf..870f046fdc 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -279,7 +279,23 @@ def _file_to_dataset( return new_dataset @staticmethod - def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: C901 + def _get_expanded_globs( + dataset_with_globs: Dataset, + dataset_with_expanded_globs: Dataset, + ) -> tuple[tuple[str, FacetValue], ...]: + """Get facets that have been updated by expanding globs.""" + expanded_globs: dict[str, FacetValue] = {} + for key, value in dataset_with_globs.facets.items(): + if ( + _isglob(value) + and key in dataset_with_expanded_globs.facets + and not _isglob(dataset_with_expanded_globs[key]) + ): + expanded_globs[key] = dataset_with_expanded_globs[key] + return tuple(expanded_globs.items()) + + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. 
This function requires that dataset.facets['mip'] is not a glob
@@ -303,43 +319,48 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]:  # noqa: C901
             return
 
         # If forced derivation is requested or no datasets based on files from
-        # dataset have been found, search for datasets based on files from
+        # dataset have been found, search for datasets based on files from the
         # required datasets
         if dataset._is_force_derived() or not datasets_found:
-            all_datasets: list[list[tuple[dict, Dataset]]] = []
+            # Record all expanded globs from first non-optional required
+            # dataset (called "reference_dataset" hereafter)
+            non_optional_datasets = [
+                d
+                for d in dataset.required_datasets
+                if not d.facets.get("optional", False)
+            ]
+            if not non_optional_datasets:
+                msg = (
+                    f"Unable to retrieve available datasets for derived "
+                    f"variable '{dataset.facets['short_name']}', all "
+                    f"variables required for derivation are marked as "
+                    f"'optional'"
+                )
+                raise ValueError(msg)
+            reference_dataset = non_optional_datasets[0]
+            reference_expanded_globs = {
+                Dataset._get_expanded_globs(dataset, ds)
+                for ds in Dataset._get_available_datasets(reference_dataset)
+            }
+
+            # Iterate through all other required datasets and only keep those
+            # expanded globs which are present for all other non-optional
+            # required datasets
             for required_dataset in dataset.required_datasets:
-                all_datasets.append([])
-                for expanded_ds in Dataset._get_available_datasets(
-                    required_dataset,
-                ):
-                    updated_facets = {}
-                    for key, value in dataset.facets.items():
-                        if _isglob(value):
-                            if key in expanded_ds.facets and not _isglob(
-                                expanded_ds[key],
-                            ):
-                                updated_facets[key] = expanded_ds.facets[key]
-                    new_ds = dataset.copy()
-                    new_ds.facets.update(updated_facets)
-                    new_ds.supplementaries = dataset.supplementaries
-
-                    all_datasets[-1].append((updated_facets, new_ds))
-
-            # Only consider those datasets that contain all required variables
-            # with identical facets (e.g., skip those with different
-            # timeranges)
-            for updated_facets, new_ds in all_datasets[0]:
-                other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]]
-                if all(updated_facets in facets for facets in other_facets):
-                    yield new_ds
-                else:
-                    logger.debug(
-                        "Not all variables required to derive '%s' are "
-                        "available for %s with facets %s",
-                        dataset["short_name"],
-                        new_ds.summary(shorten=True),
-                        updated_facets,
-                    )
+                if required_dataset is reference_dataset:
+                    continue
+                new_expanded_globs = {
+                    Dataset._get_expanded_globs(dataset, ds)
+                    for ds in Dataset._get_available_datasets(required_dataset)
+                }
+                reference_expanded_globs &= new_expanded_globs
+
+            # Use the final expanded globs to create new datasets
+            for expanded_globs in reference_expanded_globs:
+                new_ds = dataset.copy()
+                new_ds.facets.update(expanded_globs)
+                new_ds.supplementaries = dataset.supplementaries
+                yield new_ds
 
     @staticmethod
     def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]:
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index e8cd1ca67a..a44b80b7f3 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -1604,14 +1604,6 @@ def test_from_files_with_derived_glob(
     assert required_datasets[0].files == [rlut_file]
     assert required_datasets[1].files == [rlutcs_file]
 
-    log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"]
-    msg = "Not all variables required to derive 'lwcre' are available"
-    for log_debug in log_debugs:
-        if msg in log_debug:
-            break
-    else:
-        pytest.fail(f"No debug message '{msg}'")
-
 
 
 def 
test_from_files_with_derived_no_force_derivation( lwcre_file, @@ -1893,14 +1885,6 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert required_datasets[0].files == [rlut_file] assert required_datasets[1].files == [rlutcs_file] - log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = "Not all variables required to derive 'lwcre' are available" - for log_debug in log_debugs: - if msg in log_debug: - break - else: - pytest.fail(f"No debug message '{msg}'") - def test_match(): dataset1 = Dataset( From 096248983b44d7a950ed783d81ab071d88e7a671 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 13 Jan 2026 21:44:25 +0100 Subject: [PATCH 74/85] Simplify _get_all_available_datasets --- esmvalcore/_recipe/to_datasets.py | 15 ++--- esmvalcore/dataset.py | 84 +++++++++++---------------- tests/unit/recipe/test_to_datasets.py | 2 +- 3 files changed, 38 insertions(+), 63 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index e992c767f8..94a38afdbf 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -419,16 +419,13 @@ def _dataset_from_files(dataset: Dataset) -> list[Dataset]: dataset.summary(shorten=True), ) + # All the magic happens in Dataset.from_files. Here, we simply check if any + # wildcards have not been expanded and raise proper errors if necessary. for expanded_ds in dataset.from_files(): - updated_facets = {} unexpanded_globs = {} for key, value in dataset.facets.items(): if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: + if key not in expanded_ds.facets or _isglob(expanded_ds[key]): unexpanded_globs[key] = value if unexpanded_globs: @@ -440,11 +437,7 @@ def _dataset_from_files(dataset: Dataset) -> list[Dataset]: errors.append(msg) continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries - - result.append(new_ds) + result.append(expanded_ds) if errors: raise RecipeError("\n".join(errors)) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 870f046fdc..50ac2defab 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -183,7 +183,7 @@ def _derivation_necessary(self) -> bool: if self._is_force_derived(): return True - # Otherwise, derivation is necessary of no files for the self dataset + # Otherwise, derivation is necessary if no files for the self dataset # are found ds_copy = self.copy() ds_copy.supplementaries = [] @@ -305,62 +305,44 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: through variable derivation are returned. 
""" - datasets_found = False + if not dataset._derivation_necessary(): + yield from Dataset._get_available_datasets(dataset) + return - # If no forced derivation is requested, search for datasets based on - # files from dataset - if not dataset._is_force_derived(): - for available_ds in Dataset._get_available_datasets(dataset): - datasets_found = True - yield available_ds + # Since we are in full control of the derived variables (the module is + # private; no custom derivation functions are possible), we can be sure + # that the following list is never empty + non_optional_datasets = [ + d + for d in dataset.required_datasets + if not d.facets.get("optional", False) + ] - # For variables that cannot be derived, we are done here - if not dataset._is_derived(): - return + # Record all expanded globs from first non-optional required dataset + # (called "reference_dataset" hereafter) + reference_dataset = non_optional_datasets[0] + reference_expanded_globs = { + Dataset._get_expanded_globs(dataset, ds) + for ds in Dataset._get_available_datasets(reference_dataset) + } - # If forced derivation is requested or no datasets based on files from - # dataset have been found, search for datasets based on files from the - # required datasets - if dataset._is_force_derived() or not datasets_found: - # Record all expanded globs from first non-optional required - # dataset (called "reference_dataset" hereafter) - non_optional_datasets = [ - d - for d in dataset.required_datasets - if not d.facets.get("optional", False) - ] - if not non_optional_datasets: - msg = ( - f"Unable to retrieve available datasets for derived " - f"variable '{dataset.facets['short_name']}', all " - f"variables required for dervation are marked as " - f"'optional'" - ) - raise ValueError(msg) - reference_dataset = non_optional_datasets[0] - reference_expanded_globs = { + # Iterate through all other non-optional required datasets and only + # keep those expanded globs which are present for all other + # non-optional required datasets + for required_dataset in non_optional_datasets: + if required_dataset is reference_dataset: + continue + new_expanded_globs = { Dataset._get_expanded_globs(dataset, ds) - for ds in Dataset._get_available_datasets(reference_dataset) + for ds in Dataset._get_available_datasets(required_dataset) } + reference_expanded_globs &= new_expanded_globs - # Iterate through all other required datasets and only keep those - # expanded globs which are present for all other non-optional - # required datasets - for required_dataset in dataset.required_datasets: - if required_dataset is reference_dataset: - continue - new_expanded_globs = { - Dataset._get_expanded_globs(dataset, ds) - for ds in Dataset._get_available_datasets(required_dataset) - } - reference_expanded_globs &= new_expanded_globs - - # Use the final expanded globs to create new datasets - for expanded_globs in reference_expanded_globs: - new_ds = dataset.copy() - new_ds.facets.update(expanded_globs) - new_ds.supplementaries = dataset.supplementaries - yield new_ds + # Use the final expanded globs to create new dataset(s) + for expanded_globs in reference_expanded_globs: + new_ds = dataset.copy() + new_ds.facets.update(expanded_globs) + yield new_ds @staticmethod def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]: diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 443ec9b80a..d07b4a583d 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -344,7 
+344,7 @@ def test_get_required_datasets_optional(caplog, tmp_path, session):
         frequency="mon",
         long_name="Sea-Ice Area Percentage (Atmospheric Grid)",
         modeling_realm=["seaIce"],
-        optional="true",
+        optional=True,
         original_short_name="siconca",
         standard_name="sea_ice_area_fraction",
         tier=2,

From 81da6e7fa06152c486722047f7324c3248ad228e Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Wed, 14 Jan 2026 09:47:37 +0100
Subject: [PATCH 75/85] Using wildcards for derived variables with only
 optional required variables is not possible

---
 esmvalcore/dataset.py                       |  7 ++
 esmvalcore/preprocessor/_derive/amoc.py     |  4 +-
 esmvalcore/preprocessor/_derive/siextent.py |  4 +-
 tests/unit/test_dataset.py                  | 95 +++++++++++++++++++--
 4 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 50ac2defab..d722e87ae9 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -317,6 +317,13 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]:
             for d in dataset.required_datasets
             if not d.facets.get("optional", False)
         ]
+        if not non_optional_datasets:
+            msg = (
+                f"Using wildcards to derive {dataset.summary(shorten=True)} "
+                f"is not possible since the derivation function only "
+                f"requires optional variables"
+            )
+            raise RecipeError(msg)
 
         # Record all expanded globs from first non-optional required dataset
         # (called "reference_dataset" hereafter)
diff --git a/esmvalcore/preprocessor/_derive/amoc.py b/esmvalcore/preprocessor/_derive/amoc.py
index 3607aa1d62..67b179f0dd 100644
--- a/esmvalcore/preprocessor/_derive/amoc.py
+++ b/esmvalcore/preprocessor/_derive/amoc.py
@@ -72,9 +72,7 @@ def calculate(cubes):
                 f"Amoc calculation: {cube_orig} doesn't contain"
                 f" atlantic_arctic_ocean."
             )
-            raise ValueError(
-                msg,
-            )
+            raise ValueError(msg)
 
     # 2: Remove the shallowest 500m to avoid wind driven mixed layer.
     depth_constraint = iris.Constraint(depth=lambda d: d >= 500.0)
diff --git a/esmvalcore/preprocessor/_derive/siextent.py b/esmvalcore/preprocessor/_derive/siextent.py
index d0beff2cbe..5bd2ca82f1 100644
--- a/esmvalcore/preprocessor/_derive/siextent.py
+++ b/esmvalcore/preprocessor/_derive/siextent.py
@@ -53,9 +53,7 @@ def calculate(cubes):
                 "Derivation of siextent failed due to missing variables "
                 "sic and siconca."
            
) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc ones = da.ones_like(sic) siextent_data = da.ma.masked_where(sic.lazy_data() < 15.0, ones) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a44b80b7f3..f27307e7d0 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,6 +1,5 @@ from __future__ import annotations -import logging import textwrap from collections import defaultdict from pathlib import Path @@ -1301,6 +1300,18 @@ def pr_file(tmp_path): return pr +@pytest.fixture +def siconca_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + rlut.touch() + return rlut + + def test_from_files_with_derived_no_derivation(lwcre_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) @@ -1550,7 +1561,6 @@ def test_from_files_with_derived_glob( rlutcs_file, pr_file, session, - caplog, ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( @@ -1561,8 +1571,7 @@ def test_from_files_with_derived_glob( dataset.add_supplementary(short_name="pr") dataset.session = session - with caplog.at_level(logging.DEBUG): - datasets = list(dataset.from_files()) + datasets = list(dataset.from_files()) expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) expected.add_supplementary(short_name="pr") @@ -1823,7 +1832,6 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 rlutcs_file, pr_file, session, - caplog, ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( @@ -1835,8 +1843,7 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 dataset.add_supplementary(short_name="pr") dataset.session = session - with caplog.at_level(logging.DEBUG): - datasets = list(dataset.from_files()) + datasets = list(dataset.from_files()) expected = Dataset( **OBS6_SAT_FACETS, @@ -1886,6 +1893,80 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert required_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_only_optional(siconca_file, pr_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siextent", + derive=True, + ) + dataset.add_supplementary(short_name="pr", mip="Amon") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siextent", + derive=True, + ) + expected.add_supplementary(short_name="pr", mip="Amon") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="sic", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Ocean Grid)", + modeling_realm=["seaIce"], + original_short_name="siconc", + standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + original_short_name="siconca", + 
standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [siconca_file] + + +def test_from_files_with_derived_only_optional_glob_fail(session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon", "type": "*"}, + short_name="siextent", + derive=True, + ) + dataset.add_supplementary(short_name="pr", mip="Amon") + dataset.session = session + + msg = r"Using wildcards to derive .* is not possible" + with pytest.raises(RecipeError, match=msg): + next(dataset.from_files()) + + def test_match(): dataset1 = Dataset( short_name="areacella", From ade0bced772c87fcce81d9ced4c1e23abf031fdc Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 10:06:12 +0100 Subject: [PATCH 76/85] Explicitly cast tuple[tuple] to dict --- esmvalcore/_recipe/to_datasets.py | 2 +- esmvalcore/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 94a38afdbf..50b3364768 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -419,7 +419,7 @@ def _dataset_from_files(dataset: Dataset) -> list[Dataset]: dataset.summary(shorten=True), ) - # All the magic happens in Dataset.from_files. Here, we simply check if any + # The magic happens in Dataset.from_files. Here, we simply check if any # wildcards have not been expanded and raise proper errors if necessary. for expanded_ds in dataset.from_files(): unexpanded_globs = {} diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index d722e87ae9..3413c6601f 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -348,7 +348,7 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # Use the final expanded globs to create new dataset(s) for expanded_globs in reference_expanded_globs: new_ds = dataset.copy() - new_ds.facets.update(expanded_globs) + new_ds.facets.update(dict(expanded_globs)) yield new_ds @staticmethod From d725654c33ad0a94c5f5cc703a14d28af7647b15 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 10:31:49 +0100 Subject: [PATCH 77/85] Do not return any files for required variables if no facets match at all --- esmvalcore/dataset.py | 61 ++++++++++++++++++++++---------------- tests/unit/test_dataset.py | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 25 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 3413c6601f..44bff90bf3 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -451,35 +451,46 @@ def from_files(self) -> Iterator[Dataset]: Dataset Datasets representing the available files. 
""" - expanded = False - if any(_isglob(v) for v in self.facets.values()): - if _isglob(self.facets["mip"]): - available_mips = _get_mips( - self.facets["project"], # type: ignore - self.facets["short_name"], # type: ignore - ) - mips = [ - mip - for mip in available_mips - if _ismatch(mip, self.facets["mip"]) - ] - else: - mips = [self.facets["mip"]] # type: ignore + # No wildcards present -> simply return self with expanded + # supplementaries + if not any(_isglob(v) for v in self.facets.values()): + self._supplementaries_from_files() + yield self + return - for mip in mips: - dataset_template = self.copy(mip=mip) - for dataset in self._get_all_available_datasets( - dataset_template, - ): - dataset._supplementaries_from_files() # noqa: SLF001 - expanded = True - yield dataset + # Wildcards present -> expand them + expanded = False + if _isglob(self.facets["mip"]): + available_mips = _get_mips( + self.facets["project"], # type: ignore + self.facets["short_name"], # type: ignore + ) + mips = [ + mip + for mip in available_mips + if _ismatch(mip, self.facets["mip"]) + ] + else: + mips = [self.facets["mip"]] # type: ignore + for mip in mips: + dataset_template = self.copy(mip=mip) + for dataset in self._get_all_available_datasets( + dataset_template, + ): + dataset._supplementaries_from_files() # noqa: SLF001 + expanded = True + yield dataset + + # If files were found, or the file facets didn't match the + # specification, yield the original, but do expand any supplementary + # globs. For derived variables, make sure to purge any files found for + # required variables; those won't match in their facets. if not expanded: - # If the definition contains no wildcards, no files were found, - # or the file facets didn't match the specification, yield the - # original, but do expand any supplementary globs. 
self._supplementaries_from_files() + if self._derivation_necessary(): + for required_dataset in self.required_datasets: + required_dataset.files = [] yield self def _supplementaries_from_files(self) -> None: diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index f27307e7d0..ee46759a73 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1614,6 +1614,63 @@ def test_from_files_with_derived_glob( assert required_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_glob_differing_timerange( + rlut_file_future, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + expected.session = session + assert datasets == [expected] + assert datasets[0].files == [] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [] + + def test_from_files_with_derived_no_force_derivation( lwcre_file, rlut_file, From d5234a750160c7cf1a3e6c06b9bdba4b6e78b4bd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 14:35:11 +0100 Subject: [PATCH 78/85] Add supplementaries to required datasets --- esmvalcore/dataset.py | 2 + tests/unit/test_dataset.py | 80 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 44bff90bf3..951565c817 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -205,6 +205,8 @@ def _get_required_datasets(self) -> list[Dataset]: for required_facets in required_vars_facets: required_dataset = self._copy(derive=False, force_derivation=False) + for supplementary in self.supplementaries: + required_dataset.supplementaries.append(supplementary.copy()) keep = {"alias", "recipe_dataset_index", *self.minimal_facets} required_dataset.facets = { k: v for k, v in required_dataset.facets.items() if k in keep diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index ee46759a73..ee9f125782 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1487,6 +1487,19 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + 
] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1606,6 +1619,19 @@ def test_from_files_with_derived_glob( ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1871,6 +1897,20 @@ def test_from_files_with_derived_force_derivation( ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + force_derivation=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1942,6 +1982,20 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + force_derivation=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2001,6 +2055,19 @@ def test_from_files_with_derived_only_optional(siconca_file, pr_file, session): ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -3140,6 +3207,19 @@ def test_required_datasets_derivation(session): ), ] for expected_dataset in expected_datasets: + expected_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_dataset.session = dataset.session assert dataset.required_datasets == expected_datasets From 2226ebd65d5ba411cf86e2cf63568f13cb13be7a Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 17:55:10 +0100 Subject: [PATCH 79/85] Add test cases for derived variables with optional variable --- tests/unit/test_dataset.py | 297 +++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index ee9f125782..352ad46e4d 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -16,6 +16,7 @@ from esmvalcore.dataset import Dataset from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.io.esgf import ESGFFile +from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase if TYPE_CHECKING: from esmvalcore.typing import Facets @@ -1643,6 +1644,7 @@ def test_from_files_with_derived_glob( def test_from_files_with_derived_glob_differing_timerange( rlut_file_future, rlutcs_file, + pr_file, session, ): 
"""Test `from_files` with derived variable and supplementary.""" @@ -1651,6 +1653,7 @@ def test_from_files_with_derived_glob_differing_timerange( short_name="lwcre", derive=True, ) + dataset.add_supplementary(short_name="pr") dataset.session = session datasets = list(dataset.from_files()) @@ -1660,6 +1663,7 @@ def test_from_files_with_derived_glob_differing_timerange( short_name="lwcre", derive=True, ) + expected.add_supplementary(short_name="pr", timerange="1980/2000") expected.session = session assert datasets == [expected] assert datasets[0].files == [] @@ -1689,6 +1693,19 @@ def test_from_files_with_derived_glob_differing_timerange( ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2004,6 +2021,286 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert required_datasets[1].files == [rlutcs_file] +class DerivedVariable(DerivedVariableBase): + """Derivation of dummy variable.""" + + @staticmethod + def required(project): + """Declare the variables needed for derivation.""" + return [ + {"short_name": "rlut", "optional": True}, + {"short_name": "rlutcs"}, + {"short_name": "pr"}, + ] + + +def test_from_files_with_derived_optional( + monkeypatch, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + monkeypatch.setattr( + esmvalcore.preprocessor._derive, + "ALL_DERIVED_VARIABLES", + {"tas": DerivedVariable}, + ) + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + derive=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + optional=True, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == 
expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [rlutcs_file] + assert required_datasets[2].files == [pr_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_glob_optional( + timerange, + monkeypatch, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + monkeypatch.setattr( + esmvalcore.preprocessor._derive, + "ALL_DERIVED_VARIABLES", + {"tas": DerivedVariable}, + ) + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="tas", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + derive=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + optional=True, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [rlutcs_file] + assert required_datasets[2].files == [pr_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_glob_optional_missing( + timerange, + monkeypatch, + rlut_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + monkeypatch.setattr( + esmvalcore.preprocessor._derive, + "ALL_DERIVED_VARIABLES", + {"tas": DerivedVariable}, + ) + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="tas", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="tas", + derive=True, + ) + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="rlut", + derive=False, + optional=True, + frequency="mon", + long_name="TOA Outgoing Longwave 
Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [] + assert required_datasets[2].files == [] + + def test_from_files_with_derived_only_optional(siconca_file, pr_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( From bb72b6ac049092286d8b91c6ecf193f69780f0e3 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Wed, 14 Jan 2026 22:03:30 +0100 Subject: [PATCH 80/85] Update esmvalcore/dataset.py Co-authored-by: Bouwe Andela --- esmvalcore/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 951565c817..0266850fcf 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -221,7 +221,7 @@ def _get_required_datasets(self) -> list[Dataset]: def required_datasets(self) -> list[Dataset]: """Get required datasets. - For non-derived variables (i.e., those with facet ``derive=False``), + For non-derived variables (i.e., those without a ``derive`` facet or with facet ``derive=False``), this will simply return the dataset itself in a list. For derived variables (i.e., those with facet ``derive=True``), this From 8dd2fdf5823672569359f8d555b0b9d9b533ed49 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 22:04:38 +0100 Subject: [PATCH 81/85] Fix indentation --- esmvalcore/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 0266850fcf..5555f77c70 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -221,8 +221,9 @@ def _get_required_datasets(self) -> list[Dataset]: def required_datasets(self) -> list[Dataset]: """Get required datasets. - For non-derived variables (i.e., those without a ``derive`` facet or with facet ``derive=False``), - this will simply return the dataset itself in a list. + For non-derived variables (i.e., those without a ``derive`` facet or + with facet ``derive=False``), this will simply return the dataset + itself in a list. 
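+
+        A hypothetical sketch of the non-derived case (facets shortened for
+        illustration):
+
+        >>> tas = Dataset(short_name="tas", mip="Amon", project="CMIP6")
+        >>> tas.required_datasets  # doctest: +SKIP
+        [Dataset: {'short_name': 'tas', 'mip': 'Amon', 'project': 'CMIP6'}]
+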
For derived variables (i.e., those with facet ``derive=True``), this
         will return the datasets required for derivation if derivation is

From 9b28c0f1a6591f6ce54c7dbad5c12842d55bbfc2 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Fri, 16 Jan 2026 22:14:54 +0100
Subject: [PATCH 82/85] First update of notebook

---
 notebooks/discovering-data.ipynb | 180 ++++++++++++++++++------------
 1 file changed, 107 insertions(+), 73 deletions(-)

diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb
index 581e8ca249..e43df8b34e 100644
--- a/notebooks/discovering-data.ipynb
+++ b/notebooks/discovering-data.ipynb
@@ -13,14 +13,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "f0ccfe7f-c535-4606-99ce-be24960aece1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ERROR 1: PROJ: proj_create_from_database: Open of /home/manuel/micromamba/envs/esm/share/proj failed\n"
+     ]
+    }
+   ],
    "source": [
     "from esmvalcore.config import CFG\n",
-    "from esmvalcore.dataset import Dataset\n",
-    "from esmvalcore.esgf import download"
+    "from esmvalcore.dataset import Dataset"
    ]
   },
   {
@@ -39,7 +46,32 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "CFG[\"search_esgf\"] = \"always\""
+    "CFG[\"search_data\"] = \"complete\"\n",
+    "CFG[\"projects\"].pop(\"CMIP6\", None)  # Clear existing CMIP6 configuration\n",
+    "CFG.nested_update(\n",
+    "    {\n",
+    "        \"projects\": {\n",
+    "            \"CMIP6\": {\n",
+    "                \"data\": {\n",
+    "                    \"intake-esgf\": {\n",
+    "                        \"type\": \"esmvalcore.io.intake_esgf.IntakeESGFDataSource\",\n",
+    "                        \"priority\": 2,\n",
+    "                        \"facets\": {\n",
+    "                            \"activity\": \"activity_drs\",\n",
+    "                            \"dataset\": \"source_id\",\n",
+    "                            \"ensemble\": \"member_id\",\n",
+    "                            \"exp\": \"experiment_id\",\n",
+    "                            \"institute\": \"institution_id\",\n",
+    "                            \"grid\": \"grid_label\",\n",
+    "                            \"mip\": \"table_id\",\n",
+    "                            \"project\": \"project\",\n",
+    "                            \"short_name\": \"variable_id\",\n",
+    "                        },\n",
+    "                    },\n",
+    "                },\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    ")"
    ]
   },
   {
@@ -89,7 +121,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Found 727 datasets, showing the first 10:\n"
+      "Found 906 datasets, showing the first 10:\n"
     ]
    },
    {
@@ -168,20 +200,20 @@
     " 'grid': 'gn',\n",
     " 'institute': 'AWI'},\n",
     " Dataset:\n",
-    " {'dataset': 'AWI-ESM-1-REcoM',\n",
+    " {'dataset': 'BCC-CSM2-MR',\n",
     " 'project': 'CMIP6',\n",
     " 'mip': 'Amon',\n",
     " 'short_name': 'tas',\n",
     " 'ensemble': 'r1i1p1f1',\n",
     " 'exp': 'historical',\n",
     " 'grid': 'gn',\n",
-    " 'institute': 'AWI'},\n",
+    " 'institute': 'BCC'},\n",
     " Dataset:\n",
     " {'dataset': 'BCC-CSM2-MR',\n",
     " 'project': 'CMIP6',\n",
     " 'mip': 'Amon',\n",
     " 'short_name': 'tas',\n",
-    " 'ensemble': 'r1i1p1f1',\n",
+    " 'ensemble': 'r2i1p1f1',\n",
     " 'exp': 'historical',\n",
     " 'grid': 'gn',\n",
     " 'institute': 'BCC'}]"
@@ -253,7 +285,7 @@
    {
     "data": {
      "text/plain": [
-      "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]"
+      "[IntakeESGFDataset(name='CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn')]"
      ]
     },
     "execution_count": 6,
@@ -270,7 +302,7 @@
    "id": "60d88a34-c886-4b9d-a9e9-a9d18fa97917",
    "metadata": {},
    "source": [
-    "A single file can be downloaded using its `download` method:"
+    "Load a single file as `iris.cube.CubeList`:"
    ]
   },
   {
@@ -280,37 +312,44 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       
"LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", + "/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:712: UserWarning: We could not download your entire catalog, missed={'CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn'}\n", + " warnings.warn(f\"We could not download your entire catalog, {missed=}\")\n" + ] + }, + { + "ename": "DatasetLoadError", + "evalue": "We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. 
The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mDatasetLoadError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m cubes = \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_iris\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:163\u001b[39m, in \u001b[36mIntakeESGFDataset.to_iris\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mto_iris\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> iris.cube.CubeList:\n\u001b[32m 156\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load the data as Iris cubes.\u001b[39;00m\n\u001b[32m 157\u001b[39m \n\u001b[32m 158\u001b[39m \u001b[33;03m Returns\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 161\u001b[39m \u001b[33;03m The loaded data.\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m163\u001b[39m files = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcatalog\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 164\u001b[39m \u001b[43m \u001b[49m\u001b[43mminimal_keys\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 165\u001b[39m \u001b[43m \u001b[49m\u001b[43mquiet\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 166\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 167\u001b[39m dataset = \u001b[38;5;28mself\u001b[39m.catalog.to_dataset_dict(\n\u001b[32m 168\u001b[39m minimal_keys=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 169\u001b[39m add_measures=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 170\u001b[39m quiet=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m 171\u001b[39m )[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 172\u001b[39m \u001b[38;5;66;03m# Store the local paths in the attributes for easier debugging.\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:96\u001b[39m, in \u001b[36m_CachingCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 94\u001b[39m key = \u001b[38;5;28mtuple\u001b[39m((k, v) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m kwargs.items() \u001b[38;5;28;01mif\u001b[39;00m k != \u001b[33m\"\u001b[39m\u001b[33mquiet\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result:\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28mself\u001b[39m._result[key] = 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result[key]\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:714\u001b[39m, in \u001b[36mESGFCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 712\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mWe could not download your entire catalog, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmissed\u001b[38;5;132;01m=}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 713\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mbreak_on_error\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m--> \u001b[39m\u001b[32m714\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetLoadError(\n\u001b[32m 715\u001b[39m \u001b[38;5;28mlist\u001b[39m(missed),\n\u001b[32m 716\u001b[39m \u001b[38;5;28mself\u001b[39m.logger.read()\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mprint_log_on_error\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 719\u001b[39m )\n\u001b[32m 721\u001b[39m \u001b[38;5;66;03m# optionally simplify the keys\u001b[39;00m\n\u001b[32m 722\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m minimal_keys:\n", + "\u001b[31mDatasetLoadError\u001b[39m: We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'." + ] } ], "source": [ - "dataset.files[0].download(CFG[\"download_dir\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3821b594-3797-497b-a51d-1798d5b2fc80", - "metadata": {}, - "source": [ - "For downloading many files, the [esmvalcore.esgf.download](https://docs.esmvaltool.org/projects/esmvalcore/en/latest/api/esmvalcore.esgf.html#esmvalcore.esgf.download) function is recommended because it will download the files in parallel. The ESMValCore will try to guess the fastest host and download from there. If it is not available for some reason, it will automatically fall back to the next host." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9676ff81-232e-4ff8-b784-686f0d06c469", - "metadata": {}, - "outputs": [], - "source": [ - "download(dataset.files, CFG[\"download_dir\"])" + "cubes = dataset.files[0].to_iris()" ] }, { @@ -323,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "b75314e3", "metadata": {}, "outputs": [], @@ -344,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "b87c247f", "metadata": {}, "outputs": [ @@ -352,14 +391,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 36 datasets, showing the first 10:\n" + "Found 38 datasets, showing the first 10:\n" ] }, { "data": { "text/plain": [ "[Dataset:\n", - " {'dataset': 'TaiESM1',\n", + " {'dataset': 'SAM0-UNICON',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -368,9 +407,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AS-RCEC'},\n", + " 'institute': 'SNU'},\n", " Dataset:\n", - " {'dataset': 'AWI-CM-1-1-MR',\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -381,7 +420,7 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'AWI-ESM-1-1-LR',\n", + " {'dataset': 'CMCC-CM2-HR4',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -390,9 +429,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AWI'},\n", + " 'institute': 'CMCC'},\n", " Dataset:\n", - " {'dataset': 'AWI-ESM-1-REcoM',\n", + " {'dataset': 'CESM2-WACCM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -401,9 +440,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AWI'},\n", + " 'institute': 'NCAR'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'BCC-ESM1',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -414,7 +453,7 @@ " 'grid': 'gn',\n", " 'institute': 'BCC'},\n", " Dataset:\n", - " {'dataset': 'BCC-ESM1',\n", + " {'dataset': 'GISS-E2-1-H',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -423,9 +462,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'NASA-GISS'},\n", " Dataset:\n", - " {'dataset': 'CAMS-CSM1-0',\n", + " {'dataset': 'MRI-ESM2-0',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -434,9 +473,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAMS'},\n", + " 'institute': 'MRI'},\n", " Dataset:\n", - " {'dataset': 'CAS-ESM2-0',\n", + " {'dataset': 'TaiESM1',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -445,9 +484,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAS'},\n", + " 'institute': 'AS-RCEC'},\n", " Dataset:\n", - " {'dataset': 'FGOALS-g3',\n", + " {'dataset': 'CAMS-CSM1-0',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -456,9 +495,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAS'},\n", + " 'institute': 'CAMS'},\n", " Dataset:\n", - " {'dataset': 'IITM-ESM',\n", + " {'dataset': 'MPI-ESM-1-2-HAM',\n", " 'project': 'CMIP6',\n", " 
'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -467,10 +506,10 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CCCR-IITM'}]" + " 'institute': 'HAMMOZ-Consortium'}]" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -499,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "c5edfa65", "metadata": {}, "outputs": [ @@ -509,7 +548,7 @@ "[]" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -530,14 +569,14 @@ "output_type": "stream", "text": [ "rlut\n", - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), 
LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlut.gn')]\n", "rlutcs\n", - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" ] } ], "source": [ - "for d in dataset.input_datasets:\n", + "for d in dataset.required_datasets:\n", " print(d[\"short_name\"])\n", " print(d.files)" ] @@ -545,7 +584,7 @@ ], "metadata": { "kernelspec": { - 
"display_name": "esm", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -559,12 +598,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "17e81e49408864327be43d3caebcb8eca32ff92a01becb15aa27be73c37f0517" - } + "version": "3.13.11" } }, "nbformat": 4, From 6d8ba227d205fdcfd0538e1c93e973593ab490d4 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 19 Jan 2026 10:46:25 +0100 Subject: [PATCH 83/85] Update example notebook --- notebooks/discovering-data.ipynb | 498 ++++++++++++++++++++++++++----- 1 file changed, 430 insertions(+), 68 deletions(-) diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index e43df8b34e..10676a8b72 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,18 +13,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR 1: PROJ: proj_create_from_database: Open of /home/manuel/micromamba/envs/esm/share/proj failed\n" - ] - } - ], + "outputs": [], "source": [ "from esmvalcore.config import CFG\n", "from esmvalcore.dataset import Dataset" @@ -47,6 +39,7 @@ "outputs": [], "source": [ "CFG[\"search_data\"] = \"complete\"\n", + "CFG[\"projects\"].pop(\"CMIP6\", None) # Clear existing CMIP6 configuration\n", "CFG.nested_update(\n", " {\n", " \"projects\": {\n", @@ -312,44 +305,413 @@ "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Exception ignored in: \n", - "Traceback (most recent call last):\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", - " self.close()\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", - " self.disp(bar_style='danger', check_delay=False)\n", - "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", - "Exception ignored in: \n", - "Traceback (most recent call last):\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", - " self.close()\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", - " self.disp(bar_style='danger', check_delay=False)\n", - "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", - "/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:712: UserWarning: We could not download your entire catalog, missed={'CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn'}\n", - " warnings.warn(f\"We could not download your entire catalog, {missed=}\")\n" - ] - }, - { - "ename": "DatasetLoadError", - "evalue": "We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. 
The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'.", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mDatasetLoadError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m cubes = \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_iris\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:163\u001b[39m, in \u001b[36mIntakeESGFDataset.to_iris\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mto_iris\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> iris.cube.CubeList:\n\u001b[32m 156\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load the data as Iris cubes.\u001b[39;00m\n\u001b[32m 157\u001b[39m \n\u001b[32m 158\u001b[39m \u001b[33;03m Returns\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 161\u001b[39m \u001b[33;03m The loaded data.\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m163\u001b[39m files = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcatalog\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 164\u001b[39m \u001b[43m \u001b[49m\u001b[43mminimal_keys\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 165\u001b[39m \u001b[43m \u001b[49m\u001b[43mquiet\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 166\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 167\u001b[39m dataset = \u001b[38;5;28mself\u001b[39m.catalog.to_dataset_dict(\n\u001b[32m 168\u001b[39m minimal_keys=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 169\u001b[39m add_measures=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 170\u001b[39m quiet=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m 171\u001b[39m )[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 172\u001b[39m \u001b[38;5;66;03m# Store the local paths in the attributes for easier debugging.\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:96\u001b[39m, in \u001b[36m_CachingCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 94\u001b[39m key = \u001b[38;5;28mtuple\u001b[39m((k, v) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m kwargs.items() \u001b[38;5;28;01mif\u001b[39;00m k != \u001b[33m\"\u001b[39m\u001b[33mquiet\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result:\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28mself\u001b[39m._result[key] = 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result[key]\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:714\u001b[39m, in \u001b[36mESGFCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 712\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mWe could not download your entire catalog, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmissed\u001b[38;5;132;01m=}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 713\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mbreak_on_error\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m--> \u001b[39m\u001b[32m714\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetLoadError(\n\u001b[32m 715\u001b[39m \u001b[38;5;28mlist\u001b[39m(missed),\n\u001b[32m 716\u001b[39m \u001b[38;5;28mself\u001b[39m.logger.read()\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mprint_log_on_error\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 719\u001b[39m )\n\u001b[32m 721\u001b[39m \u001b[38;5;66;03m# optionally simplify the keys\u001b[39;00m\n\u001b[32m 722\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m minimal_keys:\n", - "\u001b[31mDatasetLoadError\u001b[39m: We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'." - ] + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n",
+       "<p><em>[iris HTML summary of the loaded cube: Air Temperature / K, shape (time: 1980; latitude: 192; longitude: 288), scalar coordinate height 2.0 m, cell method 'area: time: mean'; the full CMIP6 attribute table is omitted here]</em></p>\n",
+       "
\n", + " \n", + " " + ], + "text/plain": [ + "[]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "cubes = dataset.files[0].to_iris()" + "cubes = dataset.files[0].to_iris()\n", + "cubes" ] }, { @@ -391,14 +753,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 38 datasets, showing the first 10:\n" + "Found 37 datasets, showing the first 10:\n" ] }, { "data": { "text/plain": [ "[Dataset:\n", - " {'dataset': 'SAM0-UNICON',\n", + " {'dataset': 'GISS-E2-2-G',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -407,9 +769,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'SNU'},\n", + " 'institute': 'NASA-GISS'},\n", " Dataset:\n", - " {'dataset': 'AWI-ESM-1-1-LR',\n", + " {'dataset': 'FGOALS-g3',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -418,9 +780,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AWI'},\n", + " 'institute': 'CAS'},\n", " Dataset:\n", - " {'dataset': 'CMCC-CM2-HR4',\n", + " {'dataset': 'CESM2-WACCM-FV2',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -429,9 +791,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CMCC'},\n", + " 'institute': 'NCAR'},\n", " Dataset:\n", - " {'dataset': 'CESM2-WACCM',\n", + " {'dataset': 'GISS-E2-1-H',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -440,9 +802,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'NCAR'},\n", + " 'institute': 'NASA-GISS'},\n", " Dataset:\n", - " {'dataset': 'BCC-ESM1',\n", + " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -453,7 +815,7 @@ " 'grid': 'gn',\n", " 'institute': 'BCC'},\n", " Dataset:\n", - " {'dataset': 'GISS-E2-1-H',\n", + " {'dataset': 'CAS-ESM2-0',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -462,9 +824,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'NASA-GISS'},\n", + " 'institute': 'CAS'},\n", " Dataset:\n", - " {'dataset': 'MRI-ESM2-0',\n", + " {'dataset': 'MPI-ESM-1-2-HAM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -473,9 +835,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'MRI'},\n", + " 'institute': 'HAMMOZ-Consortium'},\n", " Dataset:\n", - " {'dataset': 'TaiESM1',\n", + " {'dataset': 'CESM2-FV2',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -484,9 +846,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AS-RCEC'},\n", + " 'institute': 'NCAR'},\n", " Dataset:\n", - " {'dataset': 'CAMS-CSM1-0',\n", + " {'dataset': 'BCC-ESM1',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -495,9 +857,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAMS'},\n", + " 'institute': 'BCC'},\n", " Dataset:\n", - " {'dataset': 'MPI-ESM-1-2-HAM',\n", + " {'dataset': 'ICON-ESM-LR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -506,7 +868,7 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 
'HAMMOZ-Consortium'}]" + " 'institute': 'MPI-M'}]" ] }, "execution_count": 9, @@ -560,7 +922,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "97cdf12d", "metadata": {}, "outputs": [ @@ -569,9 +931,9 @@ "output_type": "stream", "text": [ "rlut\n", - "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlut.gn')]\n", + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlut.gn')]\n", "rlutcs\n", - "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), 
LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" ] } ], From d3adbce48e7fafdc5b983a4e78f7a8a0a3cf416e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 19 Jan 2026 19:04:22 +0100 Subject: [PATCH 84/85] Required datasets don't need supplementaries --- esmvalcore/dataset.py | 2 - tests/unit/test_dataset.py | 119 ------------------------------------- 2 files changed, 121 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index d9c781b794..853e86bdaa 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -212,8 +212,6 @@ def _get_required_datasets(self) -> list[Dataset]: for required_facets in required_vars_facets: required_dataset = self._copy(derive=False, force_derivation=False) - for supplementary in self.supplementaries: - required_dataset.supplementaries.append(supplementary.copy()) keep = {"alias", "recipe_dataset_index", *self.minimal_facets} required_dataset.facets = { k: v for k, v in required_dataset.facets.items() if k 
in keep diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index e605124de7..f70fd551f0 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1622,19 +1622,6 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1754,19 +1741,6 @@ def test_from_files_with_derived_glob( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1827,19 +1801,6 @@ def test_from_files_with_derived_glob_differing_timerange( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2048,20 +2009,6 @@ def test_from_files_with_derived_force_derivation( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - force_derivation=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2133,20 +2080,6 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - force_derivation=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2239,19 +2172,6 @@ def test_from_files_with_derived_optional( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2334,19 +2254,6 @@ def test_from_files_with_derived_glob_optional( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = 
session

     required_datasets = datasets[0].required_datasets
@@ -2486,19 +2393,6 @@ def test_from_files_with_derived_only_optional(siconca_file, pr_file, session):
         ),
     ]
     for expected_ds in expected_required_datasets:
-        expected_ds.supplementaries = [
-            Dataset(
-                **OBS6_SAT_FACETS,
-                short_name="pr",
-                derive=False,
-                frequency="mon",
-                long_name="Precipitation",
-                modeling_realm=["atmos"],
-                original_short_name="pr",
-                standard_name="precipitation_flux",
-                units="kg m-2 s-1",
-            ),
-        ]
         expected_ds.session = session
 
     required_datasets = datasets[0].required_datasets
@@ -3658,19 +3552,6 @@ def test_required_datasets_derivation(session):
         ),
     ]
     for expected_dataset in expected_datasets:
-        expected_dataset.supplementaries = [
-            Dataset(
-                **OBS6_SAT_FACETS,
-                short_name="pr",
-                derive=False,
-                frequency="mon",
-                long_name="Precipitation",
-                modeling_realm=["atmos"],
-                original_short_name="pr",
-                standard_name="precipitation_flux",
-                units="kg m-2 s-1",
-            ),
-        ]
         expected_dataset.session = dataset.session
 
     assert dataset.required_datasets == expected_datasets

From 8c055a5f4372b444e7fade80d20dce0ba153d405 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 20 Jan 2026 10:07:20 +0100
Subject: [PATCH 85/85] Make _derivation_necessary faster by avoiding extra
 calls to dataset.files

---
 esmvalcore/dataset.py | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 853e86bdaa..6d826b7ca2 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -182,25 +182,9 @@ def _is_force_derived(self) -> bool:
 
     def _derivation_necessary(self) -> bool:
         """Return ``True`` if derivation is necessary, ``False`` otherwise."""
-        # If variable cannot be derived, derivation is not necessary
-        if not self._is_derived():
-            return False
-
-        # If forced derivation is requested, derivation is necessary
-        if self._is_force_derived():
-            return True
-
-        # Otherwise, derivation is necessary if no files for the self dataset
-        # are found
-        ds_copy = self.copy()
-        ds_copy.supplementaries = []
-
-        # Avoid potential errors from missing data during timerange glob
-        # expansion
-        if _isglob(ds_copy.facets.get("timerange", "")):
-            ds_copy.facets.pop("timerange", None)
-
-        return not ds_copy.files
+        return not (
+            self.required_datasets and self.required_datasets[0] is self
+        )
 
     def _get_required_datasets(self) -> list[Dataset]:
         """Get required datasets for derivation."""
@@ -242,7 +226,29 @@ def required_datasets(self) -> list[Dataset]:
         if self._required_datasets is not None:
             return self._required_datasets
 
-        if not self._derivation_necessary():
+        def _derivation_needed(dataset: Dataset) -> bool:
+            """Check if derivation is needed."""
+            # If variable cannot be derived, derivation is not necessary
+            if not dataset._is_derived():
+                return False
+
+            # If forced derivation is requested, derivation is necessary
+            if dataset._is_force_derived():
+                return True
+
+            # Otherwise, derivation is necessary if no files are found for
+            # the given dataset
+            ds_copy = dataset.copy()
+            ds_copy.supplementaries = []
+
+            # Avoid potential errors from missing data during timerange glob
+            # expansion
+            if _isglob(ds_copy.facets.get("timerange", "")):
+                ds_copy.facets.pop("timerange", None)
+
+            return not ds_copy.files
+
+        if not _derivation_needed(self):
            self._required_datasets = [self]
        else:
            self._required_datasets = self._get_required_datasets()
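
Editor's note: to make the optimization in PATCH 85/85 easier to follow, here is a minimal, self-contained sketch of the pattern it introduces — run the expensive file lookup at most once, inside the cached required_datasets property, and answer "is derivation necessary?" from that cache via an identity check. This is an illustration only, not ESMValCore's actual API: the ToyDataset class and the _find_files helper are invented stand-ins for Dataset and the Dataset.files glob/ESGF search.

from __future__ import annotations


class ToyDataset:
    def __init__(self, short_name: str, derivable: bool, has_files: bool):
        self.short_name = short_name
        self._derivable = derivable
        self._has_files = has_files
        self._required_datasets: list[ToyDataset] | None = None

    def _find_files(self) -> list[str]:
        # Stand-in for the expensive file lookup (local glob / ESGF search).
        return ["file.nc"] if self._has_files else []

    @property
    def required_datasets(self) -> list[ToyDataset]:
        # The lookup runs at most once; the result is cached.
        if self._required_datasets is None:
            if self._derivable and not self._find_files():
                # Derivation needed: required datasets are the input variables.
                self._required_datasets = [
                    ToyDataset("rlut", derivable=False, has_files=True),
                    ToyDataset("rlutcs", derivable=False, has_files=True),
                ]
            else:
                # No derivation needed: the dataset itself is required.
                self._required_datasets = [self]
        return self._required_datasets

    def derivation_necessary(self) -> bool:
        # Answered from the cache: derivation is necessary exactly when the
        # first required dataset is not the dataset itself.
        rds = self.required_datasets
        return not (rds and rds[0] is self)


lwcre = ToyDataset("lwcre", derivable=True, has_files=False)
assert lwcre.derivation_necessary()  # triggers the single file lookup
assert lwcre.derivation_necessary()  # cache hit, no further lookup
print([ds.short_name for ds in lwcre.required_datasets])  # ['rlut', 'rlutcs']

The design point mirrored from the patch: before, _derivation_necessary and required_datasets each triggered their own files lookup; after, the check is derived from the already-cached required_datasets list, so repeated calls cost nothing.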