From 4b989d3305a1634d17b5b7cba005e4e30cffd53c Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:37:57 +0200 Subject: [PATCH 01/85] Remove all new features, just keep no-op changes --- esmvalcore/_recipe/check.py | 86 ++++++------------- esmvalcore/_recipe/recipe.py | 4 +- esmvalcore/_recipe/to_datasets.py | 2 +- esmvalcore/dataset.py | 43 ++++++---- esmvalcore/local.py | 14 +-- esmvalcore/preprocessor/__init__.py | 2 + esmvalcore/preprocessor/_derive/__init__.py | 55 +++++++----- esmvalcore/preprocessor/_derive/_baseclass.py | 35 ++++---- esmvalcore/preprocessor/_derive/qep.py | 4 +- esmvalcore/typing.py | 2 +- 10 files changed, 115 insertions(+), 132 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index d937cc9432..50e41d7d21 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -33,6 +33,9 @@ if TYPE_CHECKING: from collections.abc import Iterable + from esmvalcore.dataset import Dataset + from esmvalcore.typing import FacetValue + logger = logging.getLogger(__name__) @@ -43,9 +46,7 @@ def ncl_version(): msg = ( "Recipe contains NCL scripts, but cannot find an NCL installation." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) try: cmd = [ncl, "-V"] version = subprocess.check_output(cmd, universal_newlines=True) @@ -55,9 +56,7 @@ def ncl_version(): "Recipe contains NCL scripts, but your NCL " "installation appears to be broken." ) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc version = version.strip() logger.info("Found NCL version %s", version) @@ -68,9 +67,7 @@ def ncl_version(): "NCL version 6.4 or higher is required to run " "a recipe containing NCL scripts." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def recipe_with_schema(filename): @@ -90,9 +87,7 @@ def diagnostics(diags): for name, diagnostic in diags.items(): if "scripts" not in diagnostic: msg = f"Missing scripts section in diagnostic '{name}'." - raise RecipeError( - msg, - ) + raise RecipeError(msg) variable_names = tuple(diagnostic.get("variables", {})) scripts = diagnostic.get("scripts") if scripts is None: @@ -104,17 +99,13 @@ def diagnostics(diags): f"in diagnostic '{name}': scripts cannot have the " "same name as variables." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) if not script.get("script"): msg = ( f"No script defined for script '{script_name}' in " f"diagnostic '{name}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def duplicate_datasets( @@ -129,9 +120,7 @@ def duplicate_datasets( f"groups for variable '{variable_group}' in diagnostic " f"'{diagnostic}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) checked_datasets_ = [] for dataset in datasets: if dataset in checked_datasets_: @@ -139,9 +128,7 @@ def duplicate_datasets( f"Duplicate dataset\n{pformat(dataset)}\nfor variable " f"'{variable_group}' in diagnostic '{diagnostic}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) checked_datasets_.append(dataset) @@ -260,9 +247,7 @@ def preprocessor_supplementaries(dataset, settings): f"one supplementary variable of {ancs['variables']} is " f"defined in the recipe for {dataset}." 
) - raise RecipeError( - msg, - ) + raise RecipeError(msg) if ancs["required"] == "prefer_at_least_one": logger.warning( "Preprocessor function %s works best when at least " @@ -298,9 +283,7 @@ def check_for_temporal_preprocs(profile): f"Time coordinate preprocessor step(s) {temp_preprocs} not permitted on fx " "vars, please remove them from recipe" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def extract_shape(settings): @@ -311,9 +294,7 @@ def extract_shape(settings): "In preprocessor function `extract_shape`: " f"Unable to find 'shapefile: {shapefile}'" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) valid = { "method": {"contains", "representative"}, @@ -323,11 +304,12 @@ def extract_shape(settings): for key in valid: value = settings.get(key) if not (value is None or value in valid[key]): - raise RecipeError( + msg = ( f"In preprocessor function `extract_shape`: Invalid value " f"'{value}' for argument '{key}', choose from " "{}".format(", ".join(f"'{k}'".lower() for k in valid[key])), ) + raise RecipeError(msg) def _verify_span_value(span): @@ -352,9 +334,7 @@ def _verify_groupby(groupby): "`multi_model_statistics`.`groupby` must be defined as a " f"list. Got {groupby}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _verify_keep_input_datasets(keep_input_datasets): @@ -364,9 +344,7 @@ def _verify_keep_input_datasets(keep_input_datasets): f"Must be defined as a boolean (true or false). " f"Got {keep_input_datasets}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _verify_ignore_scalar_coords(ignore_scalar_coords): @@ -376,9 +354,7 @@ def _verify_ignore_scalar_coords(ignore_scalar_coords): f"Must be defined as a boolean (true or false). Got " f"{ignore_scalar_coords}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def multimodel_statistics_preproc(settings): @@ -415,9 +391,7 @@ def _check_delimiter(timerange): "Valid values must be separated by `/`. " f"Got {timerange} instead." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _check_duration_periods(timerange): @@ -428,9 +402,7 @@ def _check_duration_periods(timerange): "Cannot set both the beginning and the end " "as duration periods." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) if timerange[0].startswith("P"): try: @@ -523,9 +495,7 @@ def _check_literal( f"Expected one of {allowed_values} for option `{option}` of " f"preprocessor `{step}`, got '{user_value}'" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) bias_type = partial( @@ -595,9 +565,7 @@ def _check_ref_attributes(products: set, *, step: str, attr_name: str) -> None: f"ensure that the reference dataset is not excluded with the " f"'exclude' option" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) reference_for_bias_preproc = partial( @@ -708,9 +676,7 @@ def regridding_schemes(settings: dict): f"(see https://docs.esmvaltool.org/projects/ESMValCore/en/" f"latest/recipe/preprocessor.html#generic-regridding-schemes)." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) # Check generic regridding schemes (given as dict) if isinstance(scheme, dict): @@ -723,6 +689,4 @@ def regridding_schemes(settings: dict): f"/recipe/preprocessor.html#generic-regridding-schemes for " f"details." 
) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index cb1e1e8275..42b63962ed 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -1005,9 +1005,7 @@ def _resolve_diagnostic_ancestors(self, tasks): "Could not find any ancestors matching " f"'{id_glob}'." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) logger.debug( "Pattern %s matches %s", id_glob, diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index d2c207224a..08ef4680db 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -216,7 +216,7 @@ def _get_supplementary_short_names( var_facets = dict(facets) _update_cmor_facets(var_facets) realms = var_facets.get("modeling_realm", []) - if isinstance(realms, (str, Number)): + if isinstance(realms, (str, Number, bool)): realms = [str(realms)] ocean_realms = {"ocean", "seaIce", "ocnBgchem"} is_ocean_variable = any(realm in ocean_realms for realm in realms) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 050a9e7326..143f59c1a9 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -410,6 +410,16 @@ def _fix_fx_exp(self) -> None: ) break + def _copy(self, **facets: FacetValue) -> Dataset: + """Create a copy of the parent dataset without supplementaries.""" + new = self.__class__() + new._session = self._session # noqa: SLF001 + for key, value in self.facets.items(): + new.set_facet(key, deepcopy(value), key in self._persist) + for key, value in facets.items(): + new.set_facet(key, deepcopy(value)) + return new + def copy(self, **facets: FacetValue) -> Dataset: """Create a copy. @@ -425,12 +435,7 @@ def copy(self, **facets: FacetValue) -> Dataset: Dataset A copy of the dataset. """ - new = self.__class__() - new._session = self._session # noqa: SLF001 - for key, value in self.facets.items(): - new.set_facet(key, deepcopy(value), key in self._persist) - for key, value in facets.items(): - new.set_facet(key, deepcopy(value)) + new = self._copy(**facets) for supplementary in self.supplementaries: # The short_name and mip of the supplementary variable are probably # different from the main variable, so don't copy those facets. @@ -440,6 +445,7 @@ def copy(self, **facets: FacetValue) -> Dataset: } new_supplementary = supplementary.copy(**supplementary_facets) new.supplementaries.append(new_supplementary) + return new def __eq__(self, other) -> bool: @@ -477,8 +483,8 @@ def facets2str(facets): if self.supplementaries: txt.append("supplementaries:") txt.extend( - textwrap.indent(facets2str(a.facets), " ") - for a in self.supplementaries + textwrap.indent(facets2str(s.facets), " ") + for s in self.supplementaries ) if self._session: txt.append(f"session: '{self.session.session_name}'") @@ -532,10 +538,11 @@ def supplementary_summary(dataset): txt += ( ", supplementaries: " + "; ".join( - supplementary_summary(a) for a in self.supplementaries + supplementary_summary(s) for s in self.supplementaries ) + "" ) + return txt def __getitem__(self, key): @@ -544,7 +551,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): """Set a facet value.""" - self.facets[key] = value + self.set_facet(key, value, persist=False) def set_facet(self, key: str, value: FacetValue, persist: bool = True): """Set facet. @@ -609,15 +616,19 @@ def add_supplementary(self, **facets: FacetValue) -> None: **facets Facets describing the supplementary variable. 
""" + if self.is_derived(): + facets.setdefault("derive", False) + if self.facets.get("force_derivation", False): + facets.setdefault("force_derivation", False) supplementary = self.copy(**facets) supplementary.supplementaries = [] self.supplementaries.append(supplementary) def augment_facets(self) -> None: - """Add extra facets. + """Add additional facets. - This function will update the dataset with additional facets - from various sources. + This function will update the dataset with additional facets from + various sources. """ self._augment_facets() for supplementary in self.supplementaries: @@ -749,7 +760,7 @@ def _find_files(self) -> None: self.files[idx] = file @property - def files(self) -> Sequence[File]: + def files(self) -> list[File]: """The files associated with this dataset.""" if self._files is None: self.find_files() @@ -949,9 +960,7 @@ def _update_timerange(self): timerange = self.facets["timerange"] if not isinstance(timerange, str): msg = f"timerange should be a string, got '{timerange!r}'" - raise TypeError( - msg, - ) + raise TypeError(msg) check.valid_time_selection(timerange) if "*" in timerange: diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 81d2386188..baeeac2757 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -239,15 +239,16 @@ def _replace_years_with_timerange(variable): variable.pop("end_year", None) -def _parse_period(timerange): +def _parse_period(timerange: FacetValue) -> tuple[str, str]: """Parse `timerange` values given as duration periods. Sum the duration periods to the `timerange` value given as a reference point in order to compute the start and end dates needed for file selection. """ - start_date = None - end_date = None + timerange = str(timerange) + start_date: str | None = None + end_date: str | None = None time_format = None datetime_format = ( isodate.DATE_BAS_COMPLETE + "T" + isodate.TIME_BAS_COMPLETE @@ -284,8 +285,9 @@ def _parse_period(timerange): ) end_date = str(isodate.date_isoformat(end_date, format=time_format)) - if start_date is None and end_date is None: + if start_date is None: start_date = timerange.split("/")[0] + if end_date is None: end_date = timerange.split("/")[1] return start_date, end_date @@ -379,9 +381,7 @@ def _replace_tags( f"Dataset key '{tag}' must be specified for {variable}, check " f"your recipe entry and/or extra facet file(s)" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) pathset = _replace_tag(pathset, original_tag, replacewith) return [Path(p) for p in pathset] diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index da5eea2f78..9c80676f7a 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -99,6 +99,8 @@ from dask.delayed import Delayed + from esmvalcore.dataset import Dataset + logger = logging.getLogger(__name__) __all__ = [ diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index f473956056..cbd181d2c8 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -5,14 +5,17 @@ from copy import deepcopy from pathlib import Path -import iris +from cf_units import Unit +from iris.cube import Cube, CubeList +from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase from esmvalcore.preprocessor._units import convert_units +from esmvalcore.typing import Facets, FacetValue logger = logging.getLogger(__name__) -def _get_all_derived_variables(): +def _get_all_derived_variables() -> 
dict[str, type[DerivedVariableBase]]: """Get all possible derived variables. Returns @@ -31,55 +34,63 @@ def _get_all_derived_variables(): return derivers -ALL_DERIVED_VARIABLES = _get_all_derived_variables() +ALL_DERIVED_VARIABLES: dict[str, type[DerivedVariableBase]] = ( + _get_all_derived_variables() +) __all__ = list(ALL_DERIVED_VARIABLES) -def get_required(short_name, project): +def get_required(short_name: FacetValue, project: FacetValue) -> list[Facets]: """Return all required variables for derivation. - Get all information (at least `short_name`) required for derivation. + Get all information (at least ``short_name``) required for derivation. Parameters ---------- - short_name : str - `short_name` of the variable to derive. - project : str - `project` of the variable to derive. + short_name: + Short name of the variable to derive. + project: + Project of the variable to derive. Returns ------- - list - List of dictionaries (including at least the key `short_name`). + list[esmvalcore.typing.Facets] + List of facets (including at least the key ``short_name``). + """ + short_name = str(short_name) if short_name.lower() not in ALL_DERIVED_VARIABLES: msg = ( - f"Cannot derive variable '{short_name}', no derivation script " + f"Cannot derive variable '{short_name}': no derivation script " f"available" ) - raise NotImplementedError( - msg, - ) + raise NotImplementedError(msg) DerivedVariable = ALL_DERIVED_VARIABLES[short_name.lower()] # noqa: N806 return deepcopy(DerivedVariable().required(project)) -def derive(cubes, short_name, long_name, units, standard_name=None): +def derive( + cubes: CubeList, + short_name: str, + long_name: str, + units: str | Unit, + standard_name: str | None = None, +) -> Cube: """Derive variable. Parameters ---------- - cubes: iris.cube.CubeList + cubes: Includes all the needed variables for derivation defined in :func:`get_required`. - short_name: str + short_name: short_name - long_name: str + long_name: long_name - units: str + units: units - standard_name: str, optional + standard_name: standard_name Returns @@ -90,7 +101,7 @@ def derive(cubes, short_name, long_name, units, standard_name=None): if short_name == cubes[0].var_name: return cubes[0] - cubes = iris.cube.CubeList(cubes) + cubes = CubeList(cubes) # Derive variable DerivedVariable = ALL_DERIVED_VARIABLES[short_name.lower()] # noqa: N806 diff --git a/esmvalcore/preprocessor/_derive/_baseclass.py b/esmvalcore/preprocessor/_derive/_baseclass.py index 2d818f1ca3..4e71f66dd6 100644 --- a/esmvalcore/preprocessor/_derive/_baseclass.py +++ b/esmvalcore/preprocessor/_derive/_baseclass.py @@ -2,13 +2,17 @@ from abc import abstractmethod +from iris.cube import Cube, CubeList + +from esmvalcore.typing import Facets, FacetValue + class DerivedVariableBase: """Base class for derived variables.""" @staticmethod @abstractmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Return required variables for derivation. This method needs to be overridden in the child class belonging to the @@ -16,27 +20,27 @@ def required(project): Note ---- - It is possible to declare a required variable as `optional=True`, which - allows the skipping of this particular variable during data extraction. - For example, this is useful for fx variables which are often not - available for observational datasets. Otherwise, the tool will fail if - not all required variables are available for all datasets. 
+ It is possible to declare a required variable as ``optional=True``, + which allows the skipping of this particular variable during data + extraction. For example, this is useful for fx variables which are + often not available for observational datasets. Otherwise, the tool + will fail if not all required variables are available for all datasets. Parameters ---------- - project : str + project: Project of the dataset for which the desired variable is derived. Returns ------- - list of dict - List of variable metadata. + list[esmvalcore.typing.Facets] + List of facets. """ @staticmethod @abstractmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """Compute desired derived variable. This method needs to be overridden in the child class belonging to the @@ -44,20 +48,13 @@ def calculate(cubes): Parameters ---------- - cubes : iris.cube.CubeList + cubes: Includes all the needed variables (incl. fx variables) for - derivation defined in the static class variable - `_required_variables`. + derivation defined in ``required``. Returns ------- iris.cube.Cube New derived variable. - Raises - ------ - NotImplementedError - If the desired variable derivation is not implemented, i.e. if this - method is called from this base class and not a child class. - """ diff --git a/esmvalcore/preprocessor/_derive/qep.py b/esmvalcore/preprocessor/_derive/qep.py index 3626b5abdf..19d677f618 100644 --- a/esmvalcore/preprocessor/_derive/qep.py +++ b/esmvalcore/preprocessor/_derive/qep.py @@ -3,6 +3,8 @@ from iris import Constraint from iris.cube import Cube, CubeList +from esmvalcore.typing import Facets, FacetValue + from ._baseclass import DerivedVariableBase @@ -10,7 +12,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `qep`.""" @staticmethod - def required(project: str) -> list[dict[str, str]]: + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" return [ {"short_name": "evspsbl"}, diff --git a/esmvalcore/typing.py b/esmvalcore/typing.py index 361f886535..7880bdac1b 100644 --- a/esmvalcore/typing.py +++ b/esmvalcore/typing.py @@ -9,7 +9,7 @@ import numpy as np from iris.cube import Cube -FacetValue = str | Sequence[str] | Number +FacetValue = str | Sequence[str] | Number | bool """Type describing a single facet.""" Facets = dict[str, FacetValue] From b0c44f65bd8d8ab9fedfb097c58f281b5bbe1015 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:42:15 +0200 Subject: [PATCH 02/85] Further no-op changes --- esmvalcore/_recipe/check.py | 2 -- esmvalcore/dataset.py | 20 ++++++++++++++++++++ esmvalcore/preprocessor/__init__.py | 2 -- esmvalcore/preprocessor/_derive/ohc.py | 16 ++++++---------- esmvalcore/preprocessor/_derive/vegfrac.py | 5 ++++- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 50e41d7d21..738159bc17 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -33,8 +33,6 @@ if TYPE_CHECKING: from collections.abc import Iterable - from esmvalcore.dataset import Dataset - from esmvalcore.typing import FacetValue logger = logging.getLogger(__name__) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 143f59c1a9..e1ef99dd20 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -164,6 +164,26 @@ def from_recipe( return datasets_from_recipe(recipe, session) + def is_derived(self) -> bool: + """Return ``True`` for derived variables, ``False`` otherwise.""" + return 
bool(self.facets.get("derive", False)) + + def derivation_necessary(self) -> bool: + """Return ``True`` if derivation is necessary, ``False`` otherwise.""" + # If variable cannot be derived, derivation is not necessary + if not self.is_derived(): + return False + + # If forced derivation is requested, derivation is necessary + if self.facets.get("force_derivation", False): + return True + + # Otherwise, derivation is necessary if no files for the dataset + # itself are found + ds_copy = self.copy() + ds_copy.supplementaries = [] + return not ds_copy.files + def _file_to_dataset( self, file: esgf.ESGFFile | local.LocalFile, diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 9c80676f7a..da5eea2f78 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -99,8 +99,6 @@ from dask.delayed import Delayed - from esmvalcore.dataset import Dataset - logger = logging.getLogger(__name__) __all__ = [ diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index ff970e2641..47aa7b2fc8 100644 --- a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -15,16 +15,12 @@ class DerivedVariable(DerivedVariableBase): @staticmethod def required(project): """Declare the variables needed for derivation.""" - required = [ - {"short_name": "thetao"}, - {"short_name": "volcello", "mip": "fx"}, - ] - if project == "CMIP6": - required = [ - {"short_name": "thetao"}, - {"short_name": "volcello", "mip": "Ofx"}, - ] - return required + volcello = {"short_name": "volcello", "mip": "fx"} + if project == "CMIP5": + volcello["ensemble"] = "r0i0p0" + elif project == "CMIP6": + volcello["mip"] = "Ofx" + return [{"short_name": "thetao"}, volcello] @staticmethod def calculate(cubes): diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index 3cd00a2cb2..c48e723f42 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -15,10 +15,13 @@ class DerivedVariable(DerivedVariableBase): @staticmethod def required(project): """Declare the variables needed for derivation.""" + sftlf = {"short_name": "sftlf", "mip": "fx"} + if project == "CMIP5": + sftlf["ensemble"] = "r0i0p0" return [ {"short_name": "baresoilFrac"}, {"short_name": "residualFrac"}, - {"short_name": "sftlf", "mip": "fx"}, + sftlf, ] @staticmethod From 1dd5671d043ec687827f9daaad0b0f975bb9c4bd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:44:07 +0200 Subject: [PATCH 03/85] force_derivation=True without derive=True does not make sense --- esmvalcore/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index e1ef99dd20..65169e1644 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -136,6 +136,16 @@ def __init__(self, **facets: FacetValue): for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) + if not self.is_derived() and self.facets.get( + "force_derivation", + False, + ): + msg = ( + "Facet `force_derivation=True` can only be used for derived " + "variables (i.e., with facet `derive=True`)" + ) + raise ValueError(msg) + @staticmethod def from_recipe( recipe: Path | str | dict, From 8989549a9529600e4217b8a9ad862a715563146b Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 15:48:22 +0200 Subject: [PATCH 04/85] Add tests --- tests/unit/test_dataset.py | 146 
+++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 291ff99fe2..9624f697f6 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2135,3 +2135,149 @@ def test_get_extra_facets_native6(): "grib_id": "130", "tres": "1M", } + + +def test_derivation_necessary_no_derivation(): + dataset = Dataset( + project="OBS6", + dataset="SAT", + mip="Amon", + short_name="tas", + tier=2, + type="sat", + timerange="1980/2000", + ) + assert not dataset.derivation_necessary() + + +def test_derivation_necessary_no_force_derivation_no_files(): + dataset = Dataset( + project="OBS6", + dataset="SAT", + mip="Amon", + short_name="asr", + tier=2, + type="sat", + timerange="1980/2000", + derive=True, + ) + assert dataset.derivation_necessary() + + +def test_derivation_necessary_no_force_derivation(tmp_path, session): + dataset = Dataset( + project="OBS6", + dataset="SAT", + mip="Amon", + short_name="asr", + tier=2, + type="sat", + timerange="1980/2000", + derive=True, + ) + dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + asr_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + ) + asr_file.touch() + assert not dataset.derivation_necessary() + + +def test_derivation_necessary_force_derivation(tmp_path, session): + dataset = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="lwcre", + exp="historical", + grid="gn", + ensemble="r1i1p1f1", + derive=True, + force_derivation=True, + ) + dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + asr_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + ) + asr_file.touch() + assert dataset.derivation_necessary() + + +def test_force_derivation_no_derived(): + msg = ( + r"Facet `force_derivation=True` can only be used for derived " + r"variables" + ) + + with pytest.raises(ValueError, match=msg): + Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="tas", + force_derivation=True, + ) + + with pytest.raises(ValueError, match=msg): + Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="tas", + derive=False, + force_derivation=True, + ) + + +def test_add_supplementary_to_derived(): + dataset = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="lwcre", + derive=True, + force_derivation=True, + ) + + dataset.add_supplementary(short_name="areacella", mip="fx") + + expected_supplementary = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="fx", + short_name="areacella", + derive=False, + force_derivation=False, + ) + assert dataset.supplementaries[0] == expected_supplementary + + +def test_add_derived_supplementary_to_derived(): + dataset = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="lwcre", + derive=True, + force_derivation=True, + ) + + dataset.add_supplementary( + short_name="asr", + derive=True, + force_derivation=True, + ) + + expected_supplementary = Dataset( + project="CMIP6", + dataset="CanESM5", + mip="Amon", + short_name="asr", + derive=True, + force_derivation=True, + ) + assert dataset.supplementaries[0] == expected_supplementary From 1f6dfa3c9f101bcbe7873588ef0997e1924dac09 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 16:08:17 +0200 Subject: [PATCH 05/85] Add type hints to check.py --- 
esmvalcore/_recipe/check.py | 95 +++++++++++++++++-------------- esmvalcore/_recipe/to_datasets.py | 2 +- 2 files changed, 52 insertions(+), 45 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 738159bc17..dd37a31893 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -31,13 +31,18 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence + from pathlib import Path + + from esmvalcore._task import TaskSet + from esmvalcore.dataset import Dataset + from esmvalcore.typing import Facets, FacetValue logger = logging.getLogger(__name__) -def ncl_version(): +def ncl_version() -> None: """Check the NCL version.""" ncl = which("ncl") if not ncl: @@ -68,7 +73,7 @@ def ncl_version(): raise RecipeError(msg) -def recipe_with_schema(filename): +def recipe_with_schema(filename: Path) -> None: """Check if the recipe content matches schema.""" schema_file = os.path.join(os.path.dirname(__file__), "recipe_schema.yml") logger.debug("Checking recipe against schema %s", schema_file) @@ -77,7 +82,7 @@ def recipe_with_schema(filename): yamale.validate(schema, recipe, strict=False) -def diagnostics(diags): +def diagnostics(diags: dict[str, dict[str, Any]] | None) -> None: """Check diagnostics in recipe.""" if diags is None: msg = "The given recipe does not have any diagnostic." @@ -149,7 +154,7 @@ def variable( ) -def _log_data_availability_errors(dataset): +def _log_data_availability_errors(dataset: Dataset) -> None: """Check if the required input data is available.""" input_files = dataset.files patterns = dataset._file_globs # noqa: SLF001 @@ -164,7 +169,7 @@ def _log_data_availability_errors(dataset): logger.error("Set 'log_level' to 'debug' to get more information") -def _group_years(years): +def _group_years(years: Iterable[int]) -> str: """Group an iterable of years into easy to read text. Example @@ -190,7 +195,7 @@ def _group_years(years): return ", ".join(ranges) -def data_availability(dataset, log=True): +def data_availability(dataset: Dataset, log: bool = True) -> None: """Check if input_files cover the required years.""" input_files = dataset.files facets = dataset.facets @@ -209,7 +214,7 @@ def data_availability(dataset, log=True): start_year = int(start_date[0:4]) end_year = int(end_date[0:4]) required_years = set(range(start_year, end_year + 1, 1)) - available_years = set() + available_years: set[int] = set() for file in input_files: start, end = _get_start_end_year(file) @@ -228,7 +233,10 @@ def data_availability(dataset, log=True): ) -def preprocessor_supplementaries(dataset, settings): +def preprocessor_supplementaries( + dataset: Dataset, + settings: dict[str, Any], +) -> None: """Check that the required supplementary variables have been added.""" steps = [step for step in settings if step in PREPROCESSOR_SUPPLEMENTARIES] supplementaries = {d.facets["short_name"] for d in dataset.supplementaries} @@ -257,7 +265,7 @@ def preprocessor_supplementaries(dataset, settings): ) -def tasks_valid(tasks): +def tasks_valid(tasks: TaskSet) -> None: """Check that tasks are consistent.""" filenames = set() msg = "Duplicate preprocessor filename {}, please file a bug report." 
@@ -269,7 +277,7 @@ def tasks_valid(tasks): filenames.add(product.filename) -def check_for_temporal_preprocs(profile): +def check_for_temporal_preprocs(profile: dict[str, Any]) -> None: """Check for temporal operations on fx variables.""" temp_preprocs = [ preproc @@ -284,17 +292,17 @@ def check_for_temporal_preprocs(profile): raise RecipeError(msg) -def extract_shape(settings): +def extract_shape(settings: dict[str, Any]) -> None: """Check that `extract_shape` arguments are valid.""" shapefile = settings.get("shapefile", "") if not os.path.exists(shapefile): msg = ( - "In preprocessor function `extract_shape`: " - f"Unable to find 'shapefile: {shapefile}'" + f"In preprocessor function `extract_shape`: Unable to find " + f"'shapefile: {shapefile}'" ) raise RecipeError(msg) - valid = { + valid: dict[str, set[Any]] = { "method": {"contains", "representative"}, "crop": {True, False}, "decomposed": {True, False}, @@ -305,12 +313,12 @@ def extract_shape(settings): msg = ( f"In preprocessor function `extract_shape`: Invalid value " f"'{value}' for argument '{key}', choose from " - "{}".format(", ".join(f"'{k}'".lower() for k in valid[key])), + "{}".format(", ".join(f"'{k}'".lower() for k in valid[key])) ) raise RecipeError(msg) -def _verify_span_value(span): +def _verify_span_value(span: str) -> None: """Raise error if span argument cannot be verified.""" valid_names = ("overlap", "full") if span not in valid_names: @@ -319,12 +327,10 @@ def _verify_span_value(span): f"`multi_model_statistics`. Valid values are {valid_names}." f"Got {span}." ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) -def _verify_groupby(groupby): +def _verify_groupby(groupby: Any) -> None: """Raise error if groupby arguments cannot be verified.""" if not isinstance(groupby, list): msg = ( @@ -335,7 +341,7 @@ def _verify_groupby(groupby): raise RecipeError(msg) -def _verify_keep_input_datasets(keep_input_datasets): +def _verify_keep_input_datasets(keep_input_datasets: Any) -> None: if not isinstance(keep_input_datasets, bool): msg = ( f"Invalid value encountered for `keep_input_datasets`." @@ -345,7 +351,7 @@ def _verify_keep_input_datasets(keep_input_datasets): raise RecipeError(msg) -def _verify_ignore_scalar_coords(ignore_scalar_coords): +def _verify_ignore_scalar_coords(ignore_scalar_coords: Any) -> None: if not isinstance(ignore_scalar_coords, bool): msg = ( f"Invalid value encountered for `ignore_scalar_coords`." 
@@ -355,13 +361,13 @@ def _verify_ignore_scalar_coords(ignore_scalar_coords): raise RecipeError(msg) -def multimodel_statistics_preproc(settings): +def multimodel_statistics_preproc(settings: dict[str, Any]) -> None: """Check that the multi-model settings are valid.""" - span = settings.get("span", None) # optional, default: overlap + span = settings.get("span") # optional, default: overlap if span: _verify_span_value(span) - groupby = settings.get("groupby", None) # optional, default: None + groupby = settings.get("groupby") # optional, default: None if groupby: _verify_groupby(groupby) @@ -372,7 +378,7 @@ def multimodel_statistics_preproc(settings): _verify_ignore_scalar_coords(ignore_scalar_coords) -def ensemble_statistics_preproc(settings): +def ensemble_statistics_preproc(settings: dict[str, Any]) -> None: """Check that the ensemble settings are valid.""" span = settings.get("span", "overlap") # optional, default: overlap if span: @@ -382,7 +388,7 @@ def ensemble_statistics_preproc(settings): _verify_ignore_scalar_coords(ignore_scalar_coords) -def _check_delimiter(timerange): +def _check_delimiter(timerange: Sequence[str]) -> None: if len(timerange) != 2: msg = ( "Invalid value encountered for `timerange`. " @@ -392,7 +398,7 @@ def _check_delimiter(timerange): raise RecipeError(msg) -def _check_duration_periods(timerange): +def _check_duration_periods(timerange: list[str]) -> None: # isodate duration must always start with P if timerange[0].startswith("P") and timerange[1].startswith("P"): msg = ( @@ -422,14 +428,14 @@ def _check_duration_periods(timerange): raise RecipeError(msg) from exc -def _check_format_years(date): +def _check_format_years(date: str) -> str: if date != "*" and not date.startswith("P"): if len(date) < 4: date = date.zfill(4) return date -def _check_timerange_values(date, timerange): +def _check_timerange_values(date: str, timerange: Iterable[str]) -> None: # Wildcards are fine if date == "*": return @@ -453,18 +459,21 @@ def _check_timerange_values(date, timerange): raise RecipeError(msg) from exc -def valid_time_selection(timerange): +def valid_time_selection(timerange: str) -> None: """Check that `timerange` tag is well defined.""" if timerange != "*": - timerange = timerange.split("/") - _check_delimiter(timerange) - _check_duration_periods(timerange) - for date in timerange: + timerange_list: list[str] = timerange.split("/") + _check_delimiter(timerange_list) + _check_duration_periods(timerange_list) + for date in timerange_list: date = _check_format_years(date) - _check_timerange_values(date, timerange) + _check_timerange_values(date, timerange_list) -def differing_timeranges(timeranges, required_vars): +def differing_timeranges( + timeranges: set[FacetValue], + required_vars: list[Facets], +) -> None: """Log error if required variables have differing timeranges.""" if len(timeranges) > 1: msg = ( @@ -472,9 +481,7 @@ def differing_timeranges(timeranges, required_vars): f"found for required variables {required_vars}. " "Set `timerange` to a common value." 
) - raise ValueError( - msg, - ) + raise ValueError(msg) def _check_literal( @@ -596,7 +603,7 @@ def statistics_preprocessors(settings: dict) -> None: _check_regular_stat(step, step_settings) -def _check_regular_stat(step, step_settings): +def _check_regular_stat(step: str, step_settings: dict[str, Any]) -> None: """Check regular statistics (non-multi-model statistics) step.""" step_settings = dict(step_settings) @@ -632,7 +639,7 @@ def _check_regular_stat(step, step_settings): raise RecipeError(msg) from exc -def _check_mm_stat(step, step_settings): +def _check_mm_stat(step: str, step_settings: dict[str, Any]) -> None: """Check multi-model statistic step.""" statistics = step_settings.get("statistics", []) for stat in statistics: @@ -647,7 +654,7 @@ def _check_mm_stat(step, step_settings): raise RecipeError(msg) from exc -def regridding_schemes(settings: dict): +def regridding_schemes(settings: dict[str, Any]) -> None: """Check :obj:`str` regridding schemes.""" if "regrid" not in settings: return diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 08ef4680db..ee332e8927 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -581,7 +581,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: datasets.append(input_dataset) # Check timeranges of available input data. - timeranges = set() + timeranges: set[FacetValue] = set() for input_dataset in datasets: if "timerange" in input_dataset.facets: timeranges.add(input_dataset.facets["timerange"]) From b6a6651ff4c15b346ba064736de1e52bd51d9567 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 16:40:08 +0200 Subject: [PATCH 06/85] Added type hints for recipe.py --- esmvalcore/_recipe/check.py | 8 +- esmvalcore/_recipe/recipe.py | 233 ++++++++++++++++++---------- esmvalcore/preprocessor/__init__.py | 6 +- 3 files changed, 152 insertions(+), 95 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index dd37a31893..e79db4f2ee 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -149,9 +149,7 @@ def variable( f"Missing keys {missing} in\n{pformat(var)}\nfor variable " f"'{variable_group}' in diagnostic '{diagnostic}'." 
) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _log_data_availability_errors(dataset: Dataset) -> None: @@ -228,9 +226,7 @@ def data_availability(dataset: Dataset, log: bool = True) -> None: missing_txt, "\n".join(str(f) for f in input_files), ) - raise InputFilesNotFound( - msg, - ) + raise InputFilesNotFound(msg) def preprocessor_supplementaries( diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 42b63962ed..0c61307c0b 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -16,7 +16,7 @@ from esmvalcore import __version__, esgf from esmvalcore._provenance import get_recipe_provenance -from esmvalcore._task import DiagnosticTask, ResumeTask, TaskSet +from esmvalcore._task import BaseTask, DiagnosticTask, ResumeTask, TaskSet from esmvalcore.config._config import TASKSEP from esmvalcore.config._dask import validate_dask_config from esmvalcore.config._diagnostics import TAGS @@ -57,7 +57,10 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable, Sequence + from collections.abc import Iterable + + from esmvalcore.config import Session + from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -70,7 +73,7 @@ """Use a global variable to keep track of datasets that are actually used.""" -def read_recipe_file(filename: Path, session): +def read_recipe_file(filename: Path, session: Session) -> Recipe: """Read a recipe from file.""" check.recipe_with_schema(filename) with open(filename, encoding="utf-8") as file: @@ -79,7 +82,7 @@ def read_recipe_file(filename: Path, session): return Recipe(raw_recipe, session, recipe_file=filename) -def _special_name_to_dataset(facets, special_name): +def _special_name_to_dataset(facets: Facets, special_name: str) -> str: """Convert special names to dataset names.""" if special_name in ("reference_dataset", "alternative_dataset"): if special_name not in facets: @@ -93,15 +96,17 @@ def _special_name_to_dataset(facets, special_name): diagnostic=facets["diagnostic"], ) ) - raise RecipeError( - msg, - ) - special_name = facets[special_name] + raise RecipeError(msg) + dataset_name = str(facets[special_name]) - return special_name + return dataset_name -def _update_target_levels(dataset, datasets, settings): +def _update_target_levels( + dataset: Dataset, + datasets: list[Dataset], + settings: dict[str, Any], +) -> None: """Replace the target levels dataset name with a filename if needed.""" levels = settings.get("extract_levels", {}).get("levels") if not levels: @@ -135,7 +140,11 @@ def _update_target_levels(dataset, datasets, settings): ) -def _update_target_grid(dataset, datasets, settings): +def _update_target_grid( + dataset: Dataset, + datasets: list[Dataset], + settings: dict[str, Any], +) -> None: """Replace the target grid dataset name with a filename if needed.""" grid = settings.get("regrid", {}).get("target_grid") if not grid: @@ -169,7 +178,7 @@ def _update_regrid_time(dataset: Dataset, settings: dict) -> None: settings["regrid_time"]["frequency"] = dataset.facets["frequency"] -def _select_dataset(dataset_name, datasets): +def _select_dataset(dataset_name: str, datasets: list[Dataset]) -> Dataset: for dataset in datasets: if dataset.facets["dataset"] == dataset_name: return dataset @@ -179,12 +188,13 @@ def _select_dataset(dataset_name, datasets): f"Unable to find dataset '{dataset_name}' in the list of datasets" f"for variable '{variable_group}' of diagnostic '{diagnostic}'." 
) - raise RecipeError( - msg, - ) + raise RecipeError(msg) -def _limit_datasets(datasets, profile): +def _limit_datasets( + datasets: list[Dataset], + profile: dict[str, Any], +) -> list[Dataset]: """Try to limit the number of datasets to max_datasets.""" max_datasets = datasets[0].session["max_datasets"] if not max_datasets: @@ -208,13 +218,13 @@ def _limit_datasets(datasets, profile): logger.info( "Only considering %s", - ", ".join(d.facets["alias"] for d in limited), + ", ".join(str(d.facets["alias"]) for d in limited), ) return limited -def _get_default_settings(dataset): +def _get_default_settings(dataset: Dataset) -> dict[str, Any]: """Get default preprocessor settings.""" session = dataset.session facets = dataset.facets @@ -243,7 +253,10 @@ def _get_default_settings(dataset): return settings -def _add_dataset_specific_settings(dataset: Dataset, settings: dict) -> None: +def _add_dataset_specific_settings( + dataset: Dataset, + settings: dict[str, Any], +) -> None: """Add dataset-specific settings.""" project = dataset.facets["project"] dataset_name = dataset.facets["dataset"] @@ -271,7 +284,11 @@ def _add_dataset_specific_settings(dataset: Dataset, settings: dict) -> None: ) -def _exclude_dataset(settings, facets, step): +def _exclude_dataset( + settings: dict[str, Any], + facets: Facets, + step: str, +) -> None: """Exclude dataset from specific preprocessor step if requested.""" exclude = { _special_name_to_dataset(facets, dataset) @@ -286,14 +303,17 @@ def _exclude_dataset(settings, facets, step): ) -def _update_weighting_settings(settings, facets): +def _update_weighting_settings( + settings: dict[str, Any], + facets: Facets, +) -> None: """Update settings for the weighting preprocessors.""" if "weighting_landsea_fraction" not in settings: return _exclude_dataset(settings, facets, "weighting_landsea_fraction") -def _add_to_download_list(dataset): +def _add_to_download_list(dataset: Dataset) -> None: """Add the files of `dataset` to `DOWNLOAD_FILES`.""" for i, file in enumerate(dataset.files): if isinstance(file, esgf.ESGFFile): @@ -301,7 +321,7 @@ def _add_to_download_list(dataset): dataset.files[i] = file.local_file(dataset.session["download_dir"]) -def _schedule_for_download(datasets): +def _schedule_for_download(datasets: list[Dataset]) -> None: """Schedule files for download.""" for dataset in datasets: _add_to_download_list(dataset) @@ -354,14 +374,16 @@ def _check_input_files(input_datasets: Iterable[Dataset]) -> set[str]: return missing -def _apply_preprocessor_profile(settings, profile_settings): +def _apply_preprocessor_profile( + settings: dict[str, Any], + profile_settings: dict[str, Any], +) -> None: """Apply settings from preprocessor profile.""" profile_settings = deepcopy(profile_settings) for step, args in profile_settings.items(): # Remove disabled preprocessor functions if args is False: - if step in settings: - del settings[step] + settings.pop(step, None) continue # Enable/update functions without keywords if step not in settings: @@ -370,9 +392,12 @@ def _apply_preprocessor_profile(settings, profile_settings): settings[step].update(args) -def _get_common_attributes(products, settings): +def _get_common_attributes( + products: set[PreprocessorFile], + settings: dict[str, Any], +) -> dict[str, Any]: """Get common attributes for the output products.""" - attributes = {} + attributes: dict[str, Any] = {} some_product = next(iter(products)) for key, value in some_product.attributes.items(): if all(p.attributes.get(key, object()) == value for p in products): @@ 
-420,7 +445,11 @@ def _get_common_attributes(products, settings): return attributes -def _get_downstream_settings(step, order, products): +def _get_downstream_settings( + step: str, + order: tuple[str, ...], + products: set[PreprocessorFile], +) -> dict[str, Any]: """Get downstream preprocessor settings shared between products.""" settings = {} remaining_steps = order[order.index(step) + 1 :] @@ -434,7 +463,10 @@ def _get_downstream_settings(step, order, products): return settings -def _update_multi_dataset_settings(facets, settings): +def _update_multi_dataset_settings( + facets: Facets, + settings: dict[str, Any], +) -> None: """Configure multi dataset statistics.""" for step in MULTI_MODEL_FUNCTIONS: if not settings.get(step): @@ -443,7 +475,7 @@ def _update_multi_dataset_settings(facets, settings): _exclude_dataset(settings, facets, step) -def _get_tag(step, identifier, statistic): +def _get_tag(step: str, identifier: str, statistic: str) -> str: # Avoid . in filename for percentiles statistic = statistic.replace(".", "-") @@ -457,7 +489,12 @@ def _get_tag(step, identifier, statistic): return tag -def _update_multiproduct(input_products, order, preproc_dir, step): +def _update_multiproduct( + input_products: set[PreprocessorFile], + order: tuple[str, ...], + preproc_dir: Path, + step: str, +) -> tuple[set[PreprocessorFile], dict[str, Any]]: """Return new products that are aggregated over multiple datasets. These new products will replace the original products at runtime. @@ -483,7 +520,7 @@ def _update_multiproduct(input_products, order, preproc_dir, step): downstream_settings = _get_downstream_settings(step, order, multiproducts) - relevant_settings = { + relevant_settings: dict[str, Any] = { "output_products": defaultdict(dict), } # pass to ancestors @@ -524,7 +561,11 @@ def _update_multiproduct(input_products, order, preproc_dir, step): return output_products, relevant_settings -def update_ancestors(ancestors, step, downstream_settings): +def update_ancestors( + ancestors: set[PreprocessorFile], + step: str, + downstream_settings: dict[str, Any], +) -> None: """Retroactively add settings to ancestor products.""" for product in ancestors: if step in product.settings: @@ -533,7 +574,7 @@ def update_ancestors(ancestors, step, downstream_settings): settings[key] = value -def _update_extract_shape(settings, session): +def _update_extract_shape(settings: dict[str, Any], session: Session) -> None: if "extract_shape" in settings: shapefile = settings["extract_shape"].get("shapefile") if shapefile: @@ -542,7 +583,7 @@ def _update_extract_shape(settings, session): check.extract_shape(settings["extract_shape"]) -def _allow_skipping(dataset: Dataset): +def _allow_skipping(dataset: Dataset) -> bool: """Allow skipping of datasets.""" return all( [ @@ -553,7 +594,7 @@ def _allow_skipping(dataset: Dataset): ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]): +def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: """Set the 'version' facet based on derivation input datasets.""" versions = set() for in_dataset in input_datasets: @@ -573,7 +614,7 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]): def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], - order: list[str], + order: tuple[str, ...], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. 
@@ -629,9 +670,7 @@ def _get_preprocessor_products( f"Missing data for preprocessor {name}:{separator}" f"{separator.join(sorted(missing_vars))}" ) - raise InputFilesNotFound( - msg, - ) + raise InputFilesNotFound(msg) check.reference_for_bias_preproc(products) check.reference_for_distance_metric_preproc(products) @@ -651,11 +690,11 @@ def _get_preprocessor_products( def _configure_multi_product_preprocessor( - products: Iterable[PreprocessorFile], + products: set[PreprocessorFile], preproc_dir: Path, profile: PreprocessorSettings, - order: Sequence[str], -): + order: tuple[str, ...], +) -> None: """Configure preprocessing of ensemble and multimodel statistics.""" ensemble_step = "ensemble_statistics" multi_model_step = "multi_model_statistics" @@ -718,7 +757,12 @@ def _set_start_end_year(product: PreprocessorFile) -> None: product.attributes["end_year"] = int(str(end_year[0:4])) -def _update_preproc_functions(settings, dataset, datasets, missing_vars): +def _update_preproc_functions( + settings: dict[str, Any], + dataset: Dataset, + datasets: list[Dataset], + missing_vars: set[str], +) -> None: session = dataset.session _update_extract_shape(settings, session) _update_weighting_settings(settings, dataset.facets) @@ -748,20 +792,22 @@ def _update_preproc_functions(settings, dataset, datasets, missing_vars): check.resample_hours(settings) -def _get_preprocessor_task(datasets, profiles, task_name): +def _get_preprocessor_task( + datasets: list[Dataset], + profiles: dict[str, Any], + task_name: str, +) -> PreprocessingTask: """Create preprocessor task(s) for a set of datasets.""" # First set up the preprocessor profile facets = datasets[0].facets session = datasets[0].session - preprocessor = facets.get("preprocessor", "default") + preprocessor = str(facets.get("preprocessor", "default")) if preprocessor not in profiles: msg = ( f"Unknown preprocessor '{preprocessor}' in variable " f"{facets['variable_group']} of diagnostic {facets['diagnostic']}" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) logger.info( "Creating preprocessor '%s' task for variable '%s'", preprocessor, @@ -800,7 +846,7 @@ def _get_preprocessor_task(datasets, profiles, task_name): return task -def _extract_preprocessor_order(profile): +def _extract_preprocessor_order(profile: dict[str, Any]) -> tuple[str, ...]: """Extract the order of the preprocessing steps from the profile.""" custom_order = profile.pop("custom_order", False) if not custom_order: @@ -816,7 +862,12 @@ def _extract_preprocessor_order(profile): class Recipe: """Recipe object.""" - def __init__(self, raw_recipe, session, recipe_file: Path): + def __init__( + self, + raw_recipe: dict[str, Any], + session: Session, + recipe_file: Path, + ) -> None: """Parse a recipe file into an object.""" validate_dask_config(session["dask"]) @@ -846,7 +897,7 @@ def __init__(self, raw_recipe, session, recipe_file: Path): self._log_recipe_errors(exc) raise - def _log_recipe_errors(self, exc): + def _log_recipe_errors(self, exc: RecipeError) -> None: """Log a message with recipe errors.""" logger.error(exc.message) for task in exc.failed_tasks: @@ -880,7 +931,7 @@ def _log_recipe_errors(self, exc): ) @staticmethod - def _need_ncl(raw_diagnostics): + def _need_ncl(raw_diagnostics: dict[str, Any]) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -893,7 +944,7 @@ def _need_ncl(raw_diagnostics): return True return False - def _initialize_provenance(self, raw_documentation): + def _initialize_provenance(self, raw_documentation: 
dict[str, Any]): """Initialize the recipe provenance.""" doc = deepcopy(raw_documentation) @@ -901,7 +952,10 @@ def _initialize_provenance(self, raw_documentation): return get_recipe_provenance(doc, self._filename) - def _initialize_diagnostics(self, raw_diagnostics): + def _initialize_diagnostics( + self, + raw_diagnostics: dict[str, Any], + ) -> dict[str, Any]: """Define diagnostics in recipe.""" logger.debug("Retrieving diagnostics from recipe") check.diagnostics(raw_diagnostics) @@ -909,7 +963,7 @@ def _initialize_diagnostics(self, raw_diagnostics): diagnostics = {} for name, raw_diagnostic in raw_diagnostics.items(): - diagnostic = {} + diagnostic: dict[str, Any] = {} diagnostic["name"] = name diagnostic["datasets"] = [ ds for ds in self.datasets if ds.facets["diagnostic"] == name @@ -930,10 +984,10 @@ def _initialize_diagnostics(self, raw_diagnostics): def _initialize_scripts( self, - diagnostic_name, - raw_scripts, - variable_names, - ): + diagnostic_name: str, + raw_scripts: dict[str, Any], + variable_names: tuple[str, ...], + ) -> dict[str, Any]: """Define script in diagnostic.""" if not raw_scripts: return {} @@ -982,7 +1036,10 @@ def _initialize_scripts( return scripts - def _resolve_diagnostic_ancestors(self, tasks): + def _resolve_diagnostic_ancestors( + self, + tasks: Iterable[PreprocessingTask], + ) -> None: """Resolve diagnostic ancestors.""" tasks = {t.name: t for t in tasks} for diagnostic_name, diagnostic in self.diagnostics.items(): @@ -997,7 +1054,7 @@ def _resolve_diagnostic_ancestors(self, tasks): diagnostic_name, script_name, ) - ancestors = [] + ancestors: list[BaseTask] = [] for id_glob in script_cfg["ancestors"]: ancestor_ids = fnmatch.filter(tasks, id_glob) if not ancestor_ids: @@ -1014,7 +1071,7 @@ def _resolve_diagnostic_ancestors(self, tasks): ancestors.extend(tasks[a] for a in ancestor_ids) tasks[task_id].ancestors = ancestors - def _get_tasks_to_run(self): + def _get_tasks_to_run(self) -> set[str]: """Get tasks filtered and add ancestors if needed.""" tasknames_to_run = self.session["diagnostics"] if tasknames_to_run: @@ -1023,7 +1080,7 @@ def _get_tasks_to_run(self): pass return tasknames_to_run - def _update_with_ancestors(self, tasknames_to_run): + def _update_with_ancestors(self, tasknames_to_run: set[str]) -> bool: """Add ancestors for all selected tasks.""" num_filters = len(tasknames_to_run) @@ -1055,12 +1112,12 @@ def _update_with_ancestors(self, tasknames_to_run): def _create_diagnostic_tasks( self, - diagnostic_name, - diagnostic, - tasknames_to_run, - ): + diagnostic_name: str, + diagnostic: dict[str, Any], + tasknames_to_run: set[str], + ) -> list[BaseTask]: """Create diagnostic tasks.""" - tasks = [] + tasks: list[BaseTask] = [] if self.session["run_diagnostic"]: for script_name, script_cfg in diagnostic["scripts"].items(): @@ -1091,14 +1148,14 @@ def _create_diagnostic_tasks( def _create_preprocessor_tasks( self, - diagnostic_name, - diagnostic, - tasknames_to_run, - any_diag_script_is_run, - ): + diagnostic_name: str, + diagnostic: dict[str, Any], + tasknames_to_run: set[str], + any_diag_script_is_run: bool, + ) -> tuple[list[BaseTask], list[RecipeError]]: """Create preprocessor tasks.""" - tasks = [] - failed_tasks = [] + tasks: list[BaseTask] = [] + failed_tasks: list[RecipeError] = [] for variable_group, datasets in groupby( diagnostic["datasets"], key=lambda ds: ds.facets["variable_group"], @@ -1138,7 +1195,11 @@ def _create_preprocessor_tasks( diagnostic_name, variable_group, ) - task = ResumeTask(prev_preproc_dir, preproc_dir, 
task_name) + task: BaseTask = ResumeTask( + prev_preproc_dir, + preproc_dir, + task_name, + ) tasks.append(task) break else: @@ -1156,7 +1217,7 @@ def _create_preprocessor_tasks( return tasks, failed_tasks - def _create_tasks(self): + def _create_tasks(self) -> TaskSet: """Create tasks from the recipe.""" logger.info("Creating tasks from recipe") tasks = TaskSet() @@ -1208,7 +1269,7 @@ def _create_tasks(self): return tasks - def initialize_tasks(self): + def initialize_tasks(self) -> TaskSet: """Define tasks in recipe.""" tasks = self._create_tasks() tasks = tasks.flatten() @@ -1227,11 +1288,11 @@ def initialize_tasks(self): # Return smallest possible set of tasks return tasks.get_independent() - def __str__(self): + def __str__(self) -> str: """Get human readable summary.""" return "\n\n".join(str(task) for task in self.tasks) - def run(self): + def run(self) -> None: """Run all tasks in the recipe.""" if not self.tasks: msg = "No tasks to run!" @@ -1249,7 +1310,7 @@ def run(self): ) self.write_html_summary() - def get_output(self) -> dict: + def get_output(self) -> dict[str, Any]: """Return the paths to the output plots and data. Returns @@ -1257,7 +1318,7 @@ def get_output(self) -> dict: product_filenames : dict Lists of products/attributes grouped by task. """ - output = {} + output: dict[str, Any] = {} output["session"] = self.session output["recipe_filename"] = self._filename @@ -1275,7 +1336,7 @@ def get_output(self) -> dict: return output - def write_filled_recipe(self): + def write_filled_recipe(self) -> Path: """Write copy of recipe with filled wildcards.""" recipe = datasets_to_recipe(USED_DATASETS, self._raw_recipe) filename = self.session.run_dir / f"{self._filename.stem}_filled.yml" @@ -1287,7 +1348,7 @@ def write_filled_recipe(self): ) return filename - def write_html_summary(self): + def write_html_summary(self) -> None: """Write summary html file to the output dir.""" with warnings.catch_warnings(): # ignore import warnings diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index da5eea2f78..8732e11f83 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -223,14 +223,14 @@ """ # The order of initial and final steps cannot be configured -INITIAL_STEPS = DEFAULT_ORDER[ +INITIAL_STEPS: tuple[str, ...] = DEFAULT_ORDER[ : DEFAULT_ORDER.index("add_supplementary_variables") + 1 ] -FINAL_STEPS = DEFAULT_ORDER[ +FINAL_STEPS: tuple[str, ...] = DEFAULT_ORDER[ DEFAULT_ORDER.index("remove_supplementary_variables") : ] -MULTI_MODEL_FUNCTIONS = { +MULTI_MODEL_FUNCTIONS: set[str] = { "bias", "distance_metric", "ensemble_statistics", From 6793e0c6e71bfa95e281383eb2d9a8d42f1bb5ad Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 16:56:34 +0200 Subject: [PATCH 07/85] Added type hints for to_datasets.py --- esmvalcore/_recipe/to_datasets.py | 39 +++++++++++++++++-------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index ee332e8927..7a78ac439f 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -30,7 +30,7 @@ logger = logging.getLogger(__name__) -_ALIAS_INFO_KEYS = ( +_ALIAS_INFO_KEYS: tuple[str, ...] 
= ( "project", "activity", "driver", @@ -43,7 +43,7 @@ """List of keys to be used to compose the alias, ordered by priority.""" -def _facet_to_str(facet_value: FacetValue) -> str: +def _facet_to_str(facet_value: FacetValue | None) -> str: """Get a string representation of a facet value.""" if isinstance(facet_value, str): return facet_value @@ -52,7 +52,7 @@ def _facet_to_str(facet_value: FacetValue) -> str: return str(facet_value) -def _set_alias(variables): +def _set_alias(variables: list[list[Dataset]]) -> None: """Add unique alias for datasets. Generates a unique alias for each dataset that will be shared by all @@ -99,41 +99,46 @@ def _set_alias(variables): variables : list for each recipe variable, a list of datasets """ - datasets_info = set() + datasets_info: set[tuple[str, ...]] = set() for variable in variables: for dataset in variable: - alias = tuple( + alias_tuple = tuple( _facet_to_str(dataset.facets.get(key, None)) for key in _ALIAS_INFO_KEYS ) - datasets_info.add(alias) + datasets_info.add(alias_tuple) if "alias" not in dataset.facets: - dataset.facets["alias"] = alias + dataset.facets["alias"] = alias_tuple - alias = {} + alias: dict[tuple[str, ...], list[Any]] = {} for info in datasets_info: alias[info] = [] - datasets_info = list(datasets_info) - _get_next_alias(alias, datasets_info, 0) + datasets_info_list: list[tuple[str, ...]] = list(datasets_info) + _get_next_alias(alias, datasets_info_list, 0) - for info in datasets_info: - alias[info] = "_".join( + final_alias: dict[tuple[str, ...], str] = {} + for info in datasets_info_list: + final_alias[info] = "_".join( [str(value) for value in alias[info] if value is not None], ) - if not alias[info]: - alias[info] = info[_ALIAS_INFO_KEYS.index("dataset")] + if not final_alias[info]: + final_alias[info] = info[_ALIAS_INFO_KEYS.index("dataset")] for variable in variables: for dataset in variable: - dataset.facets["alias"] = alias.get( + dataset.facets["alias"] = final_alias.get( # type: ignore dataset.facets["alias"], dataset.facets["alias"], ) -def _get_next_alias(alias, datasets_info, i): +def _get_next_alias( + alias: dict[tuple[str, ...], list[Any]], + datasets_info: list[tuple[str, ...]], + i: int, +) -> None: if i >= len(_ALIAS_INFO_KEYS): return key_values = {info[i] for info in datasets_info} @@ -185,7 +190,7 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset): +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: """Automatically correct the wrong ensemble for CMIP5 fx variables.""" if ( dataset.facets.get("project") == "CMIP5" From 878e3104cf87c7dc4adde57ad2ade4ce92d2f291 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:08:40 +0200 Subject: [PATCH 08/85] Added type hints for dataset.py --- esmvalcore/_recipe/to_datasets.py | 4 +-- esmvalcore/dataset.py | 46 +++++++++++++++++++------------ 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7a78ac439f..7cd17bdbb0 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -164,9 +164,7 @@ def _check_supplementaries_valid(supplementaries: Iterable[Facets]) -> None: "'short_name' is required for supplementary_variables " f"entries, but missing in {facets}" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) def _merge_supplementary_dicts( diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 65169e1644..d471dde5d8 100644 --- a/esmvalcore/dataset.py +++ 
b/esmvalcore/dataset.py @@ -67,7 +67,7 @@ """ -def _augment(base: dict, update: dict): +def _augment(base: dict, update: dict) -> None: """Update dict `base` with values from dict `update`.""" for key in update: if key not in base: @@ -108,7 +108,7 @@ class Dataset: Facets describing the dataset. """ - _SUMMARY_FACETS = ( + _SUMMARY_FACETS: tuple[str, ...] = ( "short_name", "mip", "project", @@ -124,7 +124,7 @@ class Dataset: ) """Facets used to create a summary of a Dataset instance.""" - def __init__(self, **facets: FacetValue): + def __init__(self, **facets: FacetValue) -> None: self.facets: Facets = {} self.supplementaries: list[Dataset] = [] @@ -478,7 +478,7 @@ def copy(self, **facets: FacetValue) -> Dataset: return new - def __eq__(self, other) -> bool: + def __eq__(self, other: object) -> bool: """Compare with another dataset.""" return ( isinstance(other, self.__class__) @@ -575,15 +575,20 @@ def supplementary_summary(dataset): return txt - def __getitem__(self, key): + def __getitem__(self, key: Any) -> FacetValue: """Get a facet value.""" return self.facets[key] - def __setitem__(self, key, value): + def __setitem__(self, key: str, value: FacetValue) -> None: """Set a facet value.""" self.set_facet(key, value, persist=False) - def set_facet(self, key: str, value: FacetValue, persist: bool = True): + def set_facet( + self, + key: str, + value: FacetValue, + persist: bool = True, + ) -> None: """Set facet. Parameters @@ -665,9 +670,16 @@ def augment_facets(self) -> None: supplementary._augment_facets() # noqa: SLF001 @staticmethod - def _pattern_filter(patterns: Iterable[str], name: str) -> list[str]: + def _pattern_filter( + patterns: Iterable[FacetValue], + name: FacetValue, + ) -> list[str]: """Get the subset of the list `patterns` that `name` matches.""" - return [pat for pat in patterns if fnmatch.fnmatchcase(name, pat)] + return [ + str(pat) + for pat in patterns + if fnmatch.fnmatchcase(str(name), str(pat)) + ] def _get_extra_facets(self) -> dict[str, Any]: """Get extra facets of dataset.""" @@ -718,7 +730,7 @@ def _get_extra_facets(self) -> dict[str, Any]: return extra_facets - def _augment_facets(self): + def _augment_facets(self) -> None: extra_facets = self._get_extra_facets() _augment(self.facets, extra_facets) if "institute" not in self.facets: @@ -797,7 +809,7 @@ def files(self) -> list[File]: return self._files # type: ignore @files.setter - def files(self, value): + def files(self, value: Sequence[File]) -> None: self._files = value def load(self) -> Cube: @@ -939,15 +951,15 @@ def from_ranges(self) -> list[Dataset]: ] return datasets - def _expand_range(self, input_tag): + def _expand_range(self, input_tag: str) -> list[FacetValue]: """Expand ranges such as ensemble members or start dates. Expansion only supports ensembles defined as strings, not lists. """ - expanded = [] + expanded: list[FacetValue] = [] regex = re.compile(r"\(\d+:\d+\)") - def expand_range(input_range): + def expand_range(input_range) -> None: match = regex.search(input_range) if match: start, end = match.group(0)[1:-1].split(":") @@ -965,16 +977,14 @@ def expand_range(input_range): f"In {self}: {input_tag} expansion " f"cannot be combined with {input_tag} lists" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) expanded.append(tag) else: expand_range(tag) return expanded - def _update_timerange(self): + def _update_timerange(self) -> None: """Update wildcards in timerange with found datetime values. 
If the timerange is given as a year, it ensures it's formatted From be6e55d2e7b9577b7bd6b3ccd109357a836dd794 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:23:47 +0200 Subject: [PATCH 09/85] Add type hints to local.py --- esmvalcore/_recipe/recipe.py | 4 +++ esmvalcore/local.py | 62 ++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 0c61307c0b..e14d9201e2 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -412,10 +412,14 @@ def _get_common_attributes( if "timerange" not in product.attributes: continue timerange = product.attributes["timerange"] + start: int | str + end: int | str start, end = _parse_period(timerange) if "timerange" not in attributes: attributes["timerange"] = _dates_to_timerange(start, end) else: + start_date: int | str + end_date: int | str start_date, end_date = _parse_period(attributes["timerange"]) start_date, start = _truncate_dates(start_date, start) end_date, end = _truncate_dates(end_date, end) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index baeeac2757..df0a0df225 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -20,17 +20,25 @@ from .exceptions import RecipeError if TYPE_CHECKING: + from collections.abc import Iterable + from .esgf import ESGFFile from .typing import Facets, FacetValue logger = logging.getLogger(__name__) -def _get_from_pattern(pattern, date_range_pattern, stem, group): +def _get_from_pattern( + pattern: str, + date_range_pattern: str, + stem: str, + group: str, +) -> tuple[str | None, str | None]: """Get time, date or datetime from date range patterns in file names.""" # Next string allows to test that there is an allowed delimiter (or # string start or end) close to date range (or to single date) - start_point = end_point = None + start_point: str | None = None + end_point: str | None = None context = r"(?:^|[-_]|$)" # First check for a block of two potential dates @@ -170,9 +178,7 @@ def _get_start_end_date( f"File {file} datetimes do not match a recognized pattern and " f"time coordinate can not be read from the file" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # Remove potential '-' characters from datetimes start_date = start_date.replace("-", "") @@ -192,7 +198,7 @@ def _get_start_end_year( return (int(start_date[:4]), int(end_date[:4])) -def _dates_to_timerange(start_date, end_date): +def _dates_to_timerange(start_date: int | str, end_date: int | str) -> str: """Convert ``start_date`` and ``end_date`` to ``timerange``. Note @@ -203,9 +209,9 @@ def _dates_to_timerange(start_date, end_date): Parameters ---------- - start_date: int or str + start_date: Start date. - end_date: int or str + end_date: End date. Returns @@ -225,7 +231,7 @@ def _dates_to_timerange(start_date, end_date): return f"{start_date}/{end_date}" -def _replace_years_with_timerange(variable): +def _replace_years_with_timerange(variable: dict[str, Any]) -> None: """Set `timerange` tag from tags `start_year` and `end_year`.""" start_year = variable.get("start_year") end_year = variable.get("end_year") @@ -293,7 +299,7 @@ def _parse_period(timerange: FacetValue) -> tuple[str, str]: return start_date, end_date -def _truncate_dates(date, file_date): +def _truncate_dates(date: str, file_date: str) -> tuple[int, int]: """Truncate dates of different lengths and convert to integers. This allows to compare the dates chronologically. 
For example, this allows @@ -317,7 +323,10 @@ def _truncate_dates(date, file_date): return int(date), int(file_date) -def _select_files(filenames, timerange): +def _select_files( + filenames: list[LocalFile], + timerange: list, +) -> list[LocalFile]: """Select files containing data between a given timerange. If the timerange is given as a period, the file selection occurs @@ -333,6 +342,10 @@ def _select_files(filenames, timerange): selection = [] for filename in filenames: + start: int | str + end: int | str + start_date: int | str + end_date: int | str start_date, end_date = _parse_period(timerange) start, end = _get_start_end_date(filename) @@ -349,6 +362,7 @@ def _replace_tags( variable: Facets, ) -> list[Path]: """Replace tags in the config-developer's file with actual values.""" + pathset: Iterable[str] if isinstance(paths, str): pathset = {paths.strip("/")} else: @@ -386,10 +400,14 @@ def _replace_tags( return [Path(p) for p in pathset] -def _replace_tag(paths, tag, replacewith): +def _replace_tag( + paths: Iterable[str], + tag: str, + replacewith: FacetValue, +) -> list[str]: """Replace tag by replacewith in paths.""" _, lower, upper = _get_caps_options(tag) - result = [] + result: list[str] = [] if isinstance(replacewith, (list, tuple)): for item in replacewith: result.extend(_replace_tag(paths, tag, item)) @@ -399,7 +417,7 @@ def _replace_tag(paths, tag, replacewith): return list(set(result)) -def _get_caps_options(tag): +def _get_caps_options(tag: str) -> tuple[str, bool, bool]: lower = False upper = False if tag.endswith(".lower"): @@ -411,7 +429,7 @@ def _get_caps_options(tag): return tag, lower, upper -def _apply_caps(original, lower, upper): +def _apply_caps(original: str, lower: bool, upper: bool) -> str: if lower: return original.lower() if upper: @@ -433,9 +451,7 @@ def _select_drs(input_type: str, project: str, structure: str) -> list[str]: return value msg = f"drs {structure} for {project} project not specified in config-developer file" - raise KeyError( - msg, - ) + raise KeyError(msg) @dataclass(order=True) @@ -470,7 +486,7 @@ def find_files(self, **facets) -> list[LocalFile]: globs = self.get_glob_patterns(**facets) logger.debug("Looking for files matching %s", globs) - files = [] + files: list[LocalFile] = [] for glob_ in globs: for filename in glob(str(glob_)): file = LocalFile(filename) @@ -579,7 +595,7 @@ def _templates_to_regex(self) -> str: return pattern -_ROOTPATH_WARNED = set() +_ROOTPATH_WARNED: set[tuple[str, tuple[str]]] = set() def _get_data_sources(project: str) -> list[DataSource]: @@ -615,9 +631,7 @@ def _get_data_sources(project: str) -> list[DataSource]: f"No '{project}' or 'default' path specified under 'rootpath' in " "the configuration." 
) - raise KeyError( - msg, - ) + raise KeyError(msg) def _get_output_file(variable: dict[str, Any], preproc_dir: Path) -> Path: @@ -831,5 +845,5 @@ def facets(self) -> Facets: return self._facets @facets.setter - def facets(self, value: Facets): + def facets(self, value: Facets) -> None: self._facets = value From b1caf65eb63f75bd41bf4605b672863b69ffd2f5 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:42:37 +0200 Subject: [PATCH 10/85] Add type hints to preprocessor/__init__.py --- esmvalcore/preprocessor/__init__.py | 125 +++++++++++++++------------- 1 file changed, 68 insertions(+), 57 deletions(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 8732e11f83..bbc8313127 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -9,7 +9,7 @@ from pprint import pformat from typing import TYPE_CHECKING, Any -from iris.cube import Cube +from iris.cube import Cube, CubeList from esmvalcore._provenance import TrackedFile from esmvalcore._task import BaseTask @@ -95,10 +95,12 @@ from ._weighting import weighting_landsea_fraction if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Callable, Iterable from dask.delayed import Delayed + from esmvalcore.dataset import Dataset, File + logger = logging.getLogger(__name__) __all__ = [ @@ -202,7 +204,7 @@ "save", ] -TIME_PREPROCESSORS = [ +TIME_PREPROCESSORS: list[str] = [ "clip_timerange", "extract_time", "extract_season", @@ -217,7 +219,7 @@ "regrid_time", ] -DEFAULT_ORDER = tuple(__all__) +DEFAULT_ORDER: tuple[str, ...] = tuple(__all__) """ By default, preprocessor functions are applied in this order. """ @@ -240,13 +242,13 @@ } -def _get_itype(step): +def _get_itype(step: str) -> str: """Get the input type of a preprocessor function.""" function = globals()[step] return next(iter(inspect.signature(function).parameters)) -def check_preprocessor_settings(settings): +def check_preprocessor_settings(settings: dict[str, Any]) -> None: """Check preprocessor settings.""" for step in settings: if step not in DEFAULT_ORDER: @@ -254,9 +256,7 @@ def check_preprocessor_settings(settings): f"Unknown preprocessor function '{step}', choose from: " f"{', '.join(DEFAULT_ORDER)}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) function = globals()[step] @@ -292,9 +292,7 @@ def check_preprocessor_settings(settings): f"encountered for preprocessor function {step}. 
\n" f"Valid arguments are: [{', '.join(args)}]" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # Check for missing arguments defaults = [ @@ -309,9 +307,7 @@ def check_preprocessor_settings(settings): f"Missing required argument(s) {missing_args} for " f"preprocessor function {step}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # Final sanity check in case the above fails to catch a mistake try: @@ -324,7 +320,7 @@ def check_preprocessor_settings(settings): raise -def _check_multi_model_settings(products): +def _check_multi_model_settings(products: Iterable[PreprocessorFile]) -> None: """Check that multi dataset settings are identical for all products.""" multi_model_steps = ( step @@ -345,16 +341,17 @@ def _check_multi_model_settings(products): f"{reference.filename} and {product.filename}, " f"{reference.settings[step]} and {settings}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) -def _get_multi_model_settings(products, step): +def _get_multi_model_settings( + products: set[PreprocessorFile], + step: str, +) -> tuple[dict[str, Any], set[PreprocessorFile]]: """Select settings for multi model step.""" _check_multi_model_settings(products) settings = {} - exclude = set() + exclude: set[PreprocessorFile] = set() for product in products: if step in product.settings: settings = product.settings[step] @@ -363,7 +360,12 @@ def _get_multi_model_settings(products, step): return settings, exclude -def _run_preproc_function(function, items, kwargs, input_files=None): +def _run_preproc_function( + function: Callable, + items: Any, + kwargs: Any, + input_files: list[File] | None = None, +) -> Any: """Run preprocessor function.""" kwargs_str = ",\n".join( [f"{k} = {pformat(v)}" for (k, v) in kwargs.items()], @@ -424,13 +426,13 @@ def _run_preproc_function(function, items, kwargs, input_files=None): def preprocess( - items, - step, - input_files=None, - output_file=None, - debug=False, - **settings, -): + items: list[PreprocessorFile | Cube | str | Path], + step: str, + input_files: list[File] | None = None, + output_file: Path | None = None, + debug: bool = False, + **settings: Any, +) -> list[PreprocessorFile | Cube | str | Path]: """Run preprocessor.""" logger.debug("Running preprocessor step %s", step) function = globals()[step] @@ -480,15 +482,18 @@ def preprocess( return items -def get_step_blocks(steps, order): +def get_step_blocks( + steps: Iterable[str], + order: list[str], +) -> list[list[str]]: """Group steps into execution blocks.""" - blocks = [] + blocks: list[list[str]] = [] prev_step_type = None for step in order[len(INITIAL_STEPS) : -len(FINAL_STEPS)]: if step in steps: step_type = step in MULTI_MODEL_FUNCTIONS if step_type is not prev_step_type: - block = [] + block: list[str] = [] blocks.append(block) prev_step_type = step_type block.append(step) @@ -504,7 +509,7 @@ def __init__( attributes: dict[str, Any] | None = None, settings: dict[str, Any] | None = None, datasets: list | None = None, - ): + ) -> None: if datasets is not None: # Load data using a Dataset input_files = [] @@ -519,8 +524,8 @@ def __init__( input_files = [] ancestors = [] - self.datasets = datasets - self._cubes = None + self.datasets: list[Dataset] | None = datasets + self._cubes: CubeList | None = None self._input_files = input_files # Set some preprocessor settings (move all defaults here?) 
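The ``get_step_blocks`` hunk above is the core of the execution planning:
walking the configured order, consecutive steps that agree on being
multi-model (or not) are collected into one block, since multi-model steps
need all products at once while single-model steps can run product by
product. A minimal, self-contained sketch of that grouping idea (the step
names are illustrative only, and the real function additionally slices off
``INITIAL_STEPS`` and ``FINAL_STEPS`` from the order before grouping):

    # Illustrative stand-in for MULTI_MODEL_FUNCTIONS.
    MULTI_MODEL = {"ensemble_statistics", "multi_model_statistics"}

    def group_blocks(steps: list[str]) -> list[list[str]]:
        """Group consecutive steps by their multi-model property."""
        blocks: list[list[str]] = []
        prev_is_mm = None
        for step in steps:
            is_mm = step in MULTI_MODEL
            if is_mm is not prev_is_mm:  # property changed: start a new block
                blocks.append([])
                prev_is_mm = is_mm
            blocks[-1].append(step)
        return blocks

    print(group_blocks(
        ["regrid", "annual_statistics", "multi_model_statistics", "convert_units"],
    ))
    # [['regrid', 'annual_statistics'], ['multi_model_statistics'], ['convert_units']]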
@@ -542,17 +547,15 @@ def __init__( ancestors=ancestors, ) - def check(self): + def check(self) -> None: """Check preprocessor settings.""" check_preprocessor_settings(self.settings) - def apply(self, step: str, debug: bool = False): + def apply(self, step: str, debug: bool = False) -> None: """Apply preprocessor step to product.""" if step not in self.settings: msg = f"PreprocessorFile {self} has no settings for step {step}" - raise ValueError( - msg, - ) + raise ValueError(msg) self.cubes = preprocess( self.cubes, step, @@ -563,20 +566,20 @@ def apply(self, step: str, debug: bool = False): ) @property - def cubes(self): + def cubes(self) -> CubeList: """Cubes.""" if self._cubes is None: - self._cubes = [ds.load() for ds in self.datasets] + self._cubes = [ds.load() for ds in self.datasets] # type: ignore return self._cubes @cubes.setter - def cubes(self, value): + def cubes(self, value: CubeList) -> None: self._cubes = value def save(self) -> Delayed | None: """Save cubes to disk.""" return preprocess( - self._cubes, + self._cubes, # type: ignore "save", input_files=self._input_files, **self.settings["save"], @@ -592,7 +595,7 @@ def close(self) -> Delayed | None: self.save_provenance() return result - def _update_attributes(self): + def _update_attributes(self) -> None: """Update product attributes from cube metadata.""" if not self._cubes: return @@ -616,11 +619,11 @@ def _update_attributes(self): self.attributes["frequency"] = ref_cube.attributes["frequency"] @property - def is_closed(self): + def is_closed(self) -> bool: """Check if the file is closed.""" return self._cubes is None - def _initialize_entity(self): + def _initialize_entity(self) -> None: """Initialize the provenance entity representing the file.""" super()._initialize_entity() settings = { @@ -651,7 +654,11 @@ def group(self, keys: list) -> str: return "_".join(identifier) -def _apply_multimodel(products, step, debug): +def _apply_multimodel( + products: set[PreprocessorFile], + step: str, + debug: bool | None, +) -> set[PreprocessorFile]: """Apply multi model step to products.""" settings, exclude = _get_multi_model_settings(products, step) @@ -660,7 +667,11 @@ def _apply_multimodel(products, step, debug): step, "\n".join(str(p) for p in products - exclude), ) - result = preprocess(products - exclude, step, **settings) + result: list[PreprocessorFile] = preprocess( # type: ignore + products - exclude, # type: ignore + step, + **settings, + ) products = set(result) | exclude if debug: @@ -683,7 +694,7 @@ def __init__( order: Iterable[str] = DEFAULT_ORDER, debug: bool | None = None, write_ncl_interface: bool = False, - ): + ) -> None: """Initialize.""" _check_multi_model_settings(products) super().__init__(name=name, products=products) @@ -691,13 +702,13 @@ def __init__( self.debug = debug self.write_ncl_interface = write_ncl_interface - def _initialize_product_provenance(self): + def _initialize_product_provenance(self) -> None: """Initialize product provenance.""" self._initialize_products(self.products) self._initialize_multimodel_provenance() self._initialize_ensemble_provenance() - def _initialize_multiproduct_provenance(self, step): + def _initialize_multiproduct_provenance(self, step: str) -> None: input_products = self._get_input_products(step) if input_products: statistic_products = set() @@ -711,23 +722,23 @@ def _initialize_multiproduct_provenance(self, step): self._initialize_products(statistic_products) - def _initialize_multimodel_provenance(self): + def _initialize_multimodel_provenance(self) -> None: 
"""Initialize provenance for multi-model statistics.""" step = "multi_model_statistics" self._initialize_multiproduct_provenance(step) - def _initialize_ensemble_provenance(self): + def _initialize_ensemble_provenance(self) -> None: """Initialize provenance for ensemble statistics.""" step = "ensemble_statistics" self._initialize_multiproduct_provenance(step) - def _get_input_products(self, step): + def _get_input_products(self, step: str) -> list[PreprocessorFile]: """Get input products.""" return [ product for product in self.products if step in product.settings ] - def _initialize_products(self, products): + def _initialize_products(self, products: set[PreprocessorFile]) -> None: """Initialize products.""" for product in products: product.initialize_provenance(self.activity) @@ -792,7 +803,7 @@ def _run(self, _) -> list[str]: # noqa: C901,PLR0912 self.write_ncl_interface, ) - def __str__(self): + def __str__(self) -> str: """Get human readable description.""" order = [ step From 19dbff94c72349a31492b350fe68bd9c21c97e02 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:45:39 +0200 Subject: [PATCH 11/85] Add type hints to compare_with_refs.py --- esmvalcore/preprocessor/_compare_with_refs.py | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/esmvalcore/preprocessor/_compare_with_refs.py b/esmvalcore/preprocessor/_compare_with_refs.py index e29cda36d0..d60f700d0b 100644 --- a/esmvalcore/preprocessor/_compare_with_refs.py +++ b/esmvalcore/preprocessor/_compare_with_refs.py @@ -119,9 +119,7 @@ def bias( "A list of Cubes is given to this preprocessor; please " "specify a `reference`" ) - raise ValueError( - msg, - ) + raise ValueError(msg) (reference, ref_product) = _get_ref(products, "reference_for_bias") else: ref_product = None @@ -178,9 +176,7 @@ def _get_ref(products, ref_tag: str) -> tuple[Cube, PreprocessorFile]: f"Expected exactly 1 dataset with '{ref_tag}: true', found " f"{len(ref_products):d}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) ref_product = ref_products[0] # Extract reference cube @@ -210,9 +206,7 @@ def _calculate_bias(cube: Cube, reference: Cube, bias_type: BiasType) -> Cube: f"Expected one of ['absolute', 'relative'] for bias_type, got " f"'{bias_type}'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) cube.metadata = cube_metadata cube.units = new_units @@ -345,9 +339,7 @@ def distance_metric( "A list of Cubes is given to this preprocessor; please " "specify a `reference`" ) - raise ValueError( - msg, - ) + raise ValueError(msg) reference, reference_product = _get_ref( products, "reference_for_metric", @@ -403,18 +395,14 @@ def _calculate_metric( f"distance metric calculation, got {cube.shape} and " f"{reference.shape}, respectively" ) - raise ValueError( - msg, - ) + raise ValueError(msg) try: cube + reference # dummy operation to check if cubes are compatible except Exception as exc: msg = ( "Cannot calculate distance metric between cube and reference cube " ) - raise ValueError( - msg, - ) from exc + raise ValueError(msg) from exc # Perform the actual calculation of the distance metric # Note: we work on arrays here instead of cube to stay as flexible as @@ -436,9 +424,7 @@ def _calculate_metric( msg = ( f"Expected one of {list(metrics_funcs)} for metric, got '{metric}'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) (res_data, res_metadata) = metrics_funcs[metric](cube, reference, coords) # Get result cube with correct dimensional metadata by using dummy @@ -589,7 +575,11 @@ 
def _calculate_emd( return (emd, metadata) -def _get_emd(arr, ref_arr, bin_centers): +def _get_emd( + arr: np.ndarray, + ref_arr: np.ndarray, + bin_centers: np.ndarray, +) -> np.ndarray: """Calculate Earth mover's distance (non-lazy).""" if np.ma.is_masked(arr) or np.ma.is_masked(ref_arr): return np.ma.masked # this is safe because PMFs will be masked arrays From d8ea7d9a9be3856b723e4a1502a1bb8960d2b488 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:46:13 +0200 Subject: [PATCH 12/85] Add type hints to _derive/__init__.py --- esmvalcore/preprocessor/_derive/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cbd181d2c8..cd209f88de 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -144,8 +144,6 @@ def derive( f"Units '{cube.units}' after executing derivation script of " f"'{short_name}' cannot be converted to target units '{units}'" ) - raise ValueError( - msg, - ) from exc + raise ValueError(msg) from exc return cube From 367bfe70672a99236bf1b01c5f849b5854275d9d Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 17:49:11 +0200 Subject: [PATCH 13/85] Add type hints to some derive functions --- esmvalcore/preprocessor/_derive/ohc.py | 9 ++++++--- esmvalcore/preprocessor/_derive/vegfrac.py | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index 47aa7b2fc8..87643d2d1b 100644 --- a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -3,6 +3,9 @@ import iris from cf_units import Unit from iris import Constraint +from iris.cube import Cube, CubeList + +from esmvalcore.typing import Facets, FacetValue from ._baseclass import DerivedVariableBase @@ -13,9 +16,9 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `ohc`.""" @staticmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" - volcello = {"short_name": "volcello", "mip": "fx"} + volcello: Facets = {"short_name": "volcello", "mip": "fx"} if project == "CMIP5": volcello["ensemble"] = "r0i0p0" elif project == "CMIP6": @@ -23,7 +26,7 @@ def required(project): return [{"short_name": "thetao"}, volcello] @staticmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """ Compute ocean heat content. 
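The typed ``required`` above keeps the project-dependent facet logic intact:
for CMIP5, the ``volcello`` facets gain the ``r0i0p0`` ensemble. Assuming an
environment with this branch installed, a quick sanity check (matching what
the integration test updated later in this series asserts) would be:

    from esmvalcore.preprocessor._derive import ohc

    # For CMIP5, the fx variable volcello needs the r0i0p0 ensemble.
    assert ohc.DerivedVariable.required("CMIP5") == [
        {"short_name": "thetao"},
        {"short_name": "volcello", "mip": "fx", "ensemble": "r0i0p0"},
    ]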
diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index c48e723f42..edd4dce75d 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -3,8 +3,10 @@ import dask.array as da import iris from iris import NameConstraint +from iris.cube import Cube, CubeList from esmvalcore.preprocessor._regrid import regrid +from esmvalcore.typing import Facets, FacetValue from ._baseclass import DerivedVariableBase @@ -13,9 +15,9 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `vegFrac`.""" @staticmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" - sftlf = {"short_name": "sftlf", "mip": "fx"} + sftlf: Facets = {"short_name": "sftlf", "mip": "fx"} if project == "CMIP5": sftlf["ensemble"] = "r0i0p0" return [ @@ -25,7 +27,7 @@ def required(project): ] @staticmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """Compute vegetation fraction from bare soil fraction.""" baresoilfrac_cube = cubes.extract_cube( NameConstraint(var_name="baresoilFrac"), From 5bbe6ce4b5a6f1584435da339c56840471b9d239 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:16:57 +0200 Subject: [PATCH 14/85] Add type hints to _regrid.py --- esmvalcore/preprocessor/_regrid.py | 216 ++++++++++++++--------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 0489ad881f..62b1b27e83 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -50,6 +50,9 @@ if TYPE_CHECKING: from collections.abc import Iterable + from iris.coords import Coord + from numpy.typing import ArrayLike + from esmvalcore.dataset import Dataset logger = logging.getLogger(__name__) @@ -117,7 +120,7 @@ } # Supported vertical interpolation schemes. -VERTICAL_SCHEMES = ( +VERTICAL_SCHEMES: tuple[str, ...] = ( "linear", "nearest", "linear_extrapolate", @@ -125,12 +128,12 @@ ) -def parse_cell_spec(spec): +def parse_cell_spec(spec: str) -> tuple[float, float]: """Parse an MxN cell specification string. Parameters ---------- - spec: str + spec: ``MxN`` degree cell-specification for the global grid. Returns @@ -171,16 +174,20 @@ def parse_cell_spec(spec): return dlon, dlat -def _generate_cube_from_dimcoords(latdata, londata, circular: bool = False): +def _generate_cube_from_dimcoords( + latdata: ArrayLike, + londata: ArrayLike, + circular: bool = False, +) -> Cube: """Generate cube from lat/lon points. Parameters ---------- - latdata : np.ndarray + latdata: List of latitudes. - londata : np.ndarray + londata: List of longitudes. - circular : bool + circular Wrap longitudes around the full great circle. Bounds will not be generated for circular coordinates. @@ -217,7 +224,11 @@ def _generate_cube_from_dimcoords(latdata, londata, circular: bool = False): @functools.lru_cache -def _global_stock_cube(spec, lat_offset=True, lon_offset=True): +def _global_stock_cube( + spec: str, + lat_offset: bool = True, + lon_offset: bool = True, +) -> Cube: """Create a stock cube. Create a global cube with M degree-east by N degree-north regular grid @@ -229,13 +240,13 @@ def _global_stock_cube(spec, lat_offset=True, lon_offset=True): Parameters ---------- - spec : str + spec Specifies the 'MxN' degree cell-specification for the global grid. 
- lat_offset : bool + lat_offset Offset the grid centers of the latitude coordinate w.r.t. the pole by half a grid step. This argument is ignored if `target_grid` is a cube or file. - lon_offset : bool + lon_offset Offset the grid centers of the longitude coordinate w.r.t. Greenwich meridian by half a grid step. This argument is ignored if `target_grid` is a cube or file. @@ -282,7 +293,7 @@ def _spec_to_latlonvals( start_longitude: float, end_longitude: float, step_longitude: float, -) -> tuple: +) -> tuple[np.ndarray, np.ndarray]: """Define lat/lon values from spec. Create a regional cube starting defined by the target specification. @@ -292,54 +303,48 @@ def _spec_to_latlonvals( Parameters ---------- - start_latitude : float + start_latitude: Latitude value of the first grid cell center (start point). The grid includes this value. - end_latitude : float + end_latitude: Latitude value of the last grid cell center (end point). The grid includes this value only if it falls on a grid point. Otherwise, it cuts off at the previous value. - step_latitude : float + step_latitude: Latitude distance between the centers of two neighbouring cells. - start_longitude : float + start_longitude: Latitude value of the first grid cell center (start point). The grid includes this value. - end_longitude : float + end_longitude: Longitude value of the last grid cell center (end point). The grid includes this value only if it falls on a grid point. Otherwise, it cuts off at the previous value. - step_longitude : float + step_longitude: Longitude distance between the centers of two neighbouring cells. Returns ------- - xvals : np.array + xvals: np.array List of longitudes - yvals : np.array + yvals: np.array List of latitudes """ if step_latitude == 0: msg = f"Latitude step cannot be 0, got step_latitude={step_latitude}." - raise ValueError( - msg, - ) + raise ValueError(msg) if step_longitude == 0: msg = ( f"Longitude step cannot be 0, got step_longitude={step_longitude}." ) - raise ValueError( - msg, - ) + raise ValueError(msg) if (start_latitude < _LAT_MIN) or (end_latitude > _LAT_MAX): msg = ( f"Latitude values must lie between {_LAT_MIN}:{_LAT_MAX}, " f"got start_latitude={start_latitude}:end_latitude={end_latitude}." ) - raise ValueError( - msg, - ) + raise ValueError(msg) def get_points(start, stop, step): """Calculate grid points.""" @@ -354,7 +359,7 @@ def get_points(start, stop, step): return latitudes, longitudes -def _regional_stock_cube(spec: dict): +def _regional_stock_cube(spec: dict[str, Any]) -> Cube: """Create a regional stock cube. Returns @@ -369,7 +374,7 @@ def _regional_stock_cube(spec: dict): circular=True, ) - def add_bounds_from_step(coord, step): + def add_bounds_from_step(coord: Coord, step: float) -> np.ndarray: """Calculate bounds from the given step.""" bound = step / 2 points = coord.points @@ -381,7 +386,7 @@ def add_bounds_from_step(coord, step): return cube -def extract_location(cube, location, scheme): +def extract_location(cube: Cube, location: str, scheme: str) -> Cube: """Extract a point using a location name, with interpolation. Extracts a single location point from a cube, according @@ -399,20 +404,19 @@ def extract_location(cube, location, scheme): Parameters ---------- - cube : cube + cube: The source cube to extract a point from. - - location : str + location: The reference location. Examples: 'mount everest', 'romania','new york, usa' - - scheme : str + scheme: The interpolation scheme. 'linear' or 'nearest'. No default. 
Returns ------- - Returns a cube with the extracted point, and with adjusted - latitude and longitude coordinates. + iris.cube.Cube + Returns a cube with the extracted point, and with adjusted latitude and + longitude coordinates. Raises ------ @@ -429,17 +433,13 @@ def extract_location(cube, location, scheme): " Examples: 'mount everest', 'romania'," " 'new york, usa'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) if scheme is None: msg = ( "Interpolation scheme needs to be specified." " Use either 'linear' or 'nearest'." ) - raise ValueError( - msg, - ) + raise ValueError(msg) try: # Try to use the default SSL context, see # https://github.com/ESMValGroup/ESMValCore/issues/2012 for more @@ -473,7 +473,12 @@ def extract_location(cube, location, scheme): ) -def extract_point(cube, latitude, longitude, scheme): +def extract_point( + cube: Cube, + latitude: ArrayLike, + longitude: ArrayLike, + scheme: str, +) -> Cube: """Extract a point, with interpolation. Extracts a single latitude/longitude point from a cube, according @@ -493,13 +498,13 @@ def extract_point(cube, latitude, longitude, scheme): Parameters ---------- - cube : cube + cube: The source cube to extract a point from. - - latitude, longitude : float, or array of float - The latitude and longitude of the point. - - scheme : str + latitude: + The latitude of the point. + longitude: + The longitude of the point. + scheme: The interpolation scheme. 'linear' or 'nearest'. No default. Returns @@ -544,15 +549,15 @@ def extract_point(cube, latitude, longitude, scheme): array([ 1, 5, 17, 21, 33, 37, 49, 53]) """ msg = f"Unknown interpolation scheme, got {scheme!r}." - scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) - if not scheme: + loaded_scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) + if not loaded_scheme: raise ValueError(msg) point = [("latitude", latitude), ("longitude", longitude)] - return cube.interpolate(point, scheme=scheme) + return cube.interpolate(point, scheme=loaded_scheme) -def is_dataset(dataset): +def is_dataset(dataset: Any) -> bool: """Test if something is an `esmvalcore.dataset.Dataset`.""" # Use this function to avoid circular imports return hasattr(dataset, "facets") @@ -624,9 +629,7 @@ def _load_scheme(src_cube: Cube, tgt_cube: Cube, scheme: str | dict): f"Regridding scheme '{scheme}' not available for {grid_type} " f"data, expected one of: {', '.join(schemes)}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) loaded_scheme = schemes[scheme] logger.debug("Loaded regridding scheme %s", loaded_scheme) @@ -642,9 +645,7 @@ def _load_generic_scheme(scheme: dict): object_ref = scheme.pop("reference") except KeyError as key_err: msg = "No reference specified for generic regridding." - raise ValueError( - msg, - ) from key_err + raise ValueError(msg) from key_err module_name, separator, scheme_name = object_ref.partition(":") try: obj: Any = importlib.import_module(module_name) @@ -654,9 +655,7 @@ def _load_generic_scheme(scheme: dict): f"'{module_name}'. Please double check spelling and that the " f"required module is installed." 
) - raise ValueError( - msg, - ) from import_err + raise ValueError(msg) from import_err if separator: for attr in scheme_name.split("."): obj = getattr(obj, attr) @@ -720,7 +719,7 @@ def _get_regridder( return regridder -def _get_coord_key(src_cube: Cube, tgt_cube: Cube) -> tuple: +def _get_coord_key(src_cube: Cube, tgt_cube: Cube) -> tuple[ArrayLike, ...]: """Get dict key from coordinates.""" src_lat = src_cube.coord("latitude") src_lon = src_cube.coord("longitude") @@ -733,7 +732,7 @@ def _get_name_and_shape_key( src_cube: Cube, tgt_cube: Cube, scheme: str | dict, -) -> tuple: +) -> tuple[str, tuple[int, ...]]: """Get dict key from scheme name and coordinate shapes.""" name = str(scheme) shapes = [c.shape for c in _get_coord_key(src_cube, tgt_cube)] @@ -984,7 +983,12 @@ def _horizontal_grid_is_close(cube1: Cube, cube2: Cube) -> bool: return True -def _create_cube(src_cube, data, src_levels, levels): +def _create_cube( + src_cube: Cube, + data: ArrayLike, + src_levels: ArrayLike, + levels: ArrayLike, +) -> Cube: """Generate a new cube with the interpolated data. The resultant cube is seeded with `src_cube` metadata and coordinates, @@ -995,14 +999,14 @@ def _create_cube(src_cube, data, src_levels, levels): Parameters ---------- - src_cube : cube + src_cube The source cube that was vertically interpolated. - data : array + data The payload resulting from interpolating the source cube over the specified levels. - src_levels : array + src_levels Vertical levels of the source data - levels : array + levels The vertical levels of interpolation. Returns @@ -1074,7 +1078,7 @@ def _create_cube(src_cube, data, src_levels, levels): # Collapse the z-dimension for the scalar case. if levels.size == 1: - slicer = [slice(None)] * result.ndim + slicer: list[slice | int] = [slice(None)] * result.ndim slicer[z_dim] = 0 result = result[tuple(slicer)] @@ -1082,12 +1086,12 @@ def _create_cube(src_cube, data, src_levels, levels): def _vertical_interpolate( - cube, - src_levels, - levels, - interpolation, - extrapolation, -): + cube: Cube, + src_levels: ArrayLike, + levels: ArrayLike, + interpolation: str, + extrapolation: str, +) -> Cube: """Perform vertical interpolation.""" # Determine the source levels and axis for vertical interpolation. (z_axis,) = cube.coord_dims(cube.coord(axis="z", dim_coords=True)) @@ -1177,12 +1181,12 @@ def _preserve_fx_vars(cube: iris.cube.Cube, result: iris.cube.Cube) -> None: add_ancillary_variable(result, ancillary_cube) -def parse_vertical_scheme(scheme): +def parse_vertical_scheme(scheme: str) -> tuple[str, str]: """Parse the scheme provided for level extraction. Parameters ---------- - scheme : str + scheme: The vertical interpolation scheme to use. Choose from 'linear', 'nearest', @@ -1191,7 +1195,7 @@ def parse_vertical_scheme(scheme): Returns ------- - (str, str) + tuple[str, str] A tuple containing the interpolation and extrapolation scheme. """ # Check if valid scheme is given @@ -1200,9 +1204,7 @@ def parse_vertical_scheme(scheme): f"Unknown vertical interpolation scheme, got '{scheme}', possible " f"schemes are {VERTICAL_SCHEMES}" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # This allows us to put level 0. to load the ocean surface. extrap_scheme = "nan" @@ -1226,7 +1228,7 @@ def extract_levels( coordinate: str | None = None, rtol: float = 1e-7, atol: float | None = None, -): +) -> Cube: """Perform vertical interpolation. 
Parameters @@ -1344,19 +1346,19 @@ def extract_levels( return result -def get_cmor_levels(cmor_table, coordinate): +def get_cmor_levels(cmor_table: str, coordinate: str) -> list[float]: """Get level definition from a CMOR coordinate. Parameters ---------- - cmor_table: str + cmor_table: CMOR table name - coordinate: str + coordinate: CMOR coordinate name Returns ------- - list[int] + list[float] Raises ------ @@ -1366,15 +1368,11 @@ def get_cmor_levels(cmor_table, coordinate): """ if cmor_table not in CMOR_TABLES: msg = f"Level definition cmor_table '{cmor_table}' not available" - raise ValueError( - msg, - ) + raise ValueError(msg) if coordinate not in CMOR_TABLES[cmor_table].coords: msg = f"Coordinate {coordinate} not available for {cmor_table}" - raise ValueError( - msg, - ) + raise ValueError(msg) cmor = CMOR_TABLES[cmor_table].coords[coordinate] @@ -1387,17 +1385,15 @@ def get_cmor_levels(cmor_table, coordinate): f"Coordinate {coordinate} in {cmor_table} does not have requested " f"values" ) - raise ValueError( - msg, - ) + raise ValueError(msg) -def get_reference_levels(dataset): +def get_reference_levels(dataset: Dataset) -> list[float]: """Get level definition from a reference dataset. Parameters ---------- - dataset: esmvalcore.dataset.Dataset + dataset: Dataset containing the reference files. Returns @@ -1423,7 +1419,11 @@ def get_reference_levels(dataset): @preserve_float_dtype -def extract_coordinate_points(cube, definition, scheme): +def extract_coordinate_points( + cube: Cube, + definition: dict[str, ArrayLike], + scheme: str, +) -> Cube: """Extract points from any coordinate with interpolation. Multiple points can also be extracted, by supplying an array of @@ -1434,11 +1434,11 @@ def extract_coordinate_points(cube, definition, scheme): Parameters ---------- - cube : cube + cube: The source cube to extract a point from. - definition : dict(str, float or array of float) + definition: The coordinate - values pairs to extract - scheme : str + scheme: The interpolation scheme. 'linear' or 'nearest'. No default. Returns @@ -1455,7 +1455,7 @@ def extract_coordinate_points(cube, definition, scheme): If the interpolation scheme is not provided or is not recognised. """ msg = f"Unknown interpolation scheme, got {scheme!r}." 
- scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) - if not scheme: + loaded_scheme = POINT_INTERPOLATION_SCHEMES.get(scheme.lower()) + if not loaded_scheme: raise ValueError(msg) - return cube.interpolate(definition.items(), scheme=scheme) + return cube.interpolate(definition.items(), scheme=loaded_scheme) From d10de1edcf2f4b3eab722cdda9f8283af00a4944 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:18:01 +0200 Subject: [PATCH 15/85] Make new dataset methods private --- esmvalcore/dataset.py | 10 +++++----- tests/unit/test_dataset.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index d471dde5d8..1391916637 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -136,7 +136,7 @@ def __init__(self, **facets: FacetValue) -> None: for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) - if not self.is_derived() and self.facets.get( + if not self._is_derived() and self.facets.get( "force_derivation", False, ): @@ -174,14 +174,14 @@ def from_recipe( return datasets_from_recipe(recipe, session) - def is_derived(self) -> bool: + def _is_derived(self) -> bool: """Return ``True`` for derived variables, ``False`` otherwise.""" return bool(self.facets.get("derive", False)) - def derivation_necessary(self) -> bool: + def _derivation_necessary(self) -> bool: """Return ``True`` if derivation is necessary, ``False`` otherwise.""" # If variable cannot be derived, derivation is not necessary - if not self.is_derived(): + if not self._is_derived(): return False # If forced derivation is requested, derivation is necessary @@ -651,7 +651,7 @@ def add_supplementary(self, **facets: FacetValue) -> None: **facets Facets describing the supplementary variable. 
""" - if self.is_derived(): + if self._is_derived(): facets.setdefault("derive", False) if self.facets.get("force_derivation", False): facets.setdefault("force_derivation", False) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 9624f697f6..5ae4b6d520 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2147,7 +2147,7 @@ def test_derivation_necessary_no_derivation(): type="sat", timerange="1980/2000", ) - assert not dataset.derivation_necessary() + assert not dataset._derivation_necessary() def test_derivation_necessary_no_force_derivation_no_files(): @@ -2161,7 +2161,7 @@ def test_derivation_necessary_no_force_derivation_no_files(): timerange="1980/2000", derive=True, ) - assert dataset.derivation_necessary() + assert dataset._derivation_necessary() def test_derivation_necessary_no_force_derivation(tmp_path, session): @@ -2182,7 +2182,7 @@ def test_derivation_necessary_no_force_derivation(tmp_path, session): input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", ) asr_file.touch() - assert not dataset.derivation_necessary() + assert not dataset._derivation_necessary() def test_derivation_necessary_force_derivation(tmp_path, session): @@ -2204,7 +2204,7 @@ def test_derivation_necessary_force_derivation(tmp_path, session): input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", ) asr_file.touch() - assert dataset.derivation_necessary() + assert dataset._derivation_necessary() def test_force_derivation_no_derived(): From 732386623abd9f38577f977255a02226300f071f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:27:31 +0200 Subject: [PATCH 16/85] Small fix --- esmvalcore/_recipe/recipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e14d9201e2..e34c7fc52b 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -97,9 +97,9 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str: ) ) raise RecipeError(msg) - dataset_name = str(facets[special_name]) + special_name = str(facets[special_name]) - return dataset_name + return special_name def _update_target_levels( From 3ab2cdfa1ee7e8f7177bce35643983ac92264697 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:36:54 +0200 Subject: [PATCH 17/85] Fix test --- tests/integration/preprocessor/_derive/test_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/preprocessor/_derive/test_interface.py b/tests/integration/preprocessor/_derive/test_interface.py index 4c11466fcf..8491c0eb2c 100644 --- a/tests/integration/preprocessor/_derive/test_interface.py +++ b/tests/integration/preprocessor/_derive/test_interface.py @@ -202,7 +202,7 @@ def test_get_required_with_fx(): reference = [ {"short_name": "thetao"}, - {"short_name": "volcello", "mip": "fx"}, + {"short_name": "volcello", "mip": "fx", "ensemble": "r0i0p0"}, ] assert variables == reference From 099349f5575177eee6220525af844791b5fe1504 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:39:02 +0200 Subject: [PATCH 18/85] Fix mock --- tests/integration/preprocessor/_derive/test_interface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/preprocessor/_derive/test_interface.py b/tests/integration/preprocessor/_derive/test_interface.py index 8491c0eb2c..502ef3ba75 100644 --- a/tests/integration/preprocessor/_derive/test_interface.py +++ 
b/tests/integration/preprocessor/_derive/test_interface.py @@ -22,7 +22,10 @@ def mock_cubes(): @pytest.fixture def patched_derive(mocker): """Fixture for mocked derivation scripts.""" - mocker.patch("iris.cube.CubeList", side_effect=lambda x: x) + mocker.patch( + "esmvalcore.preprocessor._derive.CubeList", + side_effect=lambda x: x, + ) mocker.patch.object(_derive, "ALL_DERIVED_VARIABLES", autospec=True) mocker.patch.object(_derive, "logger", autospec=True) From 86b308b58b798ea0fe4ab13a29060b738e3f9a38 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Sun, 13 Jul 2025 18:47:48 +0200 Subject: [PATCH 19/85] 100% test coverage --- .../unit/preprocessor/_derive/test_vegfrac.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/unit/preprocessor/_derive/test_vegfrac.py diff --git a/tests/unit/preprocessor/_derive/test_vegfrac.py b/tests/unit/preprocessor/_derive/test_vegfrac.py new file mode 100644 index 0000000000..6c86037e23 --- /dev/null +++ b/tests/unit/preprocessor/_derive/test_vegfrac.py @@ -0,0 +1,23 @@ +"""Test derivation of `vegfrac`.""" + +from esmvalcore.preprocessor._derive import vegfrac + + +def test_vegfrac_required_cmip5(): + derived_var = vegfrac.DerivedVariable() + output = derived_var.required("CMIP5") + assert output == [ + {"short_name": "baresoilFrac"}, + {"short_name": "residualFrac"}, + {"short_name": "sftlf", "mip": "fx", "ensemble": "r0i0p0"}, + ] + + +def test_vegfrac_required_cmip6(): + derived_var = vegfrac.DerivedVariable() + output = derived_var.required("CMIP6") + assert output == [ + {"short_name": "baresoilFrac"}, + {"short_name": "residualFrac"}, + {"short_name": "sftlf", "mip": "fx"}, + ] From 369a8114699c6d06bb731a060025bedff7ea34cd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 09:11:57 +0200 Subject: [PATCH 20/85] Clean doc --- doc/develop/derivation.rst | 34 ++++++++++++++++++++++++---------- doc/quickstart/configure.rst | 6 +++--- doc/recipe/preprocessor.rst | 8 +------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/doc/develop/derivation.rst b/doc/develop/derivation.rst index 9d097ff843..c8516d8414 100644 --- a/doc/develop/derivation.rst +++ b/doc/develop/derivation.rst @@ -14,6 +14,11 @@ A typical example looks like this: .. code-block:: py """Derivation of variable `dummy`.""" + + from iris.cube import Cube, CubeList + + from esmvalcore.typing import Facets, FacetValue + from ._baseclass import DerivedVariableBase @@ -21,19 +26,19 @@ A typical example looks like this: """Derivation of variable `dummy`.""" @staticmethod - def required(project): + def required(project: FacetValue) -> list[Facets]: """Declare the variables needed for derivation.""" - mip = 'fx' - if project == 'CMIP6': - mip = 'Ofx' + mip = "fx" + if project == "CMIP6": + mip = "Ofx" required = [ - {'short_name': 'var_a'}, - {'short_name': 'var_b', 'mip': mip, 'optional': True}, + {"short_name": "var_a"}, + {"short_name": "var_b", "mip": mip, "optional": True}, ] return required @staticmethod - def calculate(cubes): + def calculate(cubes: CubeList) -> Cube: """Compute `dummy`.""" # `cubes` is a CubeList containing all required variables. @@ -42,7 +47,7 @@ A typical example looks like this: # Return single cube at the end return cube -The static function ``required(project)`` returns a ``list`` of ``dict`` +The static function ``required(project)`` returns a :obj:`list` of :obj:`~esmvalcore.typing.Facets` containing all required variables for deriving the derived variable. 
Its only argument is the ``project`` of the specific dataset. In this particular example script, the derived variable ``dummy`` is derived from ``var_a`` and @@ -56,5 +61,14 @@ Otherwise, the tool will fail if not all required variables are available for all datasets. The actual derivation takes place in the static function ``calculate(cubes)`` -which returns a single ``cube`` containing the derived variable. Its only -argument ``cubes`` is a ``CubeList`` containing all required variables. +which returns a single :class:`~iris.cube.Cube` containing the derived +variable. Its only argument ``cubes`` is a :class:`~iris.cube.CubeList` +containing all required variables. + +If no MIP table entry for the derived variable exists for the given ``mip``, +the tool will also look in other ``mip`` tables for the same ``project`` to find +the definition of derived variables. To contribute a completely new derived +variable, it is necessary to define a name for it and to provide the +corresponding CMOR table. This is to guarantee the proper metadata definition +is attached to the derived data. Such custom CMOR tables are collected as part +of the `ESMValCore package `_. diff --git a/doc/quickstart/configure.rst b/doc/quickstart/configure.rst index 7a78f850b3..d20d67c8e2 100644 --- a/doc/quickstart/configure.rst +++ b/doc/quickstart/configure.rst @@ -1011,7 +1011,7 @@ related to CMOR table settings available: from the ``esmvalcore/cmor/tables/custom`` directory) and it is possible to use variables with a ``mip`` which is different from the MIP table in which they are defined. Note that this option is always enabled for - :ref:`derived ` variables. + :ref:`derived variables `. * ``cmor_path``: path to the CMOR table. Relative paths are with respect to `esmvalcore/cmor/tables`_. Defaults to the value provided in ``cmor_type`` written in lower case. @@ -1026,8 +1026,8 @@ Custom CMOR tables As mentioned in the previous section, the CMOR tables of projects that use ``cmor_strict: false`` will be extended with custom CMOR tables. -For derived variables (the ones with ``derive: true`` in the recipe), the -custom CMOR tables will always be considered. +For :ref:`derived variables ` (the ones with ``derive: +true`` in the recipe), the custom CMOR tables will always be considered. By default, these custom tables are loaded from `esmvalcore/cmor/tables/custom `_. However, by using the special project ``custom`` in the diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index bedec06cbe..a1d603f289 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -196,17 +196,11 @@ case of this operation is the evaluation of a variable which is only available in an observational dataset but not in the models. In this case a derivation function is provided by the ESMValCore in order to calculate the variable and perform the comparison. For example, several observational datasets deliver -total column ozone as observed variable (`toz`), but CMIP models only provide +total column ozone as observed variable (``toz``), but CMIP models only provide the ozone 3D field. In this case, a derivation function is provided to vertically integrate the ozone and obtain total column ozone for direct comparison with the observations. -The tool will also look in other ``mip`` tables for the same ``project`` to find -the definition of derived variables. To contribute a completely new derived -variable, it is necessary to define a name for it and to provide the -corresponding CMOR table. 
This is to guarantee the proper metadata definition -is attached to the derived data. Such custom CMOR tables are collected as part -of the `ESMValCore package `_. By default, the variable derivation will be applied only if the variable is not already available in the input data, but the derivation can be forced by setting the ``force_derivation`` flag. From c2a3d81e4b7a3ba86bc27d1936d37eebbcaae53d Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 10:12:43 +0200 Subject: [PATCH 21/85] 100% diff coverage --- .../preprocessor/_derive/test_interface.py | 6 ++ tests/integration/recipe/test_recipe.py | 86 +++++++++++++++++++ tests/integration/test_local.py | 16 +++- .../_multimodel/test_multimodel.py | 33 ++++++- 4 files changed, 138 insertions(+), 3 deletions(-) diff --git a/tests/integration/preprocessor/_derive/test_interface.py b/tests/integration/preprocessor/_derive/test_interface.py index 502ef3ba75..28f41f8693 100644 --- a/tests/integration/preprocessor/_derive/test_interface.py +++ b/tests/integration/preprocessor/_derive/test_interface.py @@ -211,6 +211,12 @@ def test_get_required_with_fx(): assert variables == reference +def test_get_required_invalid_var(): + msg = r"Cannot derive variable '_invalid_var_'" + with pytest.raises(NotImplementedError, match=msg): + get_required("_invalid_var_", "CMIP5") + + def test_derive_nonstandard_nofx(): """Test a specific derivation.""" short_name = "alb" diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index f45c547884..0ab6e12637 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1,3 +1,4 @@ +import inspect import os import re from collections import defaultdict @@ -1542,6 +1543,42 @@ def test_diagnostic_task_provenance( assert os.path.exists(prefix + ".xml") +def test_invalid_diagnostcic_ancestor( + tmp_path, + patched_datafinder, + session, +): + content = dedent( + """ + diagnostics: + diagnostic_name: + themes: + - phys + realms: + - atmos + variables: + tas: + project: CMIP5 + mip: Amon + exp: historical + timerange: 2000/2005 + ensemble: r1i1p1 + additional_datasets: + - dataset: CanESM2 + scripts: + script_name: + script: examples/diagnostic.py + script_name2: + script: examples/diagnostic.py + ancestors: [invalid_*] + """, + ) + + msg = r"Could not find any ancestors matching" + with pytest.raises(RecipeError, match=msg): + get_recipe(tmp_path, content, session) + + def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, PLR0912 content = dedent(""" diagnostics: @@ -2892,6 +2929,55 @@ def test_statistics_missing_operator_no_default_fail( get_recipe(tmp_path, content, session) +def test_check_preprocessor_settings_last_resort( + mocker, + tmp_path, + caplog, + patched_datafinder, + session, +): + # Create mock so that no errors during the regular preprocessor parameter + # checks are raised, but only during the last sanity check + def raise_exc(): + msg = "type error" + raise TypeError(msg) + + mock_args = mocker.Mock(name="args", kind=inspect.Parameter.VAR_POSITIONAL) + mock_bind = mocker.Mock(side_effect=raise_exc) + mock_signature = mocker.Mock( + parameters={"args": mock_args}, + bind=mock_bind, + ) + mocker.patch( + "inspect.signature", + autospec=True, + return_value=mock_signature, + ) + content = dedent(""" + diagnostics: + diagnostic_name: + variables: + chl_default: + short_name: chl + mip: Oyr + timerange: '2000/2010' + additional_datasets: + - project: CMIP5 + dataset: CanESM2 + exp: 
historical + ensemble: r1i1p1 + scripts: null + """) + with pytest.raises(TypeError): + get_recipe(tmp_path, content, session) + log_errors = [r.message for r in caplog.records if r.levelname == "ERROR"] + msg = ( + "Wrong preprocessor function arguments in function " + "'remove_supplementary_variables'" + ) + assert msg in log_errors + + @pytest.mark.parametrize( ("preproc", "option"), [ diff --git a/tests/integration/test_local.py b/tests/integration/test_local.py index 839c0d159b..e2dae85dff 100644 --- a/tests/integration/test_local.py +++ b/tests/integration/test_local.py @@ -8,7 +8,12 @@ import yaml from esmvalcore.config import CFG -from esmvalcore.local import LocalFile, _get_output_file, find_files +from esmvalcore.local import ( + LocalFile, + _get_output_file, + _select_drs, + find_files, +) # Load test configuration with open( @@ -124,3 +129,12 @@ def test_find_files_with_facets(monkeypatch, root): assert sorted([Path(f) for f in input_filelist]) == sorted(ref_files) assert isinstance(input_filelist[0], LocalFile) assert input_filelist[0].facets + + +def test_select_invalid_drs_structure(): + msg = ( + r"drs _INVALID_STRUCTURE_ for CMIP6 project not specified in " + r"config-developer file" + ) + with pytest.raises(KeyError, match=msg): + _select_drs("input_dir", "CMIP6", "_INVALID_STRUCTURE_") diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index 1fd510eb50..7e497f4a1c 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -15,7 +15,10 @@ import esmvalcore.preprocessor._multimodel as mm from esmvalcore.iris_helpers import date2num -from esmvalcore.preprocessor import multi_model_statistics +from esmvalcore.preprocessor import ( + _check_multi_model_settings, + multi_model_statistics, +) from esmvalcore.preprocessor._supplementary_vars import add_ancillary_variable SPAN_OPTIONS = ("overlap", "full") @@ -835,11 +838,21 @@ def test_unify_time_coordinates(): class PreprocessorFile: """Mockup to test output of multimodel.""" - def __init__(self, cube=None, attributes=None): + def __init__( + self, + cube=None, + attributes=None, + filename=None, + settings=None, + ): if cube: self.cubes = [cube] if attributes: self.attributes = attributes + if filename: + self.filename = filename + if settings: + self.settings = settings def wasderivedfrom(self, product): pass @@ -1698,3 +1711,19 @@ def test_get_operator_and_kwargs_operator_missing(statistic): def test_get_stat_identifier(statistic, output): """Test ``_get_stat_identifier``.""" assert mm._get_stat_identifier(statistic) == output + + +def test_differing_multi_model_settings(): + products = [ + PreprocessorFile( + filename="a", + settings={"multi_model_statistics": {"statistics": ["mean"]}}, + ), + PreprocessorFile( + filename="b", + settings={"multi_model_statistics": {"statistics": ["median"]}}, + ), + ] + msg = r"Unable to combine differing multi-dataset settings for a and b" + with pytest.raises(ValueError, match=msg): + _check_multi_model_settings(products) From a3dab123cfdd2f6bc2f04b6eec2e4c5fcac6b9bf Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 10:23:46 +0200 Subject: [PATCH 22/85] Try to please Codacy --- esmvalcore/dataset.py | 4 ++-- esmvalcore/exceptions.py | 4 ++-- esmvalcore/local.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 1391916637..9232f00d6c 100644 --- 
a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -498,7 +498,7 @@ def __repr__(self) -> str: "short_name", ) - def facets2str(facets): + def facets2str(facets: Facets) -> str: view = {k: facets[k] for k in first_keys if k in facets} for key, value in sorted(facets.items()): if key not in first_keys: @@ -557,7 +557,7 @@ def summary(self, shorten: bool = False) -> str: title = self.__class__.__name__ txt = f"{title}: " + self._get_joined_summary_facets(", ") - def supplementary_summary(dataset): + def supplementary_summary(dataset: Dataset) -> str: return ", ".join( str(dataset.facets[k]) for k in self._SUMMARY_FACETS diff --git a/esmvalcore/exceptions.py b/esmvalcore/exceptions.py index f57dcbaaa2..bdabb4f903 100644 --- a/esmvalcore/exceptions.py +++ b/esmvalcore/exceptions.py @@ -35,10 +35,10 @@ class InvalidConfigParameter(Error, SuppressedError): class RecipeError(Error): """Recipe contains an error.""" - def __init__(self, msg): + def __init__(self, msg: str) -> None: super().__init__(msg) self.message = msg - self.failed_tasks = [] + self.failed_tasks: list[RecipeError] = [] class InputFilesNotFound(RecipeError): diff --git a/esmvalcore/local.py b/esmvalcore/local.py index df0a0df225..9b30df924b 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -325,7 +325,7 @@ def _truncate_dates(date: str, file_date: str) -> tuple[int, int]: def _select_files( filenames: list[LocalFile], - timerange: list, + timerange: FacetValue, ) -> list[LocalFile]: """Select files containing data between a given timerange. @@ -335,6 +335,7 @@ def _select_files( Otherwise, the file selection occurs taking into account the time resolution of the file. """ + timerange = str(timerange) if "*" in timerange: # TODO: support * combined with a period return filenames From 001eafa3ddc02beeb7b2d959e9d3706d95a19f4a Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 10:30:49 +0200 Subject: [PATCH 23/85] Make tests work without ESMValTool installation --- tests/integration/recipe/test_recipe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 0ab6e12637..b127bcc3e7 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1548,8 +1548,10 @@ def test_invalid_diagnostcic_ancestor( patched_datafinder, session, ): + script = tmp_path / "diagnostic.py" + script.write_text("") content = dedent( - """ + f""" diagnostics: diagnostic_name: themes: @@ -1567,9 +1569,9 @@ def test_invalid_diagnostcic_ancestor( - dataset: CanESM2 scripts: script_name: - script: examples/diagnostic.py + script: {script} script_name2: - script: examples/diagnostic.py + script: {script} ancestors: [invalid_*] """, ) From debd589c229f31ab6a74c17fb71c423d3598a0e5 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 14 Jul 2025 11:03:32 +0200 Subject: [PATCH 24/85] 100% diff coverage for real --- esmvalcore/preprocessor/__init__.py | 2 +- .../unit/preprocessor/test_preprocessor_file.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index bbc8313127..3f275fb439 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -554,7 +554,7 @@ def check(self) -> None: def apply(self, step: str, debug: bool = False) -> None: """Apply preprocessor step to product.""" if step not in self.settings: - msg = f"PreprocessorFile {self} has 
no settings for step {step}" + msg = f"{self} has no settings for step {step}" raise ValueError(msg) self.cubes = preprocess( self.cubes, diff --git a/tests/unit/preprocessor/test_preprocessor_file.py b/tests/unit/preprocessor/test_preprocessor_file.py index 82fbdf7522..9d9b1f1a60 100644 --- a/tests/unit/preprocessor/test_preprocessor_file.py +++ b/tests/unit/preprocessor/test_preprocessor_file.py @@ -169,3 +169,20 @@ def test_save(mock_preprocess): ), mock.call().__getitem__(0), ] + + +def test_apply_invalid_settings(): + product = PreprocessorFile(filename=Path("test"), settings={}) + msg = r"PreprocessorFile: test has no settings for step invalid_step" + with pytest.raises(ValueError, match=msg): + product.apply("invalid_step") + + +@pytest.mark.parametrize( + ("cubes", "output"), + [(None, True), (CubeList([]), False)], +) +def test_is_closed(cubes, output): + product = PreprocessorFile(filename=Path("test")) + product.cubes = cubes + assert product.is_closed is output From c3df13eb3a5d2e1cb00992aaf6ce4ea967ae08f3 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 09:49:15 +0200 Subject: [PATCH 25/85] Added Dataset.input_datasets --- esmvalcore/dataset.py | 47 ++++++++++ tests/unit/test_dataset.py | 173 ++++++++++++++++++++++--------------- 2 files changed, 149 insertions(+), 71 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 9232f00d6c..90a470a376 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -33,6 +33,7 @@ _get_start_end_date, ) from esmvalcore.preprocessor import preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -132,6 +133,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[File] | None = None self._file_globs: Sequence[Path] | None = None + self._input_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -194,6 +196,51 @@ def _derivation_necessary(self) -> bool: ds_copy.supplementaries = [] return not ds_copy.files + def _get_input_datasets(self) -> list[Dataset]: + """Get input datasets.""" + input_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], + self.facets["project"], + ) + + for required_facets in required_vars_facets: + input_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + input_dataset.facets = { + k: v for k, v in input_dataset.facets.items() if k in keep + } + input_dataset.facets.update(required_facets) + input_dataset.augment_facets() + input_datasets.append(input_dataset) + + return input_datasets + + @property + def input_datasets(self) -> list[Dataset]: + """Get input datasets. + + For non-derived variables (i.e., those with facet ``derive=False``), + this will simply return the dataset itself in a list. + + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. 
+ + """ + if self._input_datasets is not None: + return self._input_datasets + + if not self._derivation_necessary(): + input_datasets = [self] + else: + input_datasets = self._get_input_datasets() + + self._input_datasets = input_datasets + return input_datasets + def _file_to_dataset( self, file: esgf.ESGFFile | local.LocalFile, diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5ae4b6d520..a3c2f35359 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2137,74 +2137,57 @@ def test_get_extra_facets_native6(): } +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + def test_derivation_necessary_no_derivation(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="tas", - tier=2, - type="sat", - timerange="1980/2000", - ) - assert not dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + assert dataset._derivation_necessary() is False def test_derivation_necessary_no_force_derivation_no_files(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) - assert dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._derivation_necessary() is True def test_derivation_necessary_no_force_derivation(tmp_path, session): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert not dataset._derivation_necessary() + lwcre.touch() + + assert dataset._derivation_necessary() is False def test_derivation_necessary_force_derivation(tmp_path, session): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", - exp="historical", - grid="gn", - ensemble="r1i1p1f1", derive=True, force_derivation=True, ) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert dataset._derivation_necessary() + lwcre_file.touch() + + assert dataset._derivation_necessary() is True def test_force_derivation_no_derived(): @@ -2214,19 +2197,11 @@ def test_force_derivation_no_derived(): ) with pytest.raises(ValueError, match=msg): - Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="tas", - force_derivation=True, - ) + Dataset(**OBS6_SAT_FACETS, short_name="tas", force_derivation=True) with pytest.raises(ValueError, match=msg): Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="tas", derive=False, force_derivation=True, @@ -2235,21 +2210,17 @@ def test_force_derivation_no_derived(): def test_add_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - 
dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) - dataset.add_supplementary(short_name="areacella", mip="fx") + dataset.add_supplementary(short_name="pr") expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="fx", - short_name="areacella", + **OBS6_SAT_FACETS, + short_name="pr", derive=False, force_derivation=False, ) @@ -2258,26 +2229,86 @@ def test_add_supplementary_to_derived(): def test_add_derived_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) dataset.add_supplementary( - short_name="asr", + short_name="swcre", derive=True, force_derivation=True, ) expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="asr", + **OBS6_SAT_FACETS, + short_name="swcre", derive=True, force_derivation=True, ) assert dataset.supplementaries[0] == expected_supplementary + + +def test_input_datasets_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + + expected_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_dataset in expected_datasets: + expected_dataset.session = dataset.session + + assert dataset.input_datasets == expected_datasets + + +def test_input_datasets_no_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + dataset.add_supplementary(short_name="pr") + + assert dataset.input_datasets == [dataset] + + +def test_input_datasets_no_force_derivation(tmp_path, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + + assert dataset.input_datasets == [dataset] + + +def test_input_datasets_no_derivation_available(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) + + msg = r"Cannot derive variable 'tas': no derivation script available" + with pytest.raises(NotImplementedError, match=msg): + dataset.input_datasets # noqa: B018 From e7948173aa83ef1a55256264769855d81bc89f25 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 09:50:24 +0200 Subject: [PATCH 26/85] Shorter code --- tests/unit/test_dataset.py | 109 +++++++++++++------------------------ 1 file changed, 38 insertions(+), 71 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5ae4b6d520..149bdb11bc 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2137,74 +2137,57 @@ def test_get_extra_facets_native6(): } +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + def 
test_derivation_necessary_no_derivation(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="tas", - tier=2, - type="sat", - timerange="1980/2000", - ) - assert not dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + assert dataset._derivation_necessary() is False def test_derivation_necessary_no_force_derivation_no_files(): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) - assert dataset._derivation_necessary() + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._derivation_necessary() is True def test_derivation_necessary_no_force_derivation(tmp_path, session): - dataset = Dataset( - project="OBS6", - dataset="SAT", - mip="Amon", - short_name="asr", - tier=2, - type="sat", - timerange="1980/2000", - derive=True, - ) + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert not dataset._derivation_necessary() + lwcre.touch() + + assert dataset._derivation_necessary() is False def test_derivation_necessary_force_derivation(tmp_path, session): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", - exp="historical", - grid="gn", - ensemble="r1i1p1f1", derive=True, force_derivation=True, ) dataset.session = session + input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - asr_file = esmvalcore.local.LocalFile( - input_dir / "OBS6_SAT_sat_1_Amon_asr_1980-2000.nc", + lwcre_file = esmvalcore.local.LocalFile( + input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - asr_file.touch() - assert dataset._derivation_necessary() + lwcre_file.touch() + + assert dataset._derivation_necessary() is True def test_force_derivation_no_derived(): @@ -2214,19 +2197,11 @@ def test_force_derivation_no_derived(): ) with pytest.raises(ValueError, match=msg): - Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="tas", - force_derivation=True, - ) + Dataset(**OBS6_SAT_FACETS, short_name="tas", force_derivation=True) with pytest.raises(ValueError, match=msg): Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="tas", derive=False, force_derivation=True, @@ -2235,21 +2210,17 @@ def test_force_derivation_no_derived(): def test_add_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) - dataset.add_supplementary(short_name="areacella", mip="fx") + dataset.add_supplementary(short_name="pr") expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="fx", - short_name="areacella", + **OBS6_SAT_FACETS, + short_name="pr", derive=False, force_derivation=False, ) @@ -2258,25 +2229,21 @@ def test_add_supplementary_to_derived(): def test_add_derived_supplementary_to_derived(): dataset = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", + **OBS6_SAT_FACETS, short_name="lwcre", derive=True, force_derivation=True, ) dataset.add_supplementary( - short_name="asr", 
+ short_name="swcre", derive=True, force_derivation=True, ) expected_supplementary = Dataset( - project="CMIP6", - dataset="CanESM5", - mip="Amon", - short_name="asr", + **OBS6_SAT_FACETS, + short_name="swcre", derive=True, force_derivation=True, ) From b971d50ce19646988a482dc58e066e272ebe8d3e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 10:12:14 +0200 Subject: [PATCH 27/85] Dataset.set_version can handle derived variables now --- esmvalcore/_recipe/recipe.py | 19 +------- esmvalcore/dataset.py | 20 +++++++-- tests/integration/recipe/test_recipe.py | 5 +++ tests/unit/recipe/test_recipe.py | 22 --------- tests/unit/test_dataset.py | 59 ++++++++++++++++++++----- 5 files changed, 71 insertions(+), 54 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e34c7fc52b..397be02596 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -598,23 +598,6 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() - - def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], @@ -649,7 +632,7 @@ def _get_preprocessor_products( else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) _schedule_for_download(input_datasets) _log_input_files(input_datasets) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 90a470a376..5da0ca8eb8 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -657,15 +657,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for input_dataset in self.input_datasets: + version = self._get_version(input_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index b127bcc3e7..8cf9384b39 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -131,6 +131,11 @@ def get_required(short_name, _): "get_required", get_required, ) + monkeypatch.setattr( + esmvalcore.dataset, + 
"get_required", + get_required, + ) DEFAULT_DOCUMENTATION = dedent(""" diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 06299781ad..640661d089 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -912,28 +912,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a3c2f35359..0cea37248d 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1609,7 +1609,7 @@ def test_find_files_non_esgf_projects(mocker, project, monkeypatch): assert tas._file_globs == mock.sentinel.file_globs -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.local.LocalFile("/path/to/v1/tas.nc") @@ -1625,6 +1625,53 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +OBS6_SAT_FACETS = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", +} + + +def test_set_version_derive_var(monkeypatch): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + areacella_file = esmvalcore.local.LocalFile("/path/to/areacella.nc") + areacella_file.facets["version"] = "v4" + dataset.supplementaries[0].files = [areacella_file] + + def _get_input_datasets(): + rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc") + rlut_file.facets["version"] = "v1" + rlut_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + ) + rlut_dataset.files = [rlut_file] + rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc") + rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc") + rlutcs_file_1.facets["version"] = "v2" + rlutcs_file_2.facets["version"] = "v3" + rlutcs_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + ) + rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2] + return [rlut_dataset, rlutcs_dataset] + + monkeypatch.setattr(dataset, "_get_input_datasets", _get_input_datasets) + + dataset.set_version() + + assert dataset.facets["version"] == ["v1", "v2", "v3"] + assert dataset.supplementaries[0].facets["version"] == "v4" + + @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"]) def test_update_timerange_from_esgf(mocker, timerange): esgf_files = [ @@ -2137,16 +2184,6 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - "timerange": "1980/2000", -} - - def test_derivation_necessary_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._derivation_necessary() 
is False


From f6b6d22e1c35cf368de64684c3a94e8b9b371b9d Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 10:47:42 +0200
Subject: [PATCH 28/85] Dataset._input_datasets is always list[Dataset]

---
 esmvalcore/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 5da0ca8eb8..659cc390cf 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -133,7 +133,7 @@ def __init__(self, **facets: FacetValue) -> None:
         self._session: Session | None = None
         self._files: Sequence[File] | None = None
         self._file_globs: Sequence[Path] | None = None
-        self._input_datasets: list[Dataset] | None = None
+        self._input_datasets: list[Dataset] = []
 
         for key, value in facets.items():
             self.set_facet(key, deepcopy(value), persist=True)
@@ -230,7 +230,7 @@ def input_datasets(self) -> list[Dataset]:
         or no files for the dataset itself are available.
 
         """
-        if self._input_datasets is not None:
+        if self._input_datasets:
             return self._input_datasets
 
         if not self._derivation_necessary():

From 1f4de86795f7c585f789ca6abb3d32a51409989c Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 11:01:23 +0200
Subject: [PATCH 29/85] Make changes fully backwards-compatible

---
 esmvalcore/dataset.py      | 20 ++++++-------
 tests/unit/test_dataset.py | 57 ++++++++++++++++++++------------------
 2 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 9232f00d6c..af86eb7534 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -136,16 +136,6 @@ def __init__(self, **facets: FacetValue) -> None:
         for key, value in facets.items():
             self.set_facet(key, deepcopy(value), persist=True)
 
-        if not self._is_derived() and self.facets.get(
-            "force_derivation",
-            False,
-        ):
-            msg = (
-                "Facet `force_derivation=True` can only be used for derived "
-                "variables (i.e., with facet `derive=True`)"
-            )
-            raise ValueError(msg)
-
     @staticmethod
     def from_recipe(
@@ -178,6 +168,12 @@ def _is_derived(self) -> bool:
         """Return ``True`` for derived variables, ``False`` otherwise."""
         return bool(self.facets.get("derive", False))
 
+    def _is_force_derived(self) -> bool:
+        """Return ``True`` for force-derived variables, ``False`` otherwise."""
+        return self._is_derived() and bool(
+            self.facets.get("force_derivation", False),
+        )
+
     def _derivation_necessary(self) -> bool:
         """Return ``True`` if derivation is necessary, ``False`` otherwise."""
         # If variable cannot be derived, derivation is not necessary
@@ -185,7 +181,7 @@ def _derivation_necessary(self) -> bool:
             return False
 
         # If forced derivation is requested, derivation is necessary
-        if self.facets.get("force_derivation", False):
+        if self._is_force_derived():
             return True
 
         # Otherwise, derivation is necessary if no files for the self dataset
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index 149bdb11bc..3e383f3463 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -2147,6 +2147,45 @@ def test_get_extra_facets_native6():
 }
 
 
+def test_is_derived_no_derivation():
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas")
+    assert 
dataset._is_derived() is False + + +def test_is_derived_derivation(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._is_derived() is True + + +def test_is_force_derived_no_derivation_no_force(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") + assert dataset._is_force_derived() is False + + +def test_is_force_derived_no_derivation_force(): + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + force_derivation=True, + ) + assert dataset._is_force_derived() is False + + +def test_is_force_derived_derivation_no_force(): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + assert dataset._is_force_derived() is False + + +def test_is_force_derived_derivation_force(): + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + assert dataset._is_force_derived() is True + + def test_derivation_necessary_no_derivation(): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas") assert dataset._derivation_necessary() is False @@ -2190,24 +2229,6 @@ def test_derivation_necessary_force_derivation(tmp_path, session): assert dataset._derivation_necessary() is True -def test_force_derivation_no_derived(): - msg = ( - r"Facet `force_derivation=True` can only be used for derived " - r"variables" - ) - - with pytest.raises(ValueError, match=msg): - Dataset(**OBS6_SAT_FACETS, short_name="tas", force_derivation=True) - - with pytest.raises(ValueError, match=msg): - Dataset( - **OBS6_SAT_FACETS, - short_name="tas", - derive=False, - force_derivation=True, - ) - - def test_add_supplementary_to_derived(): dataset = Dataset( **OBS6_SAT_FACETS, From c6d303bc57517d7199711fc989188339e628d7c2 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 11:30:10 +0200 Subject: [PATCH 30/85] Make Dataset.from_files work with derived variables (no globs yet) --- esmvalcore/dataset.py | 87 +++++++++++-- tests/unit/test_dataset.py | 254 ++++++++++++++++++++++++++++++++++++- 2 files changed, 324 insertions(+), 17 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 9a21d339de..c293eac3e8 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -273,12 +273,73 @@ def _file_to_dataset( return dataset - def _get_available_datasets(self) -> Iterator[Dataset]: + def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. + + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. 
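+
+        For example (an illustrative sketch, mirroring the tests added on
+        this branch): for a dataset with ``short_name='lwcre'``,
+        ``derive=True``, and the glob facet ``type='*'``, a ``type`` value
+        is only yielded if either ``lwcre`` files exist for it directly or
+        files for *all* variables required for derivation (here ``rlut``
+        and ``rlutcs``) are available for it.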
+ """ - dataset_template = self.copy() + datasets_found = False + + # First, if no forced derivation is requested, search for datasets + # based on files from self + if not self._is_force_derived(): + for dataset in self._get_available_datasets(self): + datasets_found = True + yield dataset + + # If forced derivation is requested or no datasets based on files from + # self have been found, search for datasets based on files from input + # datasets + if self._is_force_derived() or not datasets_found: + all_datasets: list[list[tuple[dict, Dataset]]] = [] + for input_dataset in self._get_input_datasets(): + all_datasets.append([]) + for expanded_ds in self._get_available_datasets( + input_dataset, + ): + updated_facets = {} + for key, value in self.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + new_ds = self.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries + + all_datasets[-1].append((updated_facets, new_ds)) + + # Only consider those datasets that contain all input variables + # necessary for derivation + for updated_facets, new_ds in all_datasets[0]: + other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] + if all(updated_facets in facets for facets in other_facets): + yield new_ds + else: + logger.debug( + "Not all necessary input variables to derive '%s' are " + "available for dataset %s", + self["short_name"], + updated_facets, + ) + + def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: + """Yield datasets based on the available files. + + This function requires that self.facets['mip'] is not a glob pattern. + + Does not take variable derivation into account, i.e., datasets + potentially available through variable derivation are ignored. To + consider derived variables properly, use the function + :func:`_get_all_available_datasets`. + + """ + dataset_template = dataset.copy() dataset_template.supplementaries = [] if _isglob(dataset_template.facets.get("timerange")): # Remove wildcard `timerange` facet, because data finding cannot @@ -289,31 +350,31 @@ def _get_available_datasets(self) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - dataset = self._file_to_dataset(file) + new_dataset = self._file_to_dataset(file) # Filter out identical datasets facetset = frozenset( (f, frozenset(v) if isinstance(v, list) else v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() ) if facetset not in seen: seen.add(facetset) if any( _isglob(v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() if f != "timerange" ): - partially_defined.append((dataset, file)) + partially_defined.append((new_dataset, file)) else: - dataset._update_timerange() # noqa: SLF001 - dataset._supplementaries_from_files() # noqa: SLF001 + new_dataset._update_timerange() # noqa: SLF001 + new_dataset._supplementaries_from_files() # noqa: SLF001 expanded = True - yield dataset + yield new_dataset # Only yield datasets with globs if there is no better alternative - for dataset, file in partially_defined: + for new_dataset, file in partially_defined: msg = ( - f"{dataset} with unexpanded wildcards, created from file " + f"{new_dataset} with unexpanded wildcards, created from file " f"{file} with facets {file.facets}. Are the missing facets " "in the path to the file?" 
if isinstance(file, local.LocalFile) @@ -327,7 +388,7 @@ def _get_available_datasets(self) -> Iterator[Dataset]: "because it still contains wildcards.", msg, ) - yield dataset + yield new_dataset def from_files(self) -> Iterator[Dataset]: """Create datasets based on the available files. @@ -378,7 +439,7 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 + for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 expanded = True yield dataset diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 3c2df18c4c..f28c0dbe79 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1181,6 +1181,252 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +@pytest.fixture +def lwcre_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + +@pytest.fixture +def rlut_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs + + +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_input_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_input_dataset.session = session + + assert dataset.input_datasets == [expected_input_dataset] + assert expected_input_dataset.files == [lwcre_file] + + +def test_from_files_with_derived(rlut_file, rlutcs_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + 
assert datasets[0].supplementaries[0].files == [] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert dataset.input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + +def test_from_files_with_derived_no_force_derivation( + lwcre_file, + rlut_file, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ) + expected_input_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_input_dataset.session = session + + assert dataset.input_datasets == [expected_input_dataset] + assert expected_input_dataset.files == [lwcre_file] + + +def test_from_files_with_derived_force_derivation( + lwcre_file, + rlut_file, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + 
standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert dataset.input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -2239,10 +2485,10 @@ def test_derivation_necessary_no_force_derivation(tmp_path, session): input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - lwcre = esmvalcore.local.LocalFile( + lwcre_file = esmvalcore.local.LocalFile( input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - lwcre.touch() + lwcre_file.touch() assert dataset._derivation_necessary() is False @@ -2356,10 +2602,10 @@ def test_input_datasets_no_force_derivation(tmp_path, session): input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - lwcre = esmvalcore.local.LocalFile( + lwcre_file = esmvalcore.local.LocalFile( input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - lwcre.touch() + lwcre_file.touch() assert dataset.input_datasets == [dataset] From 40147fb76267f3279c531034367d0dff1219c410 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 12:16:03 +0200 Subject: [PATCH 31/85] Added test for derived variable with glob --- tests/unit/test_dataset.py | 121 +++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index f28c0dbe79..08465f648b 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1193,6 +1193,18 @@ def lwcre_file(tmp_path): return lwcre +@pytest.fixture +def lwcre_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + lwcre = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc", + ) + lwcre.touch() + return lwcre + + @pytest.fixture def rlut_file(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" @@ -1217,6 +1229,18 @@ def rlutcs_file(tmp_path): return rlutcs +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr + + def test_from_files_with_derived_no_derivation(lwcre_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) @@ -1259,10 +1283,89 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): ] expected_input_dataset.session = session - assert dataset.input_datasets == [expected_input_dataset] + assert datasets[0].input_datasets == [expected_input_dataset] assert expected_input_dataset.files == [lwcre_file] +def test_from_files_with_derived_no_derivation_glob( + lwcre_file, + lwcre_file_ground, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS_GLOB, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + 
expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[1].files == [lwcre_file] + + expected_input_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_input_datasets, + strict=True, + ): + assert dataset.input_datasets == [expected] + assert expected_input_datasets[0].files == [lwcre_file_ground] + assert expected_input_datasets[1].files == [lwcre_file] + + def test_from_files_with_derived(rlut_file, rlutcs_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) @@ -1306,7 +1409,7 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): for expected_ds in expected_input_datasets: expected_ds.session = session - assert dataset.input_datasets == expected_input_datasets + assert datasets[0].input_datasets == expected_input_datasets assert expected_input_datasets[0].files == [rlut_file] assert expected_input_datasets[1].files == [rlutcs_file] @@ -1358,7 +1461,7 @@ def test_from_files_with_derived_no_force_derivation( ] expected_input_dataset.session = session - assert dataset.input_datasets == [expected_input_dataset] + assert datasets[0].input_datasets == [expected_input_dataset] assert expected_input_dataset.files == [lwcre_file] @@ -1422,7 +1525,7 @@ def test_from_files_with_derived_force_derivation( for expected_ds in expected_input_datasets: expected_ds.session = session - assert dataset.input_datasets == expected_input_datasets + assert datasets[0].input_datasets == expected_input_datasets assert expected_input_datasets[0].files == [rlut_file] assert expected_input_datasets[1].files == [rlutcs_file] @@ -1881,6 +1984,16 @@ def test_set_version_non_derived_var(): } +OBS6_SAT_FACETS_GLOB = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "*", + "timerange": "1980/2000", +} + + def test_set_version_derive_var(monkeypatch): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="areacella") From 6ec04fc41c7c17fd85ec7d7f05bcdb33e743bbf6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 12:16:39 +0200 Subject: [PATCH 32/85] Better var name --- tests/unit/test_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 3e383f3463..68e8ceed05 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2202,10 +2202,10 @@ def test_derivation_necessary_no_force_derivation(tmp_path, 
session): input_dir = tmp_path / "Tier2" / "SAT" input_dir.mkdir(parents=True, exist_ok=True) - lwcre = esmvalcore.local.LocalFile( + lwcre_file = esmvalcore.local.LocalFile( input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc", ) - lwcre.touch() + lwcre_file.touch() assert dataset._derivation_necessary() is False From ea3386e4b062fcb7ce75ee5fe8bbd2324b4aa97d Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:12:54 +0200 Subject: [PATCH 33/85] Update esmvalcore/dataset.py Co-authored-by: Bouwe Andela --- esmvalcore/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index af86eb7534..64dd892096 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -571,7 +571,7 @@ def supplementary_summary(dataset: Dataset) -> str: return txt - def __getitem__(self, key: Any) -> FacetValue: + def __getitem__(self, key: str) -> FacetValue: """Get a facet value.""" return self.facets[key] From efa2ac1b59ee7d44dddbe4083e3147f35a2a2928 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 13:41:16 +0200 Subject: [PATCH 34/85] Add further tests for Dataset.from_files with globs --- esmvalcore/dataset.py | 3 +- tests/unit/test_dataset.py | 265 +++++++++++++++++++++++++++++++++++-- 2 files changed, 256 insertions(+), 12 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index f9934d0f23..b53445bbf0 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -323,8 +323,9 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 else: logger.debug( "Not all necessary input variables to derive '%s' are " - "available for dataset %s", + "available for %s with facets %s", self["short_name"], + new_ds.summary(shorten=True), updated_facets, ) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 08465f648b..a3467bdb6a 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,3 +1,4 @@ +import logging import textwrap from collections import defaultdict from pathlib import Path @@ -1217,6 +1218,18 @@ def rlut_file(tmp_path): return rlut +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut + + @pytest.fixture def rlutcs_file(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" @@ -1294,7 +1307,11 @@ def test_from_files_with_derived_no_derivation_glob( session, ): """Test `from_files` with derived variable and supplementary.""" - dataset = Dataset(**OBS6_SAT_FACETS_GLOB, short_name="lwcre", derive=True) + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) dataset.add_supplementary(short_name="pr") dataset.session = session @@ -1314,7 +1331,9 @@ def test_from_files_with_derived_no_derivation_glob( assert datasets == expected_datasets assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] expected_input_datasets = [ Dataset( @@ -1414,6 +1433,74 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): assert expected_input_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_glob( + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, 
+ caplog, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + with caplog.at_level(logging.DEBUG): + datasets = list(dataset.from_files()) + + expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = ( + "Not all necessary input variables to derive 'lwcre' are available " + "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " + "{'type': 'ground'}" + ) + assert msg in log_debugs + + def test_from_files_with_derived_no_force_derivation( lwcre_file, rlut_file, @@ -1465,6 +1552,94 @@ def test_from_files_with_derived_no_force_derivation( assert expected_input_dataset.files == [lwcre_file] +def test_from_files_with_derived_no_force_derivation_glob( + lwcre_file, + lwcre_file_ground, + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + ), + Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True), + ] + for expected_ds in expected_datasets: + expected_ds.add_supplementary(short_name="pr", type="sat") + expected_ds.session = session + + assert datasets == expected_datasets + assert datasets[0].files == [lwcre_file_ground] + assert datasets[0].supplementaries[0].files == [pr_file] + assert datasets[1].files == [lwcre_file] + assert datasets[1].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "ground"}, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + frequency="mon", + long_name="TOA Longwave Cloud Radiative Effect", + modeling_realm=["atmos"], + original_short_name="lwcre", + standard_name="", + units="W m-2", + ), + ] + for expected_ds in 
expected_input_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + for dataset, expected in zip( + datasets, + expected_input_datasets, + strict=True, + ): + assert dataset.input_datasets == [expected] + assert expected_input_datasets[0].files == [lwcre_file_ground] + assert expected_input_datasets[1].files == [lwcre_file] + + def test_from_files_with_derived_force_derivation( lwcre_file, rlut_file, @@ -1530,6 +1705,84 @@ def test_from_files_with_derived_force_derivation( assert expected_input_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 + lwcre_file, + lwcre_file_ground, + rlut_file, + rlut_file_ground, + rlutcs_file, + pr_file, + session, + caplog, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + with caplog.at_level(logging.DEBUG): + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="lwcre", + derive=True, + force_derivation=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [lwcre_file] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_input_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + force_derivation=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_input_datasets: + expected_ds.session = session + + assert datasets[0].input_datasets == expected_input_datasets + assert expected_input_datasets[0].files == [rlut_file] + assert expected_input_datasets[1].files == [rlutcs_file] + + log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] + msg = ( + "Not all necessary input variables to derive 'lwcre' are available " + "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " + "{'type': 'ground'}" + ) + assert msg in log_debugs + + def test_match(): dataset1 = Dataset( short_name="areacella", @@ -1984,16 +2237,6 @@ def test_set_version_non_derived_var(): } -OBS6_SAT_FACETS_GLOB = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "*", - "timerange": "1980/2000", -} - - def test_set_version_derive_var(monkeypatch): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="areacella") From f9c47a940540abe0b691d30e3927164ce5523c3f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 14:09:38 +0200 Subject: [PATCH 35/85] Update _dataset_from_files to new Dataset.from_files --- esmvalcore/_recipe/recipe.py | 3 +- 
esmvalcore/_recipe/to_datasets.py | 92 +++++++++---------------------- 2 files changed, 27 insertions(+), 68 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 397be02596..8d3af3ce7d 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -51,7 +51,6 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, _get_input_datasets, _representative_datasets, ) @@ -231,7 +230,7 @@ def _get_default_settings(dataset: Dataset) -> dict[str, Any]: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7cd17bdbb0..458db07498 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -428,9 +428,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -441,53 +439,32 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue + for expanded_ds in dataset.from_files(): + updated_facets = {} + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key in expanded_ds.facets and not _isglob( + expanded_ds[key], + ): + updated_facets[key] = expanded_ds.facets[key] + else: + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + unexpanded_globs, + ) + errors.append(msg) + continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds = dataset.copy() + new_ds.facets.update(updated_facets) + new_ds.supplementaries = expanded_ds.supplementaries - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] - if all(updated_facets in facets for facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, - ) + result.append(new_ds) if errors: raise RecipeError("\n".join(errors)) @@ -538,27 +515,10 @@ def _report_unexpanded_globs( return msg -def 
_derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files - - def _get_input_datasets(dataset: Dataset) -> list[Dataset]: """Determine the input datasets needed for deriving `dataset`.""" facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) + if not dataset._derivation_necessary(): # noqa: SLF001 return [dataset] # Configure input datasets needed to derive variable From 3de7bc85ca1ffe0fe0fc5dfe157f971e72beda0b Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 14:10:18 +0200 Subject: [PATCH 36/85] Move _fix_cmip5_fx_ensemble to _get_preprocessor_products --- esmvalcore/_recipe/recipe.py | 23 +++++++++++++++++++++++ esmvalcore/_recipe/to_datasets.py | 23 ----------------------- tests/unit/recipe/test_recipe.py | 20 ++++++++++++++++++++ tests/unit/recipe/test_to_datasets.py | 20 -------------------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 8d3af3ce7d..700e205e81 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -597,6 +597,28 @@ def _allow_skipping(dataset: Dataset) -> bool: ) +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() + + def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], @@ -620,6 +642,7 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 458db07498..f6bf57fbe2 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -188,28 +188,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], 
- dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -534,7 +512,6 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: } input_dataset.facets.update(input_facets) input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) if input_facets.get("optional") and not input_dataset.files: logger.info( "Skipping: no data found for %s which is marked as 'optional'", diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 640661d089..d7c6c90178 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -965,3 +965,23 @@ def test_update_extract_shape_rel_shapefile(shapefile, session, tmp_path): / "ar6.shp" ) assert settings["extract_shape"]["shapefile"] == ar6_file + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 20439a1d07..f9d01881ca 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -347,26 +347,6 @@ def from_files(_): to_datasets._dataset_from_files(dataset) -def test_fix_cmip5_fx_ensemble(monkeypatch): - def find_files(self): - if self.facets["ensemble"] == "r0i0p0": - self._files = ["file1.nc"] - - monkeypatch.setattr(Dataset, "find_files", find_files) - - dataset = Dataset( - dataset="dataset1", - short_name="orog", - mip="fx", - project="CMIP5", - ensemble="r1i1p1", - ) - - to_datasets._fix_cmip5_fx_ensemble(dataset) - - assert dataset["ensemble"] == "r0i0p0" - - def test_get_supplementary_short_names(monkeypatch): def _update_cmor_facets(facets): facets["modeling_realm"] = "atmos" From 77fd1e85eb03422817ec87046bfaa63d3dc17209 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 15:30:12 +0200 Subject: [PATCH 37/85] Make _derivation_necessary work with timerange globs --- esmvalcore/dataset.py | 6 ++++++ tests/unit/test_dataset.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index b53445bbf0..ea35d040e0 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -190,6 +190,12 @@ def _derivation_necessary(self) -> bool: # are found ds_copy = self.copy() ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + return not ds_copy.files def _get_input_datasets(self) -> list[Dataset]: diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a3467bdb6a..23a43336fa 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2835,6 +2835,15 @@ def test_derivation_necessary_no_force_derivation_no_files(): assert dataset._derivation_necessary() is True +def test_derivation_necessary_no_force_derivation_no_files_glob(): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + assert dataset._derivation_necessary() is True + + def test_derivation_necessary_no_force_derivation(tmp_path, session): dataset = Dataset(**OBS6_SAT_FACETS, 
short_name="lwcre", derive=True) dataset.session = session From 312fafa35935b39bce38717afd6537358ca22dca Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 16:24:41 +0200 Subject: [PATCH 38/85] Fix bug for non-derived variables --- esmvalcore/dataset.py | 8 ++++++-- tests/unit/test_dataset.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index ea35d040e0..39e2d84494 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -290,13 +290,17 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 """ datasets_found = False - # First, if no forced derivation is requested, search for datasets - # based on files from self + # If no forced derivation is requested, search for datasets based on + # files from self if not self._is_force_derived(): for dataset in self._get_available_datasets(self): datasets_found = True yield dataset + # For variables that cannot be derived, we are done here + if not self._is_derived(): + return + # If forced derivation is requested or no datasets based on files from # self have been found, search for datasets based on files from input # datasets diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 23a43336fa..6ad6d78d97 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1182,6 +1182,22 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session): assert datasets == [expected] +def test_from_files_no_files_glob(session): + dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas") + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + +def test_from_files_derived_no_files_glob(session): + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*"}, + short_name="lwcre", + derive=True, + ) + datasets = list(dataset.from_files()) + assert datasets == [dataset] + + @pytest.fixture def lwcre_file(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" From e8c7bf2a56f3bd9d706ef0667379654069c6c7c1 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 16:25:28 +0200 Subject: [PATCH 39/85] Use new Dataset.from_files in code --- esmvalcore/_recipe/check.py | 21 ++++++++++------ esmvalcore/_recipe/to_datasets.py | 33 ++++++------------------- tests/integration/recipe/test_check.py | 32 +++++++++++------------- tests/integration/recipe/test_recipe.py | 20 +++++++-------- 4 files changed, 46 insertions(+), 60 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index e79db4f2ee..fdebefb6a1 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,7 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets, FacetValue + from esmvalcore.typing import FacetValue logger = logging.getLogger(__name__) @@ -467,15 +467,22 @@ def valid_time_selection(timerange: str) -> None: def differing_timeranges( - timeranges: set[FacetValue], - required_vars: list[Facets], + var_to_derive: FacetValue, + input_datasets: list[Dataset], ) -> None: - """Log error if required variables have differing timeranges.""" + """Log error if input datasets have differing timeranges.""" + timeranges: set[FacetValue] = set() + for input_dataset in input_datasets: + if "timerange" in input_dataset.facets: + timeranges.add(input_dataset.facets["timerange"]) if len(timeranges) > 1: + input_datasets_str = "; ".join( + d.summary(shorten=True) for d in input_datasets + 
) msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." + f"Differing timeranges with values {timeranges} found for " + f"datasets {input_datasets_str} necessary to derive " + f"'{var_to_derive}'. Set `timerange` to a common value." ) raise ValueError(msg) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index f6bf57fbe2..a5bf3d6427 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.esgf.facets import FACETS from esmvalcore.exceptions import RecipeError from esmvalcore.local import LocalFile, _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -495,39 +494,23 @@ def _report_unexpanded_globs( def _get_input_datasets(dataset: Dataset) -> list[Dataset]: """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets if not dataset._derivation_necessary(): # noqa: SLF001 - return [dataset] + return dataset.input_datasets - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + input_datasets: list[Dataset] = [] + for input_dataset in dataset.input_datasets: + if input_dataset.facets.get("optional") and not input_dataset.files: logger.info( "Skipping: no data found for %s which is marked as 'optional'", input_dataset, ) else: - datasets.append(input_dataset) + input_datasets.append(input_dataset) - # Check timeranges of available input data. 
- timeranges: set[FacetValue] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) - check.differing_timeranges(timeranges, required_vars) + check.differing_timeranges(dataset.facets["short_name"], input_datasets) - return datasets + return input_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 551603a446..bfa2097c30 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -274,25 +274,23 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, +def test_differing_timeranges(): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "Amon", + "tier": 2, + "type": "sat", + } + input_datasets = [ + Dataset(**facets, short_name="rlut", timerange="1950/1952"), + Dataset(**facets, short_name="rlutcs", timerange="1951/1953"), + Dataset(**facets, short_name="rlut"), ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." - ) - assert expected_log in str(exc.value) + msg = r"Differing timeranges with values {'1950/1952', '1951/1953'}" + with pytest.raises(ValueError, match=msg): + check.differing_timeranges("lwcre", input_datasets) def test_data_availability_nonexistent(tmp_path): diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 8cf9384b39..7ab85581cb 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -126,11 +126,6 @@ def get_required(short_name, _): {"short_name": "areacella", "mip": "fx", "optional": True}, ] - monkeypatch.setattr( - esmvalcore._recipe.to_datasets, - "get_required", - get_required, - ) monkeypatch.setattr( esmvalcore.dataset, "get_required", @@ -2543,9 +2538,7 @@ def test_representative_dataset_derived_var( expected_facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2555,6 +2548,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2609,9 +2605,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2624,6 +2618,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + 
"force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2633,9 +2630,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2648,6 +2643,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_input_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session From 6cdd7141d9a9865a3dc3b1561abc7877d8fca6aa Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 16:39:13 +0200 Subject: [PATCH 40/85] Added test to check differing timeranges --- tests/integration/recipe/test_check.py | 2 +- tests/unit/recipe/test_to_datasets.py | 52 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index bfa2097c30..2c2f1ea745 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -288,7 +288,7 @@ def test_differing_timeranges(): Dataset(**facets, short_name="rlut"), ] - msg = r"Differing timeranges with values {'1950/1952', '1951/1953'}" + msg = r"Differing timeranges with values" with pytest.raises(ValueError, match=msg): check.differing_timeranges("lwcre", input_datasets) diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index f9d01881ca..2e560765f1 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,3 +1,4 @@ +import logging import textwrap from pathlib import Path @@ -302,6 +303,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def test_get_input_datasets_derive_optional(caplog, tmp_path, session): + facets = { + "project": "OBS6", + "dataset": "SAT", + "mip": "SImon", + "short_name": "siextent", + "tier": 2, + "type": "sat", + "timerange": "1980/2000", + "derive": True, + } + + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + sic_file = LocalFile( + input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + sic_file.touch() + + dataset = Dataset(**facets) + dataset.files = [] + dataset.session = session + + with caplog.at_level(logging.INFO): + datasets = to_datasets._get_input_datasets(dataset) + + expected = Dataset( + dataset="SAT", + project="OBS6", + mip="SImon", + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + optional="true", + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + tier=2, + timerange="1980/2000", + type="sat", + units="%", + ) + expected.session = session + + assert datasets == [expected] + + logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"] + assert "which is marked as 'optional'" in logger_infos[-1] + + def test_max_years(session): recipe_txt = textwrap.dedent(""" diagnostics: From 9057cf9d115e3978b949a9b740f046628db3f3e9 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 17:35:59 +0200 Subject: [PATCH 41/85] Make everything work with glob in timerange --- esmvalcore/dataset.py | 29 +++++++++++------------ tests/unit/test_dataset.py | 47 
+++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 39e2d84494..fc99e9c9e3 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -243,41 +243,42 @@ def input_datasets(self) -> list[Dataset]: self._input_datasets = input_datasets return input_datasets + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: esgf.ESGFFile | local.LocalFile, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. - unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = {"project", "mip", "short_name", "dataset"} if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) - return dataset + return new_dataset def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 """Yield datasets based on the available files. 
@@ -320,7 +321,7 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 updated_facets[key] = expanded_ds.facets[key] new_ds = self.copy() new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries + new_ds.supplementaries = self.supplementaries all_datasets[-1].append((updated_facets, new_ds)) @@ -361,7 +362,7 @@ def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - new_dataset = self._file_to_dataset(file) + new_dataset = self._file_to_dataset(dataset, file) # Filter out identical datasets facetset = frozenset( @@ -378,7 +379,6 @@ def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: partially_defined.append((new_dataset, file)) else: new_dataset._update_timerange() # noqa: SLF001 - new_dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield new_dataset @@ -451,6 +451,7 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 + dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 6ad6d78d97..08e24414a2 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1188,9 +1188,10 @@ def test_from_files_no_files_glob(session): assert datasets == [dataset] -def test_from_files_derived_no_files_glob(session): +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_files_glob(timerange, session): dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1316,7 +1317,9 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): assert expected_input_dataset.files == [lwcre_file] +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) def test_from_files_with_derived_no_derivation_glob( + timerange, lwcre_file, lwcre_file_ground, pr_file, @@ -1324,7 +1327,7 @@ def test_from_files_with_derived_no_derivation_glob( ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1449,7 +1452,9 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): assert expected_input_datasets[1].files == [rlutcs_file] +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) def test_from_files_with_derived_glob( + timerange, rlut_file, rlut_file_ground, rlutcs_file, @@ -1459,7 +1464,7 @@ def test_from_files_with_derived_glob( ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1509,12 +1514,12 @@ def test_from_files_with_derived_glob( assert expected_input_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = ( - "Not all necessary input variables to derive 'lwcre' are available " - "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " - "{'type': 'ground'}" - ) - assert msg in log_debugs + msg = "Not all necessary input variables to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in 
log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") def test_from_files_with_derived_no_force_derivation( @@ -1568,7 +1573,9 @@ def test_from_files_with_derived_no_force_derivation( assert expected_input_dataset.files == [lwcre_file] -def test_from_files_with_derived_no_force_derivation_glob( +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 + timerange, lwcre_file, lwcre_file_ground, rlut_file, @@ -1579,7 +1586,7 @@ def test_from_files_with_derived_no_force_derivation_glob( ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, ) @@ -1721,7 +1728,9 @@ def test_from_files_with_derived_force_derivation( assert expected_input_datasets[1].files == [rlutcs_file] +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 + timerange, lwcre_file, lwcre_file_ground, rlut_file, @@ -1733,7 +1742,7 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( - **{**OBS6_SAT_FACETS, "type": "*"}, + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, short_name="lwcre", derive=True, force_derivation=True, @@ -1791,12 +1800,12 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert expected_input_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = ( - "Not all necessary input variables to derive 'lwcre' are available " - "for Dataset: lwcre, Amon, OBS6, SAT, supplementaries: pr with facets " - "{'type': 'ground'}" - ) - assert msg in log_debugs + msg = "Not all necessary input variables to derive 'lwcre' are available" + for log_debug in log_debugs: + if msg in log_debug: + break + else: + pytest.fail(f"No debug message '{msg}'") def test_match(): From ebc82ba42f1e83e4800ebb0b7d6ebcf6f8ac094e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 17:44:36 +0200 Subject: [PATCH 42/85] Differing timeranges are caught by _get_all_available_datasets --- esmvalcore/_recipe/check.py | 22 ---------------------- esmvalcore/_recipe/to_datasets.py | 2 -- tests/integration/recipe/test_check.py | 19 ------------------- 3 files changed, 43 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index fdebefb6a1..aafd4a0e3a 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import FacetValue logger = logging.getLogger(__name__) @@ -466,27 +465,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(date, timerange_list) -def differing_timeranges( - var_to_derive: FacetValue, - input_datasets: list[Dataset], -) -> None: - """Log error if input datasets have differing timeranges.""" - timeranges: set[FacetValue] = set() - for input_dataset in input_datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) - if len(timeranges) > 1: - input_datasets_str = "; ".join( - d.summary(shorten=True) for d in input_datasets - ) - msg = ( - f"Differing timeranges with values {timeranges} found for " - f"datasets {input_datasets_str} necessary to 
derive " - f"'{var_to_derive}'. Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index a5bf3d6427..7619044107 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -508,8 +508,6 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: else: input_datasets.append(input_dataset) - check.differing_timeranges(dataset.facets["short_name"], input_datasets) - return input_datasets diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 2c2f1ea745..6aec456f80 100644 --- a/tests/integration/recipe/test_check.py +++ b/tests/integration/recipe/test_check.py @@ -274,25 +274,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(): - facets = { - "project": "OBS6", - "dataset": "SAT", - "mip": "Amon", - "tier": 2, - "type": "sat", - } - input_datasets = [ - Dataset(**facets, short_name="rlut", timerange="1950/1952"), - Dataset(**facets, short_name="rlutcs", timerange="1951/1953"), - Dataset(**facets, short_name="rlut"), - ] - - msg = r"Differing timeranges with values" - with pytest.raises(ValueError, match=msg): - check.differing_timeranges("lwcre", input_datasets) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", From 58dd66645e33d1cbba99672b4eba1cff98a757e7 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:54:25 +0200 Subject: [PATCH 43/85] Use ABCs and other type hint suggestions from @bouweandela Co-authored-by: Bouwe Andela --- doc/develop/derivation.rst | 2 +- esmvalcore/_recipe/recipe.py | 22 +++++++++---------- esmvalcore/_recipe/to_datasets.py | 4 ++-- esmvalcore/local.py | 2 +- esmvalcore/preprocessor/__init__.py | 6 ++--- esmvalcore/preprocessor/_derive/__init__.py | 2 +- esmvalcore/preprocessor/_derive/_baseclass.py | 2 +- esmvalcore/preprocessor/_derive/ohc.py | 2 +- esmvalcore/preprocessor/_derive/vegfrac.py | 2 +- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/develop/derivation.rst b/doc/develop/derivation.rst index c8516d8414..44ff9b8e2b 100644 --- a/doc/develop/derivation.rst +++ b/doc/develop/derivation.rst @@ -26,7 +26,7 @@ A typical example looks like this: """Derivation of variable `dummy`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" mip = "fx" if project == "CMIP6": diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index e34c7fc52b..2992c7156e 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -104,7 +104,7 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str: def _update_target_levels( dataset: Dataset, - datasets: list[Dataset], + datasets: Sequence[Dataset], settings: dict[str, Any], ) -> None: """Replace the target levels dataset name with a filename if needed.""" @@ -142,7 +142,7 @@ def _update_target_levels( def _update_target_grid( dataset: Dataset, - datasets: list[Dataset], + datasets: Sequence[Dataset], settings: dict[str, Any], ) -> None: """Replace the target grid dataset name with a filename if needed.""" @@ -178,7 +178,7 @@ def _update_regrid_time(dataset: Dataset, settings: dict) -> None: settings["regrid_time"]["frequency"] = dataset.facets["frequency"] 
-def _select_dataset(dataset_name: str, datasets: list[Dataset]) -> Dataset: +def _select_dataset(dataset_name: str, datasets: Sequence[Dataset]) -> Dataset: for dataset in datasets: if dataset.facets["dataset"] == dataset_name: return dataset @@ -192,7 +192,7 @@ def _select_dataset(dataset_name: str, datasets: list[Dataset]) -> Dataset: def _limit_datasets( - datasets: list[Dataset], + datasets: Sequence[Dataset], profile: dict[str, Any], ) -> list[Dataset]: """Try to limit the number of datasets to max_datasets.""" @@ -321,7 +321,7 @@ def _add_to_download_list(dataset: Dataset) -> None: dataset.files[i] = file.local_file(dataset.session["download_dir"]) -def _schedule_for_download(datasets: list[Dataset]) -> None: +def _schedule_for_download(datasets: Iterable[Dataset]) -> None: """Schedule files for download.""" for dataset in datasets: _add_to_download_list(dataset) @@ -618,7 +618,7 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: def _get_preprocessor_products( datasets: list[Dataset], profile: dict[str, Any], - order: tuple[str, ...], + order: Sequence[str, ...], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. @@ -935,7 +935,7 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: ) @staticmethod - def _need_ncl(raw_diagnostics: dict[str, Any]) -> bool: + def _need_ncl(raw_diagnostics: dict[str, dict[str, Any]]) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -989,8 +989,8 @@ def _initialize_diagnostics( def _initialize_scripts( self, diagnostic_name: str, - raw_scripts: dict[str, Any], - variable_names: tuple[str, Any], + raw_scripts: dict[str, dict[str, Any]], + variable_names: Sequence[str], ) -> dict[str, Any]: """Define script in diagnostic.""" if not raw_scripts: @@ -1042,7 +1042,7 @@ def _initialize_scripts( def _resolve_diagnostic_ancestors( self, - tasks: Iterable[PreprocessingTask], + tasks: Iterable[BaseTask], ) -> None: """Resolve diagnostic ancestors.""" tasks = {t.name: t for t in tasks} @@ -1117,7 +1117,7 @@ def _update_with_ancestors(self, tasknames_to_run: set[str]) -> bool: def _create_diagnostic_tasks( self, diagnostic_name: str, - diagnostic: dict[str, Any], + diagnostic: Diagnostic, tasknames_to_run: set[str], ) -> list[BaseTask]: """Create diagnostic tasks.""" diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7cd17bdbb0..c25d4026ad 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -52,7 +52,7 @@ def _facet_to_str(facet_value: FacetValue | None) -> str: return str(facet_value) -def _set_alias(variables: list[list[Dataset]]) -> None: +def _set_alias(variables: Sequence[Sequence[Dataset]]) -> None: """Add unique alias for datasets. Generates a unique alias for each dataset that will be shared by all @@ -584,7 +584,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: datasets.append(input_dataset) # Check timeranges of available input data. 
- timeranges: set[FacetValue] = set() + timeranges: set[str] = set() for input_dataset in datasets: if "timerange" in input_dataset.facets: timeranges.add(input_dataset.facets["timerange"]) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 9b30df924b..d58d68a9b1 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -324,7 +324,7 @@ def _truncate_dates(date: str, file_date: str) -> tuple[int, int]: def _select_files( - filenames: list[LocalFile], + filenames: Iterable[LocalFile], timerange: FacetValue, ) -> list[LocalFile]: """Select files containing data between a given timerange. diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 3f275fb439..3147639ae0 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -364,7 +364,7 @@ def _run_preproc_function( function: Callable, items: Any, kwargs: Any, - input_files: list[File] | None = None, + input_files: Sequence[File] | None = None, ) -> Any: """Run preprocessor function.""" kwargs_str = ",\n".join( @@ -426,7 +426,7 @@ def _run_preproc_function( def preprocess( - items: list[PreprocessorFile | Cube | str | Path], + items: Sequence[PreprocessorFile | Cube | str | Path], step: str, input_files: list[File] | None = None, output_file: Path | None = None, @@ -484,7 +484,7 @@ def preprocess( def get_step_blocks( steps: Iterable[str], - order: list[str], + order: Sequence[str], ) -> list[list[str]]: """Group steps into execution blocks.""" blocks: list[list[str]] = [] diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cd209f88de..a27a809a21 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -41,7 +41,7 @@ def _get_all_derived_variables() -> dict[str, type[DerivedVariableBase]]: __all__ = list(ALL_DERIVED_VARIABLES) -def get_required(short_name: FacetValue, project: FacetValue) -> list[Facets]: +def get_required(short_name: str, project: str) -> list[Facets]: """Return all required variables for derivation. Get all information (at least ``short_name``) required for derivation. diff --git a/esmvalcore/preprocessor/_derive/_baseclass.py b/esmvalcore/preprocessor/_derive/_baseclass.py index 4e71f66dd6..b050921801 100644 --- a/esmvalcore/preprocessor/_derive/_baseclass.py +++ b/esmvalcore/preprocessor/_derive/_baseclass.py @@ -12,7 +12,7 @@ class DerivedVariableBase: @staticmethod @abstractmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Return required variables for derivation. 
This method needs to be overridden in the child class belonging to the diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index 87643d2d1b..1bd58c337f 100644 --- a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -16,7 +16,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `ohc`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" volcello: Facets = {"short_name": "volcello", "mip": "fx"} if project == "CMIP5": diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index edd4dce75d..007ab406f9 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -15,7 +15,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `vegFrac`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" sftlf: Facets = {"short_name": "sftlf", "mip": "fx"} if project == "CMIP5": From b1c66fdbf5ce3e2cbbc52489cf74ac7830f268b8 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:00:18 +0200 Subject: [PATCH 44/85] Ruff fixes --- esmvalcore/_recipe/recipe.py | 4 +- esmvalcore/_recipe/to_datasets.py | 2 +- esmvalcore/preprocessor/__init__.py | 44 ++++++++++--------- esmvalcore/preprocessor/_derive/__init__.py | 2 +- esmvalcore/preprocessor/_derive/_baseclass.py | 2 +- esmvalcore/preprocessor/_derive/ohc.py | 2 +- esmvalcore/preprocessor/_derive/vegfrac.py | 2 +- 7 files changed, 31 insertions(+), 27 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 2992c7156e..09cb8d850c 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -57,7 +57,7 @@ ) if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Iterable, Sequence from esmvalcore.config import Session from esmvalcore.typing import Facets @@ -66,6 +66,8 @@ PreprocessorSettings = dict[str, Any] +Diagnostic = dict[str, Any] + DOWNLOAD_FILES = set() """Use a global variable to keep track of files that need to be downloaded.""" diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index c25d4026ad..d37b5a271f 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging -from collections.abc import Iterable, Iterator +from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy from numbers import Number from typing import TYPE_CHECKING, Any diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 3147639ae0..40b835b3ed 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -15,8 +15,7 @@ from esmvalcore._task import BaseTask from esmvalcore.cmor.check import cmor_check_data, cmor_check_metadata from esmvalcore.cmor.fix import fix_data, fix_file, fix_metadata - -from ._area import ( +from esmvalcore.preprocessor._area import ( area_statistics, extract_named_regions, extract_region, @@ -24,20 +23,20 @@ meridional_statistics, zonal_statistics, ) -from ._compare_with_refs import bias, distance_metric -from ._concatenate import concatenate -from ._cycles import amplitude -from ._dask_progress import 
_compute_with_progress -from ._derive import derive -from ._detrend import detrend -from ._io import ( +from esmvalcore.preprocessor._compare_with_refs import bias, distance_metric +from esmvalcore.preprocessor._concatenate import concatenate +from esmvalcore.preprocessor._cycles import amplitude +from esmvalcore.preprocessor._dask_progress import _compute_with_progress +from esmvalcore.preprocessor._derive import derive +from esmvalcore.preprocessor._detrend import detrend +from esmvalcore.preprocessor._io import ( _get_debug_filename, _sort_products, load, save, write_metadata, ) -from ._mask import ( +from esmvalcore.preprocessor._mask import ( mask_above_threshold, mask_below_threshold, mask_fillvalues, @@ -48,21 +47,24 @@ mask_multimodel, mask_outside_range, ) -from ._multimodel import ensemble_statistics, multi_model_statistics -from ._other import clip, cumulative_sum, histogram -from ._regrid import ( +from esmvalcore.preprocessor._multimodel import ( + ensemble_statistics, + multi_model_statistics, +) +from esmvalcore.preprocessor._other import clip, cumulative_sum, histogram +from esmvalcore.preprocessor._regrid import ( extract_coordinate_points, extract_levels, extract_location, extract_point, regrid, ) -from ._rolling_window import rolling_window_statistics -from ._supplementary_vars import ( +from esmvalcore.preprocessor._rolling_window import rolling_window_statistics +from esmvalcore.preprocessor._supplementary_vars import ( add_supplementary_variables, remove_supplementary_variables, ) -from ._time import ( +from esmvalcore.preprocessor._time import ( annual_statistics, anomalies, climate_statistics, @@ -81,9 +83,9 @@ seasonal_statistics, timeseries_filter, ) -from ._trend import linear_trend, linear_trend_stderr -from ._units import accumulate_coordinate, convert_units -from ._volume import ( +from esmvalcore.preprocessor._trend import linear_trend, linear_trend_stderr +from esmvalcore.preprocessor._units import accumulate_coordinate, convert_units +from esmvalcore.preprocessor._volume import ( axis_statistics, depth_integration, extract_surface_from_atm, @@ -92,10 +94,10 @@ extract_volume, volume_statistics, ) -from ._weighting import weighting_landsea_fraction +from esmvalcore.preprocessor._weighting import weighting_landsea_fraction if TYPE_CHECKING: - from collections.abc import Callable, Iterable + from collections.abc import Callable, Iterable, Sequence from dask.delayed import Delayed diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index a27a809a21..d22eeb3a20 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -10,7 +10,7 @@ from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase from esmvalcore.preprocessor._units import convert_units -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets logger = logging.getLogger(__name__) diff --git a/esmvalcore/preprocessor/_derive/_baseclass.py b/esmvalcore/preprocessor/_derive/_baseclass.py index b050921801..b8d8bc27da 100644 --- a/esmvalcore/preprocessor/_derive/_baseclass.py +++ b/esmvalcore/preprocessor/_derive/_baseclass.py @@ -4,7 +4,7 @@ from iris.cube import Cube, CubeList -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets class DerivedVariableBase: diff --git a/esmvalcore/preprocessor/_derive/ohc.py b/esmvalcore/preprocessor/_derive/ohc.py index 1bd58c337f..d9105ffe52 100644 --- 
a/esmvalcore/preprocessor/_derive/ohc.py +++ b/esmvalcore/preprocessor/_derive/ohc.py @@ -5,7 +5,7 @@ from iris import Constraint from iris.cube import Cube, CubeList -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets from ._baseclass import DerivedVariableBase diff --git a/esmvalcore/preprocessor/_derive/vegfrac.py b/esmvalcore/preprocessor/_derive/vegfrac.py index 007ab406f9..419ae3878c 100644 --- a/esmvalcore/preprocessor/_derive/vegfrac.py +++ b/esmvalcore/preprocessor/_derive/vegfrac.py @@ -6,7 +6,7 @@ from iris.cube import Cube, CubeList from esmvalcore.preprocessor._regrid import regrid -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets from ._baseclass import DerivedVariableBase From 6be3169358f63a354a88b56b10fb82207ada2621 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:05:17 +0200 Subject: [PATCH 45/85] Use type aliases --- esmvalcore/_recipe/recipe.py | 64 ++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 09cb8d850c..f1c99dfe6e 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -64,9 +64,12 @@ logger = logging.getLogger(__name__) +Diagnostic = dict[str, Any] + PreprocessorSettings = dict[str, Any] -Diagnostic = dict[str, Any] +PreprocessorProfile = dict[str, dict[str, Any]] + DOWNLOAD_FILES = set() """Use a global variable to keep track of files that need to be downloaded.""" @@ -107,7 +110,7 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str: def _update_target_levels( dataset: Dataset, datasets: Sequence[Dataset], - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Replace the target levels dataset name with a filename if needed.""" levels = settings.get("extract_levels", {}).get("levels") @@ -145,7 +148,7 @@ def _update_target_levels( def _update_target_grid( dataset: Dataset, datasets: Sequence[Dataset], - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Replace the target grid dataset name with a filename if needed.""" grid = settings.get("regrid", {}).get("target_grid") @@ -195,7 +198,7 @@ def _select_dataset(dataset_name: str, datasets: Sequence[Dataset]) -> Dataset: def _limit_datasets( datasets: Sequence[Dataset], - profile: dict[str, Any], + profile: PreprocessorProfile, ) -> list[Dataset]: """Try to limit the number of datasets to max_datasets.""" max_datasets = datasets[0].session["max_datasets"] @@ -226,7 +229,7 @@ def _limit_datasets( return limited -def _get_default_settings(dataset: Dataset) -> dict[str, Any]: +def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: """Get default preprocessor settings.""" session = dataset.session facets = dataset.facets @@ -257,7 +260,7 @@ def _get_default_settings(dataset: Dataset) -> dict[str, Any]: def _add_dataset_specific_settings( dataset: Dataset, - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Add dataset-specific settings.""" project = dataset.facets["project"] @@ -287,7 +290,7 @@ def _add_dataset_specific_settings( def _exclude_dataset( - settings: dict[str, Any], + settings: PreprocessorSettings, facets: Facets, step: str, ) -> None: @@ -306,7 +309,7 @@ def _exclude_dataset( def _update_weighting_settings( - settings: dict[str, Any], + settings: PreprocessorSettings, facets: Facets, ) -> None: """Update settings for the weighting preprocessors.""" @@ -377,8 +380,8 @@ 
def _check_input_files(input_datasets: Iterable[Dataset]) -> set[str]: def _apply_preprocessor_profile( - settings: dict[str, Any], - profile_settings: dict[str, Any], + settings: PreprocessorSettings, + profile_settings: PreprocessorProfile, ) -> None: """Apply settings from preprocessor profile.""" profile_settings = deepcopy(profile_settings) @@ -396,7 +399,7 @@ def _apply_preprocessor_profile( def _get_common_attributes( products: set[PreprocessorFile], - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> dict[str, Any]: """Get common attributes for the output products.""" attributes: dict[str, Any] = {} @@ -455,7 +458,7 @@ def _get_downstream_settings( step: str, order: tuple[str, ...], products: set[PreprocessorFile], -) -> dict[str, Any]: +) -> PreprocessorSettings: """Get downstream preprocessor settings shared between products.""" settings = {} remaining_steps = order[order.index(step) + 1 :] @@ -471,7 +474,7 @@ def _get_downstream_settings( def _update_multi_dataset_settings( facets: Facets, - settings: dict[str, Any], + settings: PreprocessorSettings, ) -> None: """Configure multi dataset statistics.""" for step in MULTI_MODEL_FUNCTIONS: @@ -500,7 +503,7 @@ def _update_multiproduct( order: tuple[str, ...], preproc_dir: Path, step: str, -) -> tuple[set[PreprocessorFile], dict[str, Any]]: +) -> tuple[set[PreprocessorFile], PreprocessorSettings]: """Return new products that are aggregated over multiple datasets. These new products will replace the original products at runtime. @@ -526,7 +529,7 @@ def _update_multiproduct( downstream_settings = _get_downstream_settings(step, order, multiproducts) - relevant_settings: dict[str, Any] = { + relevant_settings: PreprocessorSettings = { "output_products": defaultdict(dict), } # pass to ancestors @@ -570,7 +573,7 @@ def _update_multiproduct( def update_ancestors( ancestors: set[PreprocessorFile], step: str, - downstream_settings: dict[str, Any], + downstream_settings: PreprocessorSettings, ) -> None: """Retroactively add settings to ancestor products.""" for product in ancestors: @@ -580,7 +583,10 @@ def update_ancestors( settings[key] = value -def _update_extract_shape(settings: dict[str, Any], session: Session) -> None: +def _update_extract_shape( + settings: PreprocessorSettings, + session: Session, +) -> None: if "extract_shape" in settings: shapefile = settings["extract_shape"].get("shapefile") if shapefile: @@ -619,8 +625,8 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: def _get_preprocessor_products( datasets: list[Dataset], - profile: dict[str, Any], - order: Sequence[str, ...], + profile: PreprocessorProfile, + order: Sequence[str], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. 
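# --- Editor's note (illustration only, not part of this patch) ----------
# The aliases introduced above (`Diagnostic`, `PreprocessorSettings`,
# `PreprocessorProfile`) all expand to plain dicts, so they change
# readability, not runtime behaviour. A minimal sketch of the pattern,
# using hypothetical names:
#
#     from typing import Any
#
#     PreprocessorSettings = dict[str, Any]  # step name -> keyword args
#
#     def apply_settings(settings: PreprocessorSettings) -> None:
#         """The signature now states intent, not just dict[str, Any]."""
#         for step, kwargs in settings.items():
#             print(step, kwargs)
#
# A single alias definition also gives one place to tighten the type
# later (e.g. to a TypedDict) without touching every signature.
# -------------------------------------------------------------------------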
@@ -764,7 +770,7 @@ def _set_start_end_year(product: PreprocessorFile) -> None: def _update_preproc_functions( - settings: dict[str, Any], + settings: PreprocessorSettings, dataset: Dataset, datasets: list[Dataset], missing_vars: set[str], @@ -800,7 +806,7 @@ def _update_preproc_functions( def _get_preprocessor_task( datasets: list[Dataset], - profiles: dict[str, Any], + profiles: PreprocessorProfile, task_name: str, ) -> PreprocessingTask: """Create preprocessor task(s) for a set of datasets.""" @@ -852,7 +858,9 @@ def _get_preprocessor_task( return task -def _extract_preprocessor_order(profile: dict[str, Any]) -> tuple[str, ...]: +def _extract_preprocessor_order( + profile: PreprocessorProfile, +) -> tuple[str, ...]: """Extract the order of the preprocessing steps from the profile.""" custom_order = profile.pop("custom_order", False) if not custom_order: @@ -937,7 +945,7 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: ) @staticmethod - def _need_ncl(raw_diagnostics: dict[str, dict[str, Any]]) -> bool: + def _need_ncl(raw_diagnostics: Diagnostic) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -960,8 +968,8 @@ def _initialize_provenance(self, raw_documentation: dict[str, Any]): def _initialize_diagnostics( self, - raw_diagnostics: dict[str, Any], - ) -> dict[str, Any]: + raw_diagnostics: Diagnostic, + ) -> Diagnostic: """Define diagnostics in recipe.""" logger.debug("Retrieving diagnostics from recipe") check.diagnostics(raw_diagnostics) @@ -969,7 +977,7 @@ def _initialize_diagnostics( diagnostics = {} for name, raw_diagnostic in raw_diagnostics.items(): - diagnostic: dict[str, Any] = {} + diagnostic: Diagnostic = {} diagnostic["name"] = name diagnostic["datasets"] = [ ds for ds in self.datasets if ds.facets["diagnostic"] == name @@ -1155,7 +1163,7 @@ def _create_diagnostic_tasks( def _create_preprocessor_tasks( self, diagnostic_name: str, - diagnostic: dict[str, Any], + diagnostic: Diagnostic, tasknames_to_run: set[str], any_diag_script_is_run: bool, ) -> tuple[list[BaseTask], list[RecipeError]]: @@ -1321,7 +1329,7 @@ def get_output(self) -> dict[str, Any]: Returns ------- - product_filenames : dict + dict Lists of products/attributes grouped by task. 
""" output: dict[str, Any] = {} From 5744b0d9b4a1774f6203881263a925253f742e66 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 19:09:19 +0200 Subject: [PATCH 46/85] Do not change minimal facets Co-authored-by: Bouwe Andela --- esmvalcore/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 64dd892096..dc211b6668 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -577,7 +577,7 @@ def __getitem__(self, key: str) -> FacetValue: def __setitem__(self, key: str, value: FacetValue) -> None: """Set a facet value.""" - self.set_facet(key, value, persist=False) + self.facets[key] = value def set_facet( self, From cbcf37b4c8e4d07fa26fd3e588783c22f99a9086 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:16:53 +0200 Subject: [PATCH 47/85] Used more type aliases --- esmvalcore/preprocessor/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 40b835b3ed..5ded4bbbd9 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -364,10 +364,10 @@ def _get_multi_model_settings( def _run_preproc_function( function: Callable, - items: Any, + items: PreprocessorItem | Iterable[PreprocessorItem], kwargs: Any, input_files: Sequence[File] | None = None, -) -> Any: +) -> PreprocessorItem | Iterable[PreprocessorItem]: """Run preprocessor function.""" kwargs_str = ",\n".join( [f"{k} = {pformat(v)}" for (k, v) in kwargs.items()], @@ -428,13 +428,13 @@ def _run_preproc_function( def preprocess( - items: Sequence[PreprocessorFile | Cube | str | Path], + items: Sequence[PreprocessorItem], step: str, input_files: list[File] | None = None, output_file: Path | None = None, debug: bool = False, **settings: Any, -) -> list[PreprocessorFile | Cube | str | Path]: +) -> list[PreprocessorItem]: """Run preprocessor.""" logger.debug("Running preprocessor step %s", step) function = globals()[step] @@ -656,6 +656,9 @@ def group(self, keys: list) -> str: return "_".join(identifier) +PreprocessorItem = PreprocessorFile | Cube | str | Path + + def _apply_multimodel( products: set[PreprocessorFile], step: str, From 14e8b5ed602dab8996f79c995a779ef103254fd6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 19:19:37 +0200 Subject: [PATCH 48/85] Fix typo in func name Co-authored-by: Bouwe Andela --- tests/integration/recipe/test_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index b127bcc3e7..5bd6ad47dc 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1543,7 +1543,7 @@ def test_diagnostic_task_provenance( assert os.path.exists(prefix + ".xml") -def test_invalid_diagnostcic_ancestor( +def test_invalid_diagnostic_ancestor( tmp_path, patched_datafinder, session, From ecbecc6e3fec487238daeae25e3d65c09f0e37fd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:32:05 +0200 Subject: [PATCH 49/85] Make mypy happy --- esmvalcore/_recipe/check.py | 4 ++-- esmvalcore/_recipe/recipe.py | 4 ++-- esmvalcore/_recipe/to_datasets.py | 4 ++-- esmvalcore/local.py | 4 ++-- esmvalcore/preprocessor/__init__.py | 8 ++++---- esmvalcore/preprocessor/_derive/__init__.py | 1 - 
esmvalcore/preprocessor/_derive/qep.py | 4 ++-- 7 files changed, 14 insertions(+), 15 deletions(-) diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index e79db4f2ee..a33868da74 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -36,7 +36,7 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets, FacetValue + from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -467,7 +467,7 @@ def valid_time_selection(timerange: str) -> None: def differing_timeranges( - timeranges: set[FacetValue], + timeranges: set[str], required_vars: list[Facets], ) -> None: """Log error if required variables have differing timeranges.""" diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index f1c99dfe6e..392b130708 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -203,7 +203,7 @@ def _limit_datasets( """Try to limit the number of datasets to max_datasets.""" max_datasets = datasets[0].session["max_datasets"] if not max_datasets: - return datasets + return list(datasets) logger.info("Limiting the number of datasets to %s", max_datasets) @@ -626,7 +626,7 @@ def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: def _get_preprocessor_products( datasets: list[Dataset], profile: PreprocessorProfile, - order: Sequence[str], + order: tuple[str, ...], name: str, ) -> set[PreprocessorFile]: """Get preprocessor product definitions for a set of datasets. diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index d37b5a271f..7aab83719b 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -563,7 +563,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: # Configure input datasets needed to derive variable datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) + required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore # idea: add option to specify facets in list of dicts that is value of # 'derive' in the recipe and use that instead of get_required? 
for input_facets in required_vars: @@ -587,7 +587,7 @@ def _get_input_datasets(dataset: Dataset) -> list[Dataset]: timeranges: set[str] = set() for input_dataset in datasets: if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) + timeranges.add(input_dataset.facets["timerange"]) # type: ignore check.differing_timeranges(timeranges, required_vars) return datasets diff --git a/esmvalcore/local.py b/esmvalcore/local.py index d58d68a9b1..221e2796f5 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -338,9 +338,9 @@ def _select_files( timerange = str(timerange) if "*" in timerange: # TODO: support * combined with a period - return filenames + return list(filenames) - selection = [] + selection: list[LocalFile] = [] for filename in filenames: start: int | str diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 5ded4bbbd9..f043b7ef91 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -7,7 +7,7 @@ import logging from pathlib import Path from pprint import pformat -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypeAlias from iris.cube import Cube, CubeList @@ -364,10 +364,10 @@ def _get_multi_model_settings( def _run_preproc_function( function: Callable, - items: PreprocessorItem | Iterable[PreprocessorItem], + items: PreprocessorItem | Sequence[PreprocessorItem], kwargs: Any, input_files: Sequence[File] | None = None, -) -> PreprocessorItem | Iterable[PreprocessorItem]: +) -> PreprocessorItem | Sequence[PreprocessorItem]: """Run preprocessor function.""" kwargs_str = ",\n".join( [f"{k} = {pformat(v)}" for (k, v) in kwargs.items()], @@ -656,7 +656,7 @@ def group(self, keys: list) -> str: return "_".join(identifier) -PreprocessorItem = PreprocessorFile | Cube | str | Path +PreprocessorItem: TypeAlias = PreprocessorFile | Cube | str | Path def _apply_multimodel( diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index d22eeb3a20..cbf138e2d7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -59,7 +59,6 @@ def get_required(short_name: str, project: str) -> list[Facets]: List of facets (including at least the key ``short_name``). 
""" - short_name = str(short_name) if short_name.lower() not in ALL_DERIVED_VARIABLES: msg = ( f"Cannot derive variable '{short_name}': no derivation script " diff --git a/esmvalcore/preprocessor/_derive/qep.py b/esmvalcore/preprocessor/_derive/qep.py index 19d677f618..9f684cde5d 100644 --- a/esmvalcore/preprocessor/_derive/qep.py +++ b/esmvalcore/preprocessor/_derive/qep.py @@ -3,7 +3,7 @@ from iris import Constraint from iris.cube import Cube, CubeList -from esmvalcore.typing import Facets, FacetValue +from esmvalcore.typing import Facets from ._baseclass import DerivedVariableBase @@ -12,7 +12,7 @@ class DerivedVariable(DerivedVariableBase): """Derivation of variable `qep`.""" @staticmethod - def required(project: FacetValue) -> list[Facets]: + def required(project: str) -> list[Facets]: """Declare the variables needed for derivation.""" return [ {"short_name": "evspsbl"}, From d7c73aa172b3a91d172080d60f55bb5467263531 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:42:51 +0200 Subject: [PATCH 50/85] Use type aliases in regrid.py --- esmvalcore/preprocessor/_regrid.py | 52 ++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 62b1b27e83..84ac2a4f4e 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -12,7 +12,7 @@ from copy import deepcopy from decimal import Decimal from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import dask.array as da import iris @@ -79,12 +79,23 @@ _LON_MAX = 360.0 _LON_RANGE = _LON_MAX - _LON_MIN +NamedPointInterpolationScheme = Literal[ + "linear", + "nearest", +] + # Supported point interpolation schemes. POINT_INTERPOLATION_SCHEMES = { "linear": Linear(extrapolation_mode="mask"), "nearest": Nearest(extrapolation_mode="mask"), } +NamedHorizontalScheme = Literal[ + "area_weighted", + "linear", + "nearest", +] + # Supported horizontal regridding schemes for regular grids (= rectilinear # grids; i.e., grids that can be described with 1D latitude and 1D longitude # coordinates orthogonal to each other) @@ -119,8 +130,15 @@ "nearest": UnstructuredNearest(), } +NamedVerticalScheme = Literal[ + "linear", + "nearest", + "linear_extrapolate", + "nearest_extrapolate", +] + # Supported vertical interpolation schemes. -VERTICAL_SCHEMES: tuple[str, ...] = ( +VERTICAL_SCHEMES: tuple[NamedVerticalScheme, ...] = ( "linear", "nearest", "linear_extrapolate", @@ -386,7 +404,11 @@ def add_bounds_from_step(coord: Coord, step: float) -> np.ndarray: return cube -def extract_location(cube: Cube, location: str, scheme: str) -> Cube: +def extract_location( + cube: Cube, + location: str, + scheme: NamedPointInterpolationScheme, +) -> Cube: """Extract a point using a location name, with interpolation. Extracts a single location point from a cube, according @@ -477,7 +499,7 @@ def extract_point( cube: Cube, latitude: ArrayLike, longitude: ArrayLike, - scheme: str, + scheme: NamedPointInterpolationScheme, ) -> Cube: """Extract a point, with interpolation. 
@@ -604,7 +626,11 @@ def _get_target_grid_cube( return target_grid_cube -def _load_scheme(src_cube: Cube, tgt_cube: Cube, scheme: str | dict): +def _load_scheme( + src_cube: Cube, + tgt_cube: Cube, + scheme: NamedHorizontalScheme | dict[str, Any], +): """Return scheme that can be used in :meth:`iris.cube.Cube.regrid`.""" loaded_scheme: Any = None @@ -637,7 +663,7 @@ def _load_scheme(src_cube: Cube, tgt_cube: Cube, scheme: str | dict): return loaded_scheme -def _load_generic_scheme(scheme: dict): +def _load_generic_scheme(scheme: dict[str, Any]): """Load generic regridding scheme.""" scheme = dict(scheme) # do not overwrite original scheme @@ -677,7 +703,7 @@ def _load_generic_scheme(scheme: dict): def _get_regridder( src_cube: Cube, tgt_cube: Cube, - scheme: str | dict, + scheme: NamedHorizontalScheme | dict, cache_weights: bool, ): """Get regridder to actually perform regridding. @@ -731,7 +757,7 @@ def _get_coord_key(src_cube: Cube, tgt_cube: Cube) -> tuple[ArrayLike, ...]: def _get_name_and_shape_key( src_cube: Cube, tgt_cube: Cube, - scheme: str | dict, + scheme: NamedHorizontalScheme | dict, ) -> tuple[str, tuple[int, ...]]: """Get dict key from scheme name and coordinate shapes.""" name = str(scheme) @@ -743,7 +769,7 @@ def _get_name_and_shape_key( def regrid( cube: Cube, target_grid: Cube | Dataset | Path | str | dict, - scheme: str | dict, + scheme: NamedHorizontalScheme | dict, lat_offset: bool = True, lon_offset: bool = True, cache_weights: bool = False, @@ -888,7 +914,7 @@ def regrid( # Load scheme and reuse existing regridder if possible if isinstance(scheme, str): - scheme = scheme.lower() + scheme = scheme.lower() # type: ignore regridder = _get_regridder(cube, target_grid_cube, scheme, cache_weights) # Rechunk and actually perform the regridding @@ -1181,7 +1207,7 @@ def _preserve_fx_vars(cube: iris.cube.Cube, result: iris.cube.Cube) -> None: add_ancillary_variable(result, ancillary_cube) -def parse_vertical_scheme(scheme: str) -> tuple[str, str]: +def parse_vertical_scheme(scheme: NamedVerticalScheme) -> tuple[str, str]: """Parse the scheme provided for level extraction. Parameters @@ -1224,7 +1250,7 @@ def parse_vertical_scheme(scheme: str) -> tuple[str, str]: def extract_levels( cube: iris.cube.Cube, levels: np.typing.ArrayLike | da.Array, - scheme: str, + scheme: NamedVerticalScheme, coordinate: str | None = None, rtol: float = 1e-7, atol: float | None = None, @@ -1422,7 +1448,7 @@ def get_reference_levels(dataset: Dataset) -> list[float]: def extract_coordinate_points( cube: Cube, definition: dict[str, ArrayLike], - scheme: str, + scheme: NamedPointInterpolationScheme, ) -> Cube: """Extract points from any coordinate with interpolation. From 69e05029bca9d467e4f6b401ef8e506e652df23e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:45:25 +0200 Subject: [PATCH 51/85] Valid return type in docstring --- esmvalcore/preprocessor/_regrid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 84ac2a4f4e..556949810d 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -342,10 +342,9 @@ def _spec_to_latlonvals( Returns ------- - xvals: np.array - List of longitudes - yvals: np.array - List of latitudes + tuple[np.ndarray, np.ndarray] + Longitudes, Latitudes. + """ if step_latitude == 0: msg = f"Latitude step cannot be 0, got step_latitude={step_latitude}." 
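Editor's note: the two patches above lean on `typing.Literal` aliases for
scheme names. The following self-contained sketch (the alias and function
names are invented for illustration, not ESMValCore API) shows what this
buys: mypy rejects misspelled scheme names at type-check time, while
generic schemes can still be passed as plain dicts via a union such as
`NamedHorizontalScheme | dict[str, Any]`.

    from typing import Literal

    # Invented alias mirroring the NamedVerticalScheme pattern above.
    VerticalScheme = Literal[
        "linear",
        "nearest",
        "linear_extrapolate",
        "nearest_extrapolate",
    ]


    def describe_scheme(scheme: VerticalScheme) -> str:
        """Return a human-readable description of a vertical scheme."""
        # mypy narrows `scheme` to the four allowed strings, so this
        # string handling is known to be exhaustive.
        extrapolate = scheme.endswith("_extrapolate")
        base = scheme.removesuffix("_extrapolate")
        suffix = "with" if extrapolate else "without"
        return f"{base} interpolation ({suffix} extrapolation)"


    describe_scheme("linear")  # OK
    # describe_scheme("liner")  # rejected by mypy: invalid literal

At runtime nothing is enforced (a `Literal` is erased like any other
annotation), which is why the patches keep runtime handling such as the
`scheme.lower()  # type: ignore` cast and the dict-based generic scheme
path.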
From 6eedca2cef8e2dfb157ffff3e63b51c6251043cb Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:46:20 +0200 Subject: [PATCH 52/85] Avoid Coord --- esmvalcore/preprocessor/_regrid.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/esmvalcore/preprocessor/_regrid.py b/esmvalcore/preprocessor/_regrid.py index 556949810d..262283fb1f 100644 --- a/esmvalcore/preprocessor/_regrid.py +++ b/esmvalcore/preprocessor/_regrid.py @@ -50,7 +50,6 @@ if TYPE_CHECKING: from collections.abc import Iterable - from iris.coords import Coord from numpy.typing import ArrayLike from esmvalcore.dataset import Dataset @@ -391,7 +390,10 @@ def _regional_stock_cube(spec: dict[str, Any]) -> Cube: circular=True, ) - def add_bounds_from_step(coord: Coord, step: float) -> np.ndarray: + def add_bounds_from_step( + coord: iris.coords.DimCoord | iris.coords.AuxCoord, + step: float, + ) -> np.ndarray: """Calculate bounds from the given step.""" bound = step / 2 points = coord.points From 62c1996a634373a158bcacb0d84c394a5073f557 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:49:44 +0200 Subject: [PATCH 53/85] Correct type hint --- esmvalcore/preprocessor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index f043b7ef91..86552814a5 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -568,7 +568,7 @@ def apply(self, step: str, debug: bool = False) -> None: ) @property - def cubes(self) -> CubeList: + def cubes(self) -> list[Cube]: """Cubes.""" if self._cubes is None: self._cubes = [ds.load() for ds in self.datasets] # type: ignore From 8f2f1795c1a577ee5399ceb894371eee4b3a3662 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:53:35 +0200 Subject: [PATCH 54/85] Assign new variable for new type --- esmvalcore/local.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 221e2796f5..cf71097439 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -343,16 +343,12 @@ def _select_files( selection: list[LocalFile] = [] for filename in filenames: - start: int | str - end: int | str - start_date: int | str - end_date: int | str start_date, end_date = _parse_period(timerange) start, end = _get_start_end_date(filename) - start_date, end = _truncate_dates(start_date, end) - end_date, start = _truncate_dates(end_date, start) - if start <= end_date and end >= start_date: + start_date_int, end_int = _truncate_dates(start_date, end) + end_date_int, start_int = _truncate_dates(end_date, start) + if start_int <= end_date_int and end_int >= start_date_int: selection.append(filename) return selection From 7bc1bee8c02cea833954b01ba96c28cab0cdc3dc Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Tue, 15 Jul 2025 19:54:46 +0200 Subject: [PATCH 55/85] Raise error for invalid type Co-authored-by: Bouwe Andela --- esmvalcore/local.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index cf71097439..967caa47b8 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -252,7 +252,9 @@ def _parse_period(timerange: FacetValue) -> tuple[str, str]: reference point in order to compute the start and end dates needed for file selection. 
""" - timerange = str(timerange) + if not isinstance(timerange, str): + msg = f"`timerange` should be a `str`, got '{type(timerange)}'" + raise TypeError(msg) start_date: str | None = None end_date: str | None = None time_format = None From 62067fca3242c2d16410d9e8d406d6c09ae4ba2f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 19:58:47 +0200 Subject: [PATCH 56/85] Fail if invalid types given --- esmvalcore/local.py | 6 ++++-- tests/integration/test_local.py | 7 +++++++ tests/unit/local/test_select_files.py | 6 ++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 967caa47b8..60d56d6a89 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -253,7 +253,7 @@ def _parse_period(timerange: FacetValue) -> tuple[str, str]: for file selection. """ if not isinstance(timerange, str): - msg = f"`timerange` should be a `str`, got '{type(timerange)}'" + msg = f"`timerange` should be a `str`, got {type(timerange)}" raise TypeError(msg) start_date: str | None = None end_date: str | None = None @@ -337,7 +337,9 @@ def _select_files( Otherwise, the file selection occurs taking into account the time resolution of the file. """ - timerange = str(timerange) + if not isinstance(timerange, str): + msg = f"`timerange` should be a `str`, got {type(timerange)}" + raise TypeError(msg) if "*" in timerange: # TODO: support * combined with a period return list(filenames) diff --git a/tests/integration/test_local.py b/tests/integration/test_local.py index e2dae85dff..633d7b45da 100644 --- a/tests/integration/test_local.py +++ b/tests/integration/test_local.py @@ -11,6 +11,7 @@ from esmvalcore.local import ( LocalFile, _get_output_file, + _parse_period, _select_drs, find_files, ) @@ -138,3 +139,9 @@ def test_select_invalid_drs_structure(): ) with pytest.raises(KeyError, match=msg): _select_drs("input_dir", "CMIP6", "_INVALID_STRUCTURE_") + + +def test_parse_period_invalid_timerange_type(): + msg = r"`timerange` should be a `str`, got " + with pytest.raises(TypeError, match=msg): + _parse_period(1) diff --git a/tests/unit/local/test_select_files.py b/tests/unit/local/test_select_files.py index 377d05421c..7ecc571ab2 100644 --- a/tests/unit/local/test_select_files.py +++ b/tests/unit/local/test_select_files.py @@ -170,3 +170,9 @@ def test_select_files_varying_format(): assert result_yearly == files assert result_monthly == files[0:2] assert result_daily == [files[0]] + + +def test_select_files_invalid_timerange_type(): + msg = r"`timerange` should be a `str`, got " + with pytest.raises(TypeError, match=msg): + _select_files([], 1) From b12df84927837228dd0acf482898d4f9cbb4c903 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 15 Jul 2025 20:01:49 +0200 Subject: [PATCH 57/85] Restore _pattern_filter --- esmvalcore/dataset.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index dc211b6668..bc5998d58a 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -666,16 +666,9 @@ def augment_facets(self) -> None: supplementary._augment_facets() # noqa: SLF001 @staticmethod - def _pattern_filter( - patterns: Iterable[FacetValue], - name: FacetValue, - ) -> list[str]: + def _pattern_filter(patterns: Iterable[str], name) -> list[str]: """Get the subset of the list `patterns` that `name` matches.""" - return [ - str(pat) - for pat in patterns - if fnmatch.fnmatchcase(str(name), str(pat)) - ] + return [pat for pat in patterns if 
fnmatch.fnmatchcase(name, pat)]
 
     def _get_extra_facets(self) -> dict[str, Any]:
         """Get extra facets of dataset."""
 
From 22ab6e72a7404f697d0574d7471193fca7ba678e Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 20:12:47 +0200
Subject: [PATCH 58/85] Better _special_name_to_dataset

---
 esmvalcore/_recipe/recipe.py     | 10 ++++++++++
 tests/unit/recipe/test_recipe.py | 16 ++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
index 392b130708..027b2472dd 100644
--- a/esmvalcore/_recipe/recipe.py
+++ b/esmvalcore/_recipe/recipe.py
@@ -102,6 +102,16 @@ def _special_name_to_dataset(facets: Facets, special_name: str) -> str:
             )
         )
         raise RecipeError(msg)
+
+    if not isinstance(facets[special_name], str):
+        msg = (
+            f"Preprocessor '{facets['preprocessor']}' uses "
+            f"'{special_name}', but '{special_name}' is not a `str` for "
+            f"variable '{facets['variable_group']}' of diagnostic "
+            f"'{facets['diagnostic']}', got '{facets[special_name]}' "
+            f"({type(facets[special_name])})"
+        )
+        raise RecipeError(msg)
     special_name = str(facets[special_name])
 
     return special_name

diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py
index 06299781ad..a3678924e8 100644
--- a/tests/unit/recipe/test_recipe.py
+++ b/tests/unit/recipe/test_recipe.py
@@ -987,3 +987,19 @@ def test_update_extract_shape_rel_shapefile(shapefile, session, tmp_path):
         / "ar6.shp"
     )
     assert settings["extract_shape"]["shapefile"] == ar6_file
+
+
+def test_special_name_to_dataset_invalid_special_name_type():
+    facets = {
+        "preprocessor": "preproc",
+        "variable_group": "var",
+        "diagnostic": "diag",
+        "reference_dataset": 1,
+    }
+    msg = (
+        r"Preprocessor 'preproc' uses 'reference_dataset', but "
+        r"'reference_dataset' is not a `str` for variable 'var' of diagnostic "
+        r"'diag', got '1' \(<class 'int'>\)"
+    )
+    with pytest.raises(RecipeError, match=msg):
+        _recipe._special_name_to_dataset(facets, "reference_dataset")

From 36724efd0f77a503329e90fbeb352ac0db4bc5b3 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 20:14:18 +0200
Subject: [PATCH 59/85] Do not cast to str

---
 esmvalcore/_recipe/recipe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
index 027b2472dd..64b3c1c5bb 100644
--- a/esmvalcore/_recipe/recipe.py
+++ b/esmvalcore/_recipe/recipe.py
@@ -233,7 +233,7 @@ def _limit_datasets(
 
     logger.info(
         "Only considering %s",
-        ", ".join(str(d.facets["alias"]) for d in limited),
+        ", ".join(d.facets["alias"] for d in limited),  # type: ignore
     )
 
     return limited

From 6ad2fefaae5db5e30cf61ddd22901ac98fb386da Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Jul 2025 20:18:05 +0200
Subject: [PATCH 60/85] Use int variables

---
 esmvalcore/_recipe/recipe.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py
index 64b3c1c5bb..c749f4aff1 100644
--- a/esmvalcore/_recipe/recipe.py
+++ b/esmvalcore/_recipe/recipe.py
@@ -427,32 +427,31 @@ def _get_common_attributes(
         if "timerange" not in product.attributes:
             continue
         timerange = product.attributes["timerange"]
-        start: int | str
-        end: int | str
         start, end = _parse_period(timerange)
         if "timerange" not in attributes:
             attributes["timerange"] = _dates_to_timerange(start, end)
         else:
-            start_date: int | str
-            end_date: int | str
             start_date, end_date = _parse_period(attributes["timerange"])
-            start_date, start = 
_truncate_dates(start_date, start) - end_date, end = _truncate_dates(end_date, end) + start_date_int, start_int = _truncate_dates(start_date, start) + end_date_int, end_int = _truncate_dates(end_date, end) # If "span=overlap", always use the latest start_date and the # earliest end_date if span == "overlap": - start_date = max([start, start_date]) - end_date = min([end, end_date]) + start_date_int = max([start_int, start_date_int]) + end_date_int = min([end_int, end_date_int]) # If "span=full", always use the earliest start_date and the latest # end_date. Note: span can only take the values "overlap" or "full" # (this is checked earlier). else: - start_date = min([start, start_date]) - end_date = max([end, end_date]) + start_date_int = min([start_int, start_date_int]) + end_date_int = max([end_int, end_date_int]) - attributes["timerange"] = _dates_to_timerange(start_date, end_date) + attributes["timerange"] = _dates_to_timerange( + start_date_int, + end_date_int, + ) # Ensure that attributes start_year and end_year are always available if at # least one of the input datasets defines it From 74983d52ea3d6c59a3dfc81283c2939aa2f66d7f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 17:48:06 +0200 Subject: [PATCH 61/85] Add doc --- esmvalcore/dataset.py | 9 ++++-- esmvalcore/preprocessor/_derive/__init__.py | 36 +++++++++++++++++++-- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 6fd81d9147..c8969c0436 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -103,10 +103,12 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. + input_datasets: list[Dataset] + Input datasets. """ _SUMMARY_FACETS: tuple[str, ...] = ( @@ -406,7 +408,6 @@ def from_files(self) -> Iterator[Dataset]: The facet values for local files are retrieved from the directory tree where the directories represent the facets values. - Reading facet values from file names is not yet supported. See :ref:`CMOR-DRS` for more information on this kind of file organization. @@ -424,6 +425,10 @@ def from_files(self) -> Iterator[Dataset]: Supplementary datasets will in inherit the facet values from the main dataset for those facets listed in :obj:`INHERITED_FACETS`. + This also works for :ref:`derived variables `. The + input datasets that are can be used for derivation are available via + :attr:`Dataset.input_datasets`. + Examples -------- See :ref:`/notebooks/discovering-data.ipynb` for example use cases. diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index cbf138e2d7..290e29c84e 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -2,6 +2,7 @@ import importlib import logging +from collections.abc import Sequence from copy import deepcopy from pathlib import Path @@ -70,7 +71,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -81,8 +82,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -96,6 +96,36 @@ def derive( ------- iris.cube.Cube The new derived variable. 
+ + Examples + -------- + Input variables for derivation can be obtained via + :attr:`Dataset.input_datasets`. + + To derive the longwave cloud radiative effect (LWCRE) for the model CESM2, + you can use: + + >>> from esmvalcore.dataset import Dataset + from esmv>>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.input_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... ) # doctest: +SKIP + """ if short_name == cubes[0].var_name: return cubes[0] From acaf9fdb75bf734f32dc51f56555418a8255db65 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:08:41 +0200 Subject: [PATCH 62/85] Expand notebook --- notebooks/discovering-data.ipynb | 243 ++++++++++++++++++++++++++++++- 1 file changed, 236 insertions(+), 7 deletions(-) diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..9078c2523e 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], @@ -89,7 +89,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 727 datasets, showing the first 10:\n" ] }, { @@ -168,20 +168,20 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'AWI'},\n", " Dataset:\n", " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'tas',\n", - " 'ensemble': 'r2i1p1f1',\n", + " 'ensemble': 'r1i1p1f1',\n", " 'exp': 'historical',\n", " 'grid': 'gn',\n", " 'institute': 'BCC'}]" @@ -253,7 +253,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]" ] }, "execution_count": 6, @@ -282,7 +282,7 @@ { "data": { "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" ] }, "execution_count": 7, @@ -312,6 +312,235 @@ "source": [ "download(dataset.files, CFG[\"download_dir\"])" ] + }, + { + "cell_type": "markdown", + "id": "d3006d90", + "metadata": {}, + "source": [ + "`Dataset.from_files` can also handle derived variables properly:" + ] + }, + { + 
"cell_type": "code", + "execution_count": 9, + "id": "b75314e3", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 36 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'TaiESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AS-RCEC'},\n", + " Dataset:\n", + " {'dataset': 'AWI-CM-1-1-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'AWI-ESM-1-REcoM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'AWI'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAMS-CSM1-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAMS'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 
'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'IITM-ESM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CCCR-IITM'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets will be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "rlutcs\n", + "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + ] + } + ], + "source": [ + "for d in dataset.input_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" + ] } ], "metadata": { From f6e531b867e9cae9b9d8a1d467b366f99de7412e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:13:36 +0200 Subject: [PATCH 63/85] Fix doc build --- esmvalcore/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index c8969c0436..291a0b16df 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -107,8 +107,6 @@ class Dataset: List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. - input_datasets: list[Dataset] - Input datasets. """ _SUMMARY_FACETS: tuple[str, ...] = ( From 30b6f537881190381161fdc98f5708adfd82b9b2 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:24:32 +0200 Subject: [PATCH 64/85] Update doc --- esmvalcore/dataset.py | 4 +++- notebooks/discovering-data.ipynb | 18 +++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 291a0b16df..5fc24adb53 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -231,6 +231,8 @@ def input_datasets(self) -> list[Dataset]: Derivation is necessary if the facet ``force_derivation=True`` is set or no files for the dataset itself are available. + See also :func:`esmvalcore.preprocessor.derive` for an example usage. 
+ """ if self._input_datasets: return self._input_datasets @@ -424,7 +426,7 @@ def from_files(self) -> Iterator[Dataset]: dataset for those facets listed in :obj:`INHERITED_FACETS`. This also works for :ref:`derived variables `. The - input datasets that are can be used for derivation are available via + input datasets that are necessary for derivation can be accessed via :attr:`Dataset.input_datasets`. Examples diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index 9078c2523e..581e8ca249 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -342,14 +342,6 @@ ")" ] }, - { - "cell_type": "markdown", - "id": "18e3a0b7", - "metadata": {}, - "source": [ - "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." - ] - }, { "cell_type": "code", "execution_count": 10, @@ -489,12 +481,20 @@ "datasets[:10]" ] }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, { "cell_type": "markdown", "id": "f00a886f", "metadata": {}, "source": [ - "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets will be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" ] }, { From 1cdfef2f9e04882ea76c8c032aef4bce2e300e61 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 16 Jul 2025 18:27:14 +0200 Subject: [PATCH 65/85] Better derivation example --- esmvalcore/preprocessor/_derive/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 290e29c84e..5c14367dd6 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -100,13 +100,13 @@ def derive( Examples -------- Input variables for derivation can be obtained via - :attr:`Dataset.input_datasets`. + :attr:`esmvalcore.dataset.Dataset.input_datasets`. - To derive the longwave cloud radiative effect (LWCRE) for the model CESM2, - you can use: + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: >>> from esmvalcore.dataset import Dataset - from esmv>>> from esmvalcore.preprocessor import derive + >>> from esmvalcore.preprocessor import derive >>> dataset = Dataset( ... project="CMIP6", ... dataset="CESM2", @@ -124,7 +124,9 @@ def derive( ... short_name="lwcre", ... long_name="TOA Longwave Cloud Radiative Effect", ... units="W m-2", - ... ) # doctest: +SKIP + ... 
) + >>> print(cube.var_name) + lwcre # doctest: +SKIP """ if short_name == cubes[0].var_name: From 7ec32814f5fffc070f47f63005b5042d7aa2b076 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 09:59:39 +0100 Subject: [PATCH 66/85] Load default data sources in global session fixture and fix first tests --- tests/conftest.py | 40 +++++++++++++++++++++++++++++- tests/unit/test_dataset.py | 50 +++----------------------------------- 2 files changed, 43 insertions(+), 47 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 46cabf58f9..3c19e4c4df 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import warnings from copy import deepcopy from functools import lru_cache @@ -7,6 +8,7 @@ import numpy as np import pytest +import yaml from cf_units import Unit from iris.coords import ( AncillaryVariable, @@ -17,6 +19,7 @@ ) from iris.cube import Cube +import esmvalcore from esmvalcore.config import CFG, Config if TYPE_CHECKING: @@ -55,6 +58,33 @@ def ignore_existing_user_config( monkeypatch.setattr(CFG, "_mapping", cfg_default._mapping) +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + @pytest.fixture def session( tmp_path: Path, @@ -63,7 +93,15 @@ def session( ) -> Session: """Session object with default settings.""" monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") - return CFG.start_session("recipe_test") + session = CFG.start_session("recipe_test") + projects = _load_default_data_sources()["projects"] + for project in projects: + print(project) + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session @pytest.fixture diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 071480d1fb..b6aa3a8bab 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,16 +1,13 @@ from __future__ import annotations -import importlib.resources import logging import textwrap from collections import defaultdict -from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING import pyesgf import pytest -import yaml import esmvalcore.dataset import esmvalcore.io.esgf @@ -25,45 +22,6 @@ from esmvalcore.typing import Facets -@lru_cache -def _load_default_data_sources() -> dict[ - str, - dict[str, dict[str, dict[str, dict[str, str]]]], -]: - """Load default data sources for local users.""" - cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { - "projects": {}, - } - for file in ( - "data-local.yml", - "data-local-esmvaltool.yml", - "data-native-cesm.yml", - "data-native-emac.yml", - "data-native-icon.yml", - "data-native-ipslcm.yml", - ): - with importlib.resources.as_file( - importlib.resources.files(esmvalcore.config) - / 
"configurations" - / file, - ) as config_file: - content = config_file.read_text(encoding="utf-8") - cfg["projects"].update(yaml.safe_load(content)["projects"]) - return cfg - - -@pytest.fixture -def session(tmp_path: Path, session: Session) -> Session: - """Session fixture with default local data sources.""" - projects = _load_default_data_sources()["projects"] - for project in projects: - data_sources = projects[project]["data"] - for data_source in data_sources.values(): - data_source["rootpath"] = str(tmp_path) - session["projects"][project]["data"] = data_sources - return session - - def test_repr(): ds = Dataset(short_name="tas", dataset="dataset1") @@ -2267,7 +2225,7 @@ def test_set_version_non_derived_var(): assert dataset.supplementaries[0].facets["version"] == "v3" -def test_set_version_derived_var(monkeypatch): +def test_set_version_derived_var(monkeypatch, session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="areacella") dataset.files = [] @@ -2868,7 +2826,7 @@ def test_derivation_necessary_no_force_derivation_no_files( assert dataset._derivation_necessary() is True -def test_derivation_necessary_no_force_derivation_no_files_glob(): +def test_derivation_necessary_no_force_derivation_no_files_glob(session): dataset = Dataset( **{**OBS6_SAT_FACETS, "timerange": "*"}, short_name="lwcre", @@ -2952,7 +2910,7 @@ def test_add_derived_supplementary_to_derived(): assert dataset.supplementaries[0] == expected_supplementary -def test_input_datasets_derivation(): +def test_input_datasets_derivation(session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) dataset.add_supplementary(short_name="pr") @@ -3008,7 +2966,7 @@ def test_input_datasets_no_force_derivation(tmp_path, session): assert dataset.input_datasets == [dataset] -def test_input_datasets_no_derivation_available(): +def test_input_datasets_no_derivation_available(session): dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True) msg = r"Cannot derive variable 'tas': no derivation script available" From 2bfc1fa5a5b5a028e1af908e8ee0ecf65b37b9d6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 11:08:40 +0100 Subject: [PATCH 67/85] Fixed recipe test --- tests/integration/recipe/test_recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 5d12e15219..7ebddfdfde 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -1707,7 +1707,7 @@ def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, assert dataset["alias"] == "CORDEX_ICHEC-EC-EARTH" else: assert dataset["alias"] == "CORDEX_MIROC-MIROC5" - elif dataset["version"] == 1: + elif dataset["version"] == "1": assert dataset["alias"] == "OBS_1" else: assert dataset["alias"] == "OBS_2" From f0f2b6e61ca90f49a035fdc21c193077be03362f Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 12:47:03 +0100 Subject: [PATCH 68/85] We don't need to raise an error if no files are found when updating time ranges --- esmvalcore/dataset.py | 5 +++-- tests/unit/test_dataset.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index f147e54922..8ebee6aac8 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -1112,8 +1112,9 @@ def _update_timerange(self) -> None: dataset = self.copy() dataset.facets.pop("timerange") 
dataset.supplementaries = [] - check.data_availability(dataset) - if all("timerange" in f.facets for f in dataset.files): + if dataset.files and all( + "timerange" in f.facets for f in dataset.files + ): # "timerange" can only be reliably computed when all DataElements # provide it. intervals = [ diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index b6aa3a8bab..524f966076 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -2333,9 +2333,8 @@ def test_update_timerange_no_files(session, search_data): } dataset = Dataset(**variable) dataset.files = [] - msg = r"Missing data for Dataset: tas, Amon, CMIP6, HadGEM3-GC31-LL.*" - with pytest.raises(InputFilesNotFound, match=msg): - dataset._update_timerange() + dataset._update_timerange() + assert "timerange" not in dataset.facets def test_update_timerange_typeerror(): From cffdeeac1be3ddb2e2e0a58af03cf74a838f7f32 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 15:08:34 +0100 Subject: [PATCH 69/85] Fixed existing tests and add one for data with unavailable years --- esmvalcore/dataset.py | 2 +- tests/unit/test_dataset.py | 95 ++++++++++++++++++++++++++++++++------ 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 8ebee6aac8..f2fd2e7b23 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -325,7 +325,7 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 all_datasets[-1].append((updated_facets, new_ds)) # Only consider those datasets that contain all input variables - # necessary for derivation + # necessary for derivation with the same facets (e.g., skip those where provided timeranges are different) for updated_facets, new_ds in all_datasets[0]: other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] if all(updated_facets in facets for facets in other_facets): diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 524f966076..5a14f38d5f 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1253,6 +1253,18 @@ def rlut_file(tmp_path): return rlut +@pytest.fixture +def rlut_file_future(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc", + ) + rlut.touch() + return rlut + + @pytest.fixture def rlut_file_ground(tmp_path): input_dir = tmp_path / "Tier2" / "SAT" @@ -1332,7 +1344,7 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): expected_input_dataset.session = session assert datasets[0].input_datasets == [expected_input_dataset] - assert expected_input_dataset.files == [lwcre_file] + assert datasets[0].input_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1418,8 +1430,8 @@ def test_from_files_with_derived_no_derivation_glob( strict=True, ): assert dataset.input_datasets == [expected] - assert expected_input_datasets[0].files == [lwcre_file_ground] - assert expected_input_datasets[1].files == [lwcre_file] + assert datasets[0].input_datasets[0].files == [lwcre_file_ground] + assert datasets[1].input_datasets[0].files == [lwcre_file] def test_from_files_with_derived(rlut_file, rlutcs_file, session): @@ -1466,8 +1478,65 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): expected_ds.session = session assert datasets[0].input_datasets == expected_input_datasets - assert expected_input_datasets[0].files == [rlut_file] - 
assert expected_input_datasets[1].files == [rlutcs_file]
+    assert dataset.input_datasets[0].files == [rlut_file]
+    assert dataset.input_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_unavailable_years(
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` with derived variable and unavailable years."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_input_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_input_datasets:
+        expected_ds.session = session
+
+    assert datasets[0].input_datasets == expected_input_datasets
+    assert dataset.input_datasets[0].files == []
+    assert dataset.input_datasets[1].files == []
 
 
 @pytest.mark.parametrize("timerange", ["1980/2000", "*"])
@@ -1528,8 +1597,8 @@ def test_from_files_with_derived_glob(
         expected_ds.session = session
 
     assert datasets[0].input_datasets == expected_input_datasets
-    assert expected_input_datasets[0].files == [rlut_file]
-    assert expected_input_datasets[1].files == [rlutcs_file]
+    assert datasets[0].input_datasets[0].files == [rlut_file]
+    assert datasets[0].input_datasets[1].files == [rlutcs_file]
 
     log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"]
     msg = "Not all necessary input variables to derive 'lwcre' are available"
@@ -1677,8 +1746,8 @@ def test_from_files_with_derived_no_force_derivation_glob(  # noqa: PLR0913
         strict=True,
    ):
         assert dataset.input_datasets == [expected]
-    assert expected_input_datasets[0].files == [lwcre_file_ground]
-    assert expected_input_datasets[1].files == [lwcre_file]
+    assert datasets[0].input_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].input_datasets[0].files == [lwcre_file]
 
 
 def test_from_files_with_derived_force_derivation(
@@ -1742,8 +1811,8 @@ def test_from_files_with_derived_force_derivation(
         expected_ds.session = session
 
     assert datasets[0].input_datasets == expected_input_datasets
-    assert expected_input_datasets[0].files == [rlut_file]
-    assert expected_input_datasets[1].files == [rlutcs_file]
+    assert dataset.input_datasets[0].files == [rlut_file]
+    assert dataset.input_datasets[1].files == [rlutcs_file]
 
 
 @pytest.mark.parametrize("timerange", ["1980/2000", "*"])
@@ -1814,8 +1883,8 @@ def test_from_files_with_derived_force_derivation_glob(  # noqa: PLR0913
         expected_ds.session = session
 
     assert datasets[0].input_datasets == expected_input_datasets
-    assert expected_input_datasets[0].files == [rlut_file]
-    assert expected_input_datasets[1].files == [rlutcs_file]
+    assert datasets[0].input_datasets[0].files == [rlut_file]
+    assert datasets[0].input_datasets[1].files == [rlutcs_file]
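[Note on the `dataset.files and all(...)` guard in the dataset.py hunk of PATCH 69 above: `all()` over an empty iterable is vacuously true, so without the extra check an empty file list would look as if it provided a complete timerange. A minimal self-contained sketch of the same logic; `FakeFile` is a hypothetical stand-in, not ESMValCore's real DataElement class:

from dataclasses import dataclass, field


@dataclass
class FakeFile:
    facets: dict = field(default_factory=dict)  # only the facets mapping matters here


def can_compute_timerange(files: list) -> bool:
    # The non-emptiness check must come first: all() over [] is True.
    return bool(files) and all("timerange" in f.facets for f in files)


assert not can_compute_timerange([])                                  # no files at all
assert not can_compute_timerange([FakeFile()])                        # file without timerange
assert can_compute_timerange([FakeFile({"timerange": "1980/2000"})])  # complete information
]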
log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] msg = "Not all necessary input variables to derive 'lwcre' are available" From dec25bc6b530bc3409eb3d887357840ecd7c6441 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 16:28:00 +0100 Subject: [PATCH 70/85] Use static methods to make sure that original Dataset instance is not overwritten --- esmvalcore/dataset.py | 59 +++++++++++++++++++++----------------- tests/unit/test_dataset.py | 2 +- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index f2fd2e7b23..3b8ef0092b 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -235,12 +235,11 @@ def input_datasets(self) -> list[Dataset]: return self._input_datasets if not self._derivation_necessary(): - input_datasets = [self] + self._input_datasets = [self] else: - input_datasets = self._get_input_datasets() + self._input_datasets = self._get_input_datasets() - self._input_datasets = input_datasets - return input_datasets + return self._input_datasets @staticmethod def _file_to_dataset( @@ -279,10 +278,12 @@ def _file_to_dataset( return new_dataset - def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: C901 """Yield datasets based on the available files. - This function requires that self.facets['mip'] is not a glob pattern. + This function requires that dataset.facets['mip'] is not a glob + pattern. Does take variable derivation into account, i.e., datasets available through variable derivation are returned. @@ -291,41 +292,42 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 datasets_found = False # If no forced derivation is requested, search for datasets based on - # files from self - if not self._is_force_derived(): - for dataset in self._get_available_datasets(self): + # files from dataset + if not dataset._is_force_derived(): + for available_ds in Dataset._get_available_datasets(dataset): datasets_found = True - yield dataset + yield available_ds # For variables that cannot be derived, we are done here - if not self._is_derived(): + if not dataset._is_derived(): return # If forced derivation is requested or no datasets based on files from - # self have been found, search for datasets based on files from input - # datasets - if self._is_force_derived() or not datasets_found: + # dataset have been found, search for datasets based on files from + # input datasets + if dataset._is_force_derived() or not datasets_found: all_datasets: list[list[tuple[dict, Dataset]]] = [] - for input_dataset in self._get_input_datasets(): + for input_dataset in dataset._get_input_datasets(): all_datasets.append([]) - for expanded_ds in self._get_available_datasets( + for expanded_ds in Dataset._get_available_datasets( input_dataset, ): updated_facets = {} - for key, value in self.facets.items(): + for key, value in dataset.facets.items(): if _isglob(value): if key in expanded_ds.facets and not _isglob( expanded_ds[key], ): updated_facets[key] = expanded_ds.facets[key] - new_ds = self.copy() + new_ds = dataset.copy() new_ds.facets.update(updated_facets) - new_ds.supplementaries = self.supplementaries + new_ds.supplementaries = dataset.supplementaries all_datasets[-1].append((updated_facets, new_ds)) - # Only consider those datasets that contain all input variables - # necessary for derivation with the same facets (e.g., skip those where provided timeranges 
are different) + # Only consider those datasets that contain all required variables + # with identical facets (e.g., skip those with different + # timeranges) for updated_facets, new_ds in all_datasets[0]: other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]] if all(updated_facets in facets for facets in other_facets): @@ -334,12 +336,13 @@ def _get_all_available_datasets(self) -> Iterator[Dataset]: # noqa: C901 logger.debug( "Not all necessary input variables to derive '%s' are " "available for %s with facets %s", - self["short_name"], + dataset["short_name"], new_ds.summary(shorten=True), updated_facets, ) - def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: + @staticmethod + def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. @@ -357,13 +360,13 @@ def _get_available_datasets(self, dataset: Dataset) -> Iterator[Dataset]: partially_defined = [] expanded = False for file in dataset_template.files: - new_dataset = self._file_to_dataset(dataset, file) + new_dataset = Dataset._file_to_dataset(dataset, file) # Do not use the timerange facet from the file because there may be # multiple files per dataset. new_dataset.facets.pop("timerange", None) # Restore the original timerange facet if it was specified. - if "timerange" in self.facets: - new_dataset.facets["timerange"] = self.facets["timerange"] + if "timerange" in dataset.facets: + new_dataset.facets["timerange"] = dataset.facets["timerange"] # Filter out identical datasets facetset = frozenset( @@ -455,7 +458,9 @@ def from_files(self) -> Iterator[Dataset]: for mip in mips: dataset_template = self.copy(mip=mip) - for dataset in dataset_template._get_all_available_datasets(): # noqa: SLF001 + for dataset in self._get_all_available_datasets( + dataset_template, + ): dataset._supplementaries_from_files() # noqa: SLF001 expanded = True yield dataset diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 5a14f38d5f..17ecd93535 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1657,7 +1657,7 @@ def test_from_files_with_derived_no_force_derivation( expected_input_dataset.session = session assert datasets[0].input_datasets == [expected_input_dataset] - assert expected_input_dataset.files == [lwcre_file] + assert datasets[0].input_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) From ff0cdd532d577a794e33647db7e5b299b776eb20 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 9 Jan 2026 17:22:29 +0100 Subject: [PATCH 71/85] input_datasets -> required_datasets --- esmvalcore/_recipe/recipe.py | 12 +- esmvalcore/_recipe/to_datasets.py | 23 ++-- esmvalcore/dataset.py | 52 ++++---- esmvalcore/preprocessor/_derive/__init__.py | 6 +- tests/integration/recipe/test_recipe.py | 10 +- tests/unit/recipe/test_to_datasets.py | 8 +- tests/unit/test_dataset.py | 129 +++++++++++--------- 7 files changed, 127 insertions(+), 113 deletions(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index a3b14c99f8..d54d35fba5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -52,7 +52,7 @@ from . 
import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, ) @@ -670,8 +670,8 @@ def _get_preprocessor_products( _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) - input_datasets = _get_input_datasets(dataset) - missing = _check_input_files(input_datasets) + required_datasets = _get_required_datasets(dataset) + missing = _check_input_files(required_datasets) if missing: if _allow_skipping(dataset): logger.info("Skipping: %s", missing) @@ -680,15 +680,15 @@ def _get_preprocessor_products( continue dataset.set_version() USED_DATASETS.append(dataset) - _schedule_for_download(input_datasets) - _log_input_files(input_datasets) + _schedule_for_download(required_datasets) + _log_input_files(required_datasets) logger.info("Found input files for %s", dataset.summary(shorten=True)) filename = _get_preprocessor_filename(dataset) product = PreprocessorFile( filename=filename, attributes=dataset.facets, settings=settings, - datasets=input_datasets, + datasets=required_datasets, ) products.add(product) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index b01e04707a..e992c767f8 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -489,30 +489,33 @@ def _report_unexpanded_globs( return msg -def _get_input_datasets(dataset: Dataset) -> list[Dataset]: - """Determine the input datasets needed for deriving `dataset`.""" +def _get_required_datasets(dataset: Dataset) -> list[Dataset]: + """Determine the datasets required for deriving `dataset`.""" if not dataset._derivation_necessary(): # noqa: SLF001 - return dataset.input_datasets + return dataset.required_datasets # Skip optional datasets if no data is available - input_datasets: list[Dataset] = [] - for input_dataset in dataset.input_datasets: - if input_dataset.facets.get("optional") and not input_dataset.files: + required_datasets: list[Dataset] = [] + for required_dataset in dataset.required_datasets: + if ( + required_dataset.facets.get("optional") + and not required_dataset.files + ): logger.info( "Skipping: no data found for %s which is marked as 'optional'", - input_dataset, + required_dataset, ) else: - input_datasets.append(input_dataset) + required_datasets.append(required_dataset) - return input_datasets + return required_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: """Find representative datasets for all input variables.""" copy = dataset.copy() copy.supplementaries = [] - representative_datasets = _get_input_datasets(copy) + representative_datasets = _get_required_datasets(copy) for representative_dataset in representative_datasets: representative_dataset.supplementaries = dataset.supplementaries return representative_datasets diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 3b8ef0092b..9e03c3ddbf 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -130,7 +130,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[DataElement] | None = None self._used_data_sources: Sequence[DataSource] = [] - self._input_datasets: list[Dataset] = [] + self._required_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -195,29 +195,29 @@ def _derivation_necessary(self) -> bool: 
return not ds_copy.files - def _get_input_datasets(self) -> list[Dataset]: - """Get input datasets.""" - input_datasets: list[Dataset] = [] + def _get_required_datasets(self) -> list[Dataset]: + """Get required datasets for derivation.""" + required_datasets: list[Dataset] = [] required_vars_facets = get_required( self.facets["short_name"], # type: ignore self.facets["project"], # type: ignore ) for required_facets in required_vars_facets: - input_dataset = self._copy(derive=False, force_derivation=False) + required_dataset = self._copy(derive=False, force_derivation=False) keep = {"alias", "recipe_dataset_index", *self.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep + required_dataset.facets = { + k: v for k, v in required_dataset.facets.items() if k in keep } - input_dataset.facets.update(required_facets) - input_dataset.augment_facets() - input_datasets.append(input_dataset) + required_dataset.facets.update(required_facets) + required_dataset.augment_facets() + required_datasets.append(required_dataset) - return input_datasets + return required_datasets @property - def input_datasets(self) -> list[Dataset]: - """Get input datasets. + def required_datasets(self) -> list[Dataset]: + """Get required datasets. For non-derived variables (i.e., those with facet ``derive=False``), this will simply return the dataset itself in a list. @@ -231,15 +231,15 @@ def input_datasets(self) -> list[Dataset]: See also :func:`esmvalcore.preprocessor.derive` for an example usage. """ - if self._input_datasets: - return self._input_datasets + if self._required_datasets is not None: + return self._required_datasets if not self._derivation_necessary(): - self._input_datasets = [self] + self._required_datasets = [self] else: - self._input_datasets = self._get_input_datasets() + self._required_datasets = self._get_required_datasets() - return self._input_datasets + return self._required_datasets @staticmethod def _file_to_dataset( @@ -304,13 +304,13 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: # If forced derivation is requested or no datasets based on files from # dataset have been found, search for datasets based on files from - # input datasets + # required datasets if dataset._is_force_derived() or not datasets_found: all_datasets: list[list[tuple[dict, Dataset]]] = [] - for input_dataset in dataset._get_input_datasets(): + for required_dataset in dataset.required_datasets: all_datasets.append([]) for expanded_ds in Dataset._get_available_datasets( - input_dataset, + required_dataset, ): updated_facets = {} for key, value in dataset.facets.items(): @@ -334,7 +334,7 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: yield new_ds else: logger.debug( - "Not all necessary input variables to derive '%s' are " + "Not all variables required to derive '%s' are " "available for %s with facets %s", dataset["short_name"], new_ds.summary(shorten=True), @@ -429,8 +429,8 @@ def from_files(self) -> Iterator[Dataset]: dataset for those facets listed in :obj:`INHERITED_FACETS`. This also works for :ref:`derived variables `. The - input datasets that are necessary for derivation can be accessed via - :attr:`Dataset.input_datasets`. + datasets required for derivation can be accessed via + :attr:`Dataset.required_datasets`. 
Examples -------- @@ -748,8 +748,8 @@ def _get_version(dataset: Dataset) -> str | list[str]: def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for input_dataset in self.input_datasets: - version = self._get_version(input_dataset) + for required_dataset in self.required_datasets: + version = self._get_version(required_dataset) if version: if isinstance(version, list): versions.update(version) diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 3aa3b74e2c..3817b86bc7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -107,8 +107,8 @@ def derive( Examples -------- - Input variables for derivation can be obtained via - :attr:`esmvalcore.dataset.Dataset.input_datasets`. + Required variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.required_datasets`. For example, to derive the longwave cloud radiative effect (LWCRE) for the model CESM2, you can use: @@ -126,7 +126,7 @@ def derive( ... mip="Amon", ... derive=True, ... ) - >>> cubes = [d.load() for d in dataset.input_datasets] + >>> cubes = [d.load() for d in dataset.required_datasets] >>> cube = derive( ... cubes, ... short_name="lwcre", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 7ebddfdfde..0a8fbc4f79 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -22,7 +22,7 @@ import esmvalcore.io.esgf import esmvalcore.io.local from esmvalcore._recipe.recipe import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, read_recipe_file, ) @@ -2609,7 +2609,7 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", - # Added/changed by Dataset._get_input_datasets() + # Added/changed by Dataset._get_required_datasets() "derive": False, "force_derivation": False, } @@ -2679,7 +2679,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", - # Added/changed by Dataset._get_input_datasets() + # Added/changed by Dataset._get_required_datasets() "derive": False, "force_derivation": False, } @@ -2704,14 +2704,14 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", - # Added/changed by Dataset._get_input_datasets() + # Added/changed by Dataset._get_required_datasets() "derive": False, "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session - alb_derive_input = _get_input_datasets(alb) + alb_derive_input = _get_required_datasets(alb) assert alb_derive_input == [rsdscs, rsuscs] diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 1c279eafa6..443ec9b80a 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -285,7 +285,7 @@ def test_merge_supplementaries_missing_short_name_fails(session): Dataset.from_recipe(recipe_txt, session) -def test_get_input_datasets_derive(session): +def test_get_required_datasets_derive(session): dataset = Dataset( dataset="ERA5", project="native6", @@ -300,7 +300,7 @@ def test_get_input_datasets_derive(session): type="reanaly", version="v1", ) - rlds, rlns = to_datasets._get_input_datasets(dataset) + rlds, rlns = 
to_datasets._get_required_datasets(dataset) assert rlds["short_name"] == "rlds" assert rlds["long_name"] == "Surface Downwelling Longwave Radiation" assert rlds["frequency"] == "1hr" @@ -309,7 +309,7 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" -def test_get_input_datasets_derive_optional(caplog, tmp_path, session): +def test_get_required_datasets_optional(caplog, tmp_path, session): facets = { "project": "OBS6", "dataset": "SAT", @@ -333,7 +333,7 @@ def test_get_input_datasets_derive_optional(caplog, tmp_path, session): dataset.session = session with caplog.at_level(logging.INFO): - datasets = to_datasets._get_input_datasets(dataset) + datasets = to_datasets._get_required_datasets(dataset) expected = Dataset( dataset="SAT", diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 17ecd93535..e8cd1ca67a 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1317,7 +1317,7 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [] - expected_input_dataset = Dataset( + expected_required_dataset = Dataset( **OBS6_SAT_FACETS, short_name="lwcre", derive=True, @@ -1328,7 +1328,7 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): standard_name="", units="W m-2", ) - expected_input_dataset.supplementaries = [ + expected_required_dataset.supplementaries = [ Dataset( **OBS6_SAT_FACETS, short_name="pr", @@ -1341,10 +1341,11 @@ def test_from_files_with_derived_no_derivation(lwcre_file, session): units="kg m-2 s-1", ), ] - expected_input_dataset.session = session + expected_required_dataset.session = session - assert datasets[0].input_datasets == [expected_input_dataset] - assert datasets[0].input_datasets[0].files == [lwcre_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == [expected_required_dataset] + assert required_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1384,7 +1385,7 @@ def test_from_files_with_derived_no_derivation_glob( assert datasets[1].files == [lwcre_file] assert datasets[1].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **{**OBS6_SAT_FACETS, "type": "ground"}, short_name="lwcre", @@ -1408,7 +1409,7 @@ def test_from_files_with_derived_no_derivation_glob( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.supplementaries = [ Dataset( **OBS6_SAT_FACETS, @@ -1426,12 +1427,12 @@ def test_from_files_with_derived_no_derivation_glob( for dataset, expected in zip( datasets, - expected_input_datasets, + expected_required_datasets, strict=True, ): - assert dataset.input_datasets == [expected] - assert datasets[0].input_datasets[0].files == [lwcre_file_ground] - assert datasets[1].input_datasets[0].files == [lwcre_file] + assert dataset.required_datasets == [expected] + assert datasets[0].required_datasets[0].files == [lwcre_file_ground] + assert datasets[1].required_datasets[0].files == [lwcre_file] def test_from_files_with_derived(rlut_file, rlutcs_file, session): @@ -1450,7 +1451,7 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): assert datasets[0].files == [] assert datasets[0].supplementaries[0].files == [] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1474,12 +1475,13 @@ def 
test_from_files_with_derived(rlut_file, rlutcs_file, session): units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert dataset.input_datasets[0].files == [rlut_file] - assert dataset.input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] def test_from_files_with_derived_unavailable_years( @@ -1507,7 +1509,7 @@ def test_from_files_with_derived_unavailable_years( assert datasets == [expected] assert datasets[0].files == [] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **{**OBS6_SAT_FACETS, "timerange": "2010/2015"}, short_name="rlut", @@ -1531,12 +1533,13 @@ def test_from_files_with_derived_unavailable_years( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert dataset.input_datasets[0].files == [] - assert dataset.input_datasets[1].files == [] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1569,7 +1572,7 @@ def test_from_files_with_derived_glob( assert datasets[0].files == [] assert datasets[0].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1593,15 +1596,16 @@ def test_from_files_with_derived_glob( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert datasets[0].input_datasets[0].files == [rlut_file] - assert datasets[0].input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = "Not all necessary input variables to derive 'lwcre' are available" + msg = "Not all variables required to derive 'lwcre' are available" for log_debug in log_debugs: if msg in log_debug: break @@ -1630,7 +1634,7 @@ def test_from_files_with_derived_no_force_derivation( assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [] - expected_input_dataset = Dataset( + expected_required_dataset = Dataset( **OBS6_SAT_FACETS, short_name="lwcre", derive=True, @@ -1641,7 +1645,7 @@ def test_from_files_with_derived_no_force_derivation( standard_name="", units="W m-2", ) - expected_input_dataset.supplementaries = [ + expected_required_dataset.supplementaries = [ Dataset( **OBS6_SAT_FACETS, short_name="pr", @@ -1654,10 +1658,11 @@ def test_from_files_with_derived_no_force_derivation( units="kg m-2 s-1", ), ] - expected_input_dataset.session = session + expected_required_dataset.session = session - assert datasets[0].input_datasets == [expected_input_dataset] - assert datasets[0].input_datasets[0].files == [lwcre_file] 
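[Note on the caching change in this patch: `_required_datasets` now defaults to `None` instead of an empty list (see the `__init__` and property hunks above), so that an empty result can also be cached. A minimal sketch of that sentinel pattern; the class and method names below are illustrative only, not ESMValCore code:

from __future__ import annotations


class Derivable:
    """Illustrative stand-in for the caching behaviour of Dataset."""

    def __init__(self) -> None:
        self._required: list[str] | None = None  # None means "not computed yet"
        self.n_computations = 0

    def _compute_required(self) -> list[str]:
        self.n_computations += 1
        return []  # an empty list is a legitimate, cacheable result

    @property
    def required_datasets(self) -> list[str]:
        if self._required is None:  # sentinel check, not truthiness
            self._required = self._compute_required()
        return self._required


obj = Derivable()
assert obj.required_datasets == []
assert obj.required_datasets == []
assert obj.n_computations == 1  # with `if not self._required:` this would be 2
]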
+ required_datasets = datasets[0].required_datasets + assert required_datasets == [expected_required_dataset] + assert required_datasets[0].files == [lwcre_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1700,7 +1705,7 @@ def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 assert datasets[1].files == [lwcre_file] assert datasets[1].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **{**OBS6_SAT_FACETS, "type": "ground"}, short_name="lwcre", @@ -1724,7 +1729,7 @@ def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.supplementaries = [ Dataset( **OBS6_SAT_FACETS, @@ -1742,12 +1747,12 @@ def test_from_files_with_derived_no_force_derivation_glob( # noqa: PLR0913 for dataset, expected in zip( datasets, - expected_input_datasets, + expected_required_datasets, strict=True, ): - assert dataset.input_datasets == [expected] - assert datasets[0].input_datasets[0].files == [lwcre_file_ground] - assert datasets[1].input_datasets[0].files == [lwcre_file] + assert dataset.required_datasets == [expected] + assert datasets[0].required_datasets[0].files == [lwcre_file_ground] + assert datasets[1].required_datasets[0].files == [lwcre_file] def test_from_files_with_derived_force_derivation( @@ -1781,7 +1786,7 @@ def test_from_files_with_derived_force_derivation( assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1807,12 +1812,13 @@ def test_from_files_with_derived_force_derivation( units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert dataset.input_datasets[0].files == [rlut_file] - assert dataset.input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] @pytest.mark.parametrize("timerange", ["1980/2000", "*"]) @@ -1853,7 +1859,7 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert datasets[0].files == [lwcre_file] assert datasets[0].supplementaries[0].files == [pr_file] - expected_input_datasets = [ + expected_required_datasets = [ Dataset( **OBS6_SAT_FACETS, short_name="rlut", @@ -1879,15 +1885,16 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 units="W m-2", ), ] - for expected_ds in expected_input_datasets: + for expected_ds in expected_required_datasets: expected_ds.session = session - assert datasets[0].input_datasets == expected_input_datasets - assert datasets[0].input_datasets[0].files == [rlut_file] - assert datasets[0].input_datasets[1].files == [rlutcs_file] + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [rlut_file] + assert required_datasets[1].files == [rlutcs_file] log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = "Not all necessary input variables to derive 'lwcre' are available" + msg = "Not all variables required to derive 'lwcre' are available" for 
log_debug in log_debugs:
         if msg in log_debug:
             break
     else:
         pytest.fail(f"No debug message '{msg}'")

From de27a4b4c13c50b2fd7037db4e7d0d25e551730d Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 13 Jan 2026 16:26:49 +0100
Subject: [PATCH 72/85] Use bools for facet values where appropriate

---
 esmvalcore/preprocessor/_derive/siextent.py      | 4 ++--
 tests/unit/preprocessor/_derive/test_siextent.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/esmvalcore/preprocessor/_derive/siextent.py b/esmvalcore/preprocessor/_derive/siextent.py
index 27aee25aec..d0beff2cbe 100644
--- a/esmvalcore/preprocessor/_derive/siextent.py
+++ b/esmvalcore/preprocessor/_derive/siextent.py
@@ -20,8 +20,8 @@ class DerivedVariable(DerivedVariableBase):
     def required(project):  # noqa: ARG004
         """Declare the variables needed for derivation."""
         return [
-            {"short_name": "sic", "optional": "true"},
-            {"short_name": "siconca", "optional": "true"},
+            {"short_name": "sic", "optional": True},
+            {"short_name": "siconca", "optional": True},
         ]
 
     @staticmethod
diff --git a/tests/unit/preprocessor/_derive/test_siextent.py b/tests/unit/preprocessor/_derive/test_siextent.py
index 
ae9f5d1c8f..416c9ac17b 100644 --- a/tests/unit/preprocessor/_derive/test_siextent.py +++ b/tests/unit/preprocessor/_derive/test_siextent.py @@ -113,6 +113,6 @@ def test_siextent_required(): derived_var = siextent.DerivedVariable() output = derived_var.required(None) assert output == [ - {"short_name": "sic", "optional": "true"}, - {"short_name": "siconca", "optional": "true"}, + {"short_name": "sic", "optional": True}, + {"short_name": "siconca", "optional": True}, ] From d8f5d08a690fc5a7692ecb7a4e774fa9b852234e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 13 Jan 2026 16:39:08 +0100 Subject: [PATCH 73/85] Simplify _get_all_available_datasets --- esmvalcore/dataset.py | 91 +++++++++++++++++++++++--------------- tests/unit/test_dataset.py | 16 ------- 2 files changed, 56 insertions(+), 51 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 9e03c3ddbf..870f046fdc 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -279,7 +279,23 @@ def _file_to_dataset( return new_dataset @staticmethod - def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # noqa: C901 + def _get_expanded_globs( + dataset_with_globs: Dataset, + dataset_with_expanded_globs: Dataset, + ) -> tuple[tuple[str, FacetValue], ...]: + """Get facets that have been updated by expanding globs.""" + expanded_globs: dict[str, FacetValue] = {} + for key, value in dataset_with_globs.facets.items(): + if ( + _isglob(value) + and key in dataset_with_expanded_globs.facets + and not _isglob(dataset_with_expanded_globs[key]) + ): + expanded_globs[key] = dataset_with_expanded_globs[key] + return tuple(expanded_globs.items()) + + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. 
This function requires that dataset.facets['mip'] is not a glob
@@ -303,43 +319,48 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]:  # noqa: C901
             return
 
         # If forced derivation is requested or no datasets based on files from
-        # dataset have been found, search for datasets based on files from
+        # dataset have been found, search for datasets based on files from the
         # required datasets
         if dataset._is_force_derived() or not datasets_found:
-            all_datasets: list[list[tuple[dict, Dataset]]] = []
+            # Record all expanded globs from first non-optional required
+            # dataset (called "reference_dataset" hereafter)
+            non_optional_datasets = [
+                d
+                for d in dataset.required_datasets
+                if not d.facets.get("optional", False)
+            ]
+            if not non_optional_datasets:
+                msg = (
+                    f"Unable to retrieve available datasets for derived "
+                    f"variable '{dataset.facets['short_name']}', all "
+                    f"variables required for derivation are marked as "
+                    f"'optional'"
+                )
+                raise ValueError(msg)
+            reference_dataset = non_optional_datasets[0]
+            reference_expanded_globs = {
+                Dataset._get_expanded_globs(dataset, ds)
+                for ds in Dataset._get_available_datasets(reference_dataset)
+            }
+
+            # Iterate through all other required datasets and only keep those
+            # expanded globs which are present for all other non-optional
+            # required datasets
             for required_dataset in dataset.required_datasets:
-                all_datasets.append([])
-                for expanded_ds in Dataset._get_available_datasets(
-                    required_dataset,
-                ):
-                    updated_facets = {}
-                    for key, value in dataset.facets.items():
-                        if _isglob(value):
-                            if key in expanded_ds.facets and not _isglob(
-                                expanded_ds[key],
-                            ):
-                                updated_facets[key] = expanded_ds.facets[key]
-                    new_ds = dataset.copy()
-                    new_ds.facets.update(updated_facets)
-                    new_ds.supplementaries = dataset.supplementaries
-
-                    all_datasets[-1].append((updated_facets, new_ds))
-
-            # Only consider those datasets that contain all required variables
-            # with identical facets (e.g., skip those with different
-            # timeranges)
-            for updated_facets, new_ds in all_datasets[0]:
-                other_facets = [[d[0] for d in ds] for ds in all_datasets[1:]]
-                if all(updated_facets in facets for facets in other_facets):
-                    yield new_ds
-                else:
-                    logger.debug(
-                        "Not all variables required to derive '%s' are "
-                        "available for %s with facets %s",
-                        dataset["short_name"],
-                        new_ds.summary(shorten=True),
-                        updated_facets,
-                    )
+                if required_dataset is reference_dataset:
+                    continue
+                new_expanded_globs = {
+                    Dataset._get_expanded_globs(dataset, ds)
+                    for ds in Dataset._get_available_datasets(required_dataset)
+                }
+                reference_expanded_globs &= new_expanded_globs
+
+            # Use the final expanded globs to create new datasets
+            for expanded_globs in reference_expanded_globs:
+                new_ds = dataset.copy()
+                new_ds.facets.update(expanded_globs)
+                new_ds.supplementaries = dataset.supplementaries
+                yield new_ds
 
     @staticmethod
     def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]:
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index e8cd1ca67a..a44b80b7f3 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -1604,14 +1604,6 @@ def test_from_files_with_derived_glob(
     assert required_datasets[0].files == [rlut_file]
     assert required_datasets[1].files == [rlutcs_file]
 
-    log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"]
-    msg = "Not all variables required to derive 'lwcre' are available"
-    for log_debug in log_debugs:
-        if msg in log_debug:
-            break
-    else:
-        pytest.fail(f"No debug message '{msg}'")
-
 
 
 def 
test_from_files_with_derived_no_force_derivation( lwcre_file, @@ -1893,14 +1885,6 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert required_datasets[0].files == [rlut_file] assert required_datasets[1].files == [rlutcs_file] - log_debugs = [r.message for r in caplog.records if r.levelname == "DEBUG"] - msg = "Not all variables required to derive 'lwcre' are available" - for log_debug in log_debugs: - if msg in log_debug: - break - else: - pytest.fail(f"No debug message '{msg}'") - def test_match(): dataset1 = Dataset( From 096248983b44d7a950ed783d81ab071d88e7a671 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 13 Jan 2026 21:44:25 +0100 Subject: [PATCH 74/85] Simplify _get_all_available_datasets --- esmvalcore/_recipe/to_datasets.py | 15 ++--- esmvalcore/dataset.py | 84 +++++++++++---------------- tests/unit/recipe/test_to_datasets.py | 2 +- 3 files changed, 38 insertions(+), 63 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index e992c767f8..94a38afdbf 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -419,16 +419,13 @@ def _dataset_from_files(dataset: Dataset) -> list[Dataset]: dataset.summary(shorten=True), ) + # All the magic happens in Dataset.from_files. Here, we simply check if any + # wildcards have not been expanded and raise proper errors if necessary. for expanded_ds in dataset.from_files(): - updated_facets = {} unexpanded_globs = {} for key, value in dataset.facets.items(): if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: + if key not in expanded_ds.facets or _isglob(expanded_ds[key]): unexpanded_globs[key] = value if unexpanded_globs: @@ -440,11 +437,7 @@ def _dataset_from_files(dataset: Dataset) -> list[Dataset]: errors.append(msg) continue - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries - - result.append(new_ds) + result.append(expanded_ds) if errors: raise RecipeError("\n".join(errors)) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 870f046fdc..50ac2defab 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -183,7 +183,7 @@ def _derivation_necessary(self) -> bool: if self._is_force_derived(): return True - # Otherwise, derivation is necessary of no files for the self dataset + # Otherwise, derivation is necessary if no files for the self dataset # are found ds_copy = self.copy() ds_copy.supplementaries = [] @@ -305,62 +305,44 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: through variable derivation are returned. 
""" - datasets_found = False + if not dataset._derivation_necessary(): + yield from Dataset._get_available_datasets(dataset) + return - # If no forced derivation is requested, search for datasets based on - # files from dataset - if not dataset._is_force_derived(): - for available_ds in Dataset._get_available_datasets(dataset): - datasets_found = True - yield available_ds + # Since we are in full control of the derived variables (the module is + # private; no custom derivation functions are possible), we can be sure + # that the following list is never empty + non_optional_datasets = [ + d + for d in dataset.required_datasets + if not d.facets.get("optional", False) + ] - # For variables that cannot be derived, we are done here - if not dataset._is_derived(): - return + # Record all expanded globs from first non-optional required dataset + # (called "reference_dataset" hereafter) + reference_dataset = non_optional_datasets[0] + reference_expanded_globs = { + Dataset._get_expanded_globs(dataset, ds) + for ds in Dataset._get_available_datasets(reference_dataset) + } - # If forced derivation is requested or no datasets based on files from - # dataset have been found, search for datasets based on files from the - # required datasets - if dataset._is_force_derived() or not datasets_found: - # Record all expanded globs from first non-optional required - # dataset (called "reference_dataset" hereafter) - non_optional_datasets = [ - d - for d in dataset.required_datasets - if not d.facets.get("optional", False) - ] - if not non_optional_datasets: - msg = ( - f"Unable to retrieve available datasets for derived " - f"variable '{dataset.facets['short_name']}', all " - f"variables required for dervation are marked as " - f"'optional'" - ) - raise ValueError(msg) - reference_dataset = non_optional_datasets[0] - reference_expanded_globs = { + # Iterate through all other non-optional required datasets and only + # keep those expanded globs which are present for all other + # non-optional required datasets + for required_dataset in non_optional_datasets: + if required_dataset is reference_dataset: + continue + new_expanded_globs = { Dataset._get_expanded_globs(dataset, ds) - for ds in Dataset._get_available_datasets(reference_dataset) + for ds in Dataset._get_available_datasets(required_dataset) } + reference_expanded_globs &= new_expanded_globs - # Iterate through all other required datasets and only keep those - # expanded globs which are present for all other non-optional - # required datasets - for required_dataset in dataset.required_datasets: - if required_dataset is reference_dataset: - continue - new_expanded_globs = { - Dataset._get_expanded_globs(dataset, ds) - for ds in Dataset._get_available_datasets(required_dataset) - } - reference_expanded_globs &= new_expanded_globs - - # Use the final expanded globs to create new datasets - for expanded_globs in reference_expanded_globs: - new_ds = dataset.copy() - new_ds.facets.update(expanded_globs) - new_ds.supplementaries = dataset.supplementaries - yield new_ds + # Use the final expanded globs to create new dataset(s) + for expanded_globs in reference_expanded_globs: + new_ds = dataset.copy() + new_ds.facets.update(expanded_globs) + yield new_ds @staticmethod def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]: diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 443ec9b80a..d07b4a583d 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -344,7 
+344,7 @@ def test_get_required_datasets_optional(caplog, tmp_path, session):
         frequency="mon",
         long_name="Sea-Ice Area Percentage (Atmospheric Grid)",
         modeling_realm=["seaIce"],
-        optional="true",
+        optional=True,
         original_short_name="siconca",
         standard_name="sea_ice_area_fraction",
         tier=2,

From 81da6e7fa06152c486722047f7324c3248ad228e Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Wed, 14 Jan 2026 09:47:37 +0100
Subject: [PATCH 75/85] Using wildcards for derived variables with only
 optional required variables is not possible

---
 esmvalcore/dataset.py                       |  7 ++
 esmvalcore/preprocessor/_derive/amoc.py     |  4 +-
 esmvalcore/preprocessor/_derive/siextent.py |  4 +-
 tests/unit/test_dataset.py                  | 95 +++++++++++++++++++--
 4 files changed, 97 insertions(+), 13 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 50ac2defab..d722e87ae9 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -317,6 +317,13 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]:
             for d in dataset.required_datasets
             if not d.facets.get("optional", False)
         ]
+        if not non_optional_datasets:
+            msg = (
+                f"Using wildcards to derive {dataset.summary(shorten=True)} "
+                f"is not possible since the derivation function only "
+                f"requires optional variables"
+            )
+            raise RecipeError(msg)
 
         # Record all expanded globs from first non-optional required dataset
         # (called "reference_dataset" hereafter)
diff --git a/esmvalcore/preprocessor/_derive/amoc.py b/esmvalcore/preprocessor/_derive/amoc.py
index 3607aa1d62..67b179f0dd 100644
--- a/esmvalcore/preprocessor/_derive/amoc.py
+++ b/esmvalcore/preprocessor/_derive/amoc.py
@@ -72,9 +72,7 @@ def calculate(cubes):
                 f"Amoc calculation: {cube_orig} doesn't contain"
                 f" atlantic_arctic_ocean."
             )
-            raise ValueError(
-                msg,
-            )
+            raise ValueError(msg)
 
     # 2: Remove the shallowest 500m to avoid wind driven mixed layer.
     depth_constraint = iris.Constraint(depth=lambda d: d >= 500.0)
diff --git a/esmvalcore/preprocessor/_derive/siextent.py b/esmvalcore/preprocessor/_derive/siextent.py
index d0beff2cbe..5bd2ca82f1 100644
--- a/esmvalcore/preprocessor/_derive/siextent.py
+++ b/esmvalcore/preprocessor/_derive/siextent.py
@@ -53,9 +53,7 @@ def calculate(cubes):
                 "Derivation of siextent failed due to missing variables "
                 "sic and siconca."
            
) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc ones = da.ones_like(sic) siextent_data = da.ma.masked_where(sic.lazy_data() < 15.0, ones) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index a44b80b7f3..f27307e7d0 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,6 +1,5 @@ from __future__ import annotations -import logging import textwrap from collections import defaultdict from pathlib import Path @@ -1301,6 +1300,18 @@ def pr_file(tmp_path): return pr +@pytest.fixture +def siconca_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + rlut.touch() + return rlut + + def test_from_files_with_derived_no_derivation(lwcre_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) @@ -1550,7 +1561,6 @@ def test_from_files_with_derived_glob( rlutcs_file, pr_file, session, - caplog, ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( @@ -1561,8 +1571,7 @@ def test_from_files_with_derived_glob( dataset.add_supplementary(short_name="pr") dataset.session = session - with caplog.at_level(logging.DEBUG): - datasets = list(dataset.from_files()) + datasets = list(dataset.from_files()) expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) expected.add_supplementary(short_name="pr") @@ -1823,7 +1832,6 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 rlutcs_file, pr_file, session, - caplog, ): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( @@ -1835,8 +1843,7 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 dataset.add_supplementary(short_name="pr") dataset.session = session - with caplog.at_level(logging.DEBUG): - datasets = list(dataset.from_files()) + datasets = list(dataset.from_files()) expected = Dataset( **OBS6_SAT_FACETS, @@ -1886,6 +1893,80 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert required_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_only_optional(siconca_file, pr_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siextent", + derive=True, + ) + dataset.add_supplementary(short_name="pr", mip="Amon") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siextent", + derive=True, + ) + expected.add_supplementary(short_name="pr", mip="Amon") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="sic", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Ocean Grid)", + modeling_realm=["seaIce"], + original_short_name="siconc", + standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + original_short_name="siconca", + 
standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [siconca_file] + + +def test_from_files_with_derived_only_optional_glob_fail(session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon", "type": "*"}, + short_name="siextent", + derive=True, + ) + dataset.add_supplementary(short_name="pr", mip="Amon") + dataset.session = session + + msg = r"Using wildcards to derive .* is not possible" + with pytest.raises(RecipeError, match=msg): + next(dataset.from_files()) + + def test_match(): dataset1 = Dataset( short_name="areacella", From ade0bced772c87fcce81d9ced4c1e23abf031fdc Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 10:06:12 +0100 Subject: [PATCH 76/85] Explicitly cast tuple[tuple] to dict --- esmvalcore/_recipe/to_datasets.py | 2 +- esmvalcore/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 94a38afdbf..50b3364768 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -419,7 +419,7 @@ def _dataset_from_files(dataset: Dataset) -> list[Dataset]: dataset.summary(shorten=True), ) - # All the magic happens in Dataset.from_files. Here, we simply check if any + # The magic happens in Dataset.from_files. Here, we simply check if any # wildcards have not been expanded and raise proper errors if necessary. for expanded_ds in dataset.from_files(): unexpanded_globs = {} diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index d722e87ae9..3413c6601f 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -348,7 +348,7 @@ def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: # Use the final expanded globs to create new dataset(s) for expanded_globs in reference_expanded_globs: new_ds = dataset.copy() - new_ds.facets.update(expanded_globs) + new_ds.facets.update(dict(expanded_globs)) yield new_ds @staticmethod From d725654c33ad0a94c5f5cc703a14d28af7647b15 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 10:31:49 +0100 Subject: [PATCH 77/85] Do not return any files for required variables if no facets match at all --- esmvalcore/dataset.py | 61 ++++++++++++++++++++++---------------- tests/unit/test_dataset.py | 57 +++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 25 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 3413c6601f..44bff90bf3 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -451,35 +451,46 @@ def from_files(self) -> Iterator[Dataset]: Dataset Datasets representing the available files. 
""" - expanded = False - if any(_isglob(v) for v in self.facets.values()): - if _isglob(self.facets["mip"]): - available_mips = _get_mips( - self.facets["project"], # type: ignore - self.facets["short_name"], # type: ignore - ) - mips = [ - mip - for mip in available_mips - if _ismatch(mip, self.facets["mip"]) - ] - else: - mips = [self.facets["mip"]] # type: ignore + # No wildcards present -> simply return self with expanded + # supplementaries + if not any(_isglob(v) for v in self.facets.values()): + self._supplementaries_from_files() + yield self + return - for mip in mips: - dataset_template = self.copy(mip=mip) - for dataset in self._get_all_available_datasets( - dataset_template, - ): - dataset._supplementaries_from_files() # noqa: SLF001 - expanded = True - yield dataset + # Wildcards present -> expand them + expanded = False + if _isglob(self.facets["mip"]): + available_mips = _get_mips( + self.facets["project"], # type: ignore + self.facets["short_name"], # type: ignore + ) + mips = [ + mip + for mip in available_mips + if _ismatch(mip, self.facets["mip"]) + ] + else: + mips = [self.facets["mip"]] # type: ignore + for mip in mips: + dataset_template = self.copy(mip=mip) + for dataset in self._get_all_available_datasets( + dataset_template, + ): + dataset._supplementaries_from_files() # noqa: SLF001 + expanded = True + yield dataset + + # If files were found, or the file facets didn't match the + # specification, yield the original, but do expand any supplementary + # globs. For derived variables, make sure to purge any files found for + # required variables; those won't match in their facets. if not expanded: - # If the definition contains no wildcards, no files were found, - # or the file facets didn't match the specification, yield the - # original, but do expand any supplementary globs. 
self._supplementaries_from_files() + if self._derivation_necessary(): + for required_dataset in self.required_datasets: + required_dataset.files = [] yield self def _supplementaries_from_files(self) -> None: diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index f27307e7d0..ee46759a73 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1614,6 +1614,63 @@ def test_from_files_with_derived_glob( assert required_datasets[1].files == [rlutcs_file] +def test_from_files_with_derived_glob_differing_timerange( + rlut_file_future, + rlutcs_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="lwcre", + derive=True, + ) + expected.session = session + assert datasets == [expected] + assert datasets[0].files == [] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="rlut", + derive=False, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **{**OBS6_SAT_FACETS, "timerange": "*"}, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [] + + def test_from_files_with_derived_no_force_derivation( lwcre_file, rlut_file, From d5234a750160c7cf1a3e6c06b9bdba4b6e78b4bd Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 14:35:11 +0100 Subject: [PATCH 78/85] Add supplementaries to required datasets --- esmvalcore/dataset.py | 2 + tests/unit/test_dataset.py | 80 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 44bff90bf3..951565c817 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -205,6 +205,8 @@ def _get_required_datasets(self) -> list[Dataset]: for required_facets in required_vars_facets: required_dataset = self._copy(derive=False, force_derivation=False) + for supplementary in self.supplementaries: + required_dataset.supplementaries.append(supplementary.copy()) keep = {"alias", "recipe_dataset_index", *self.minimal_facets} required_dataset.facets = { k: v for k, v in required_dataset.facets.items() if k in keep diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index ee46759a73..ee9f125782 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1487,6 +1487,19 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + 
] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1606,6 +1619,19 @@ def test_from_files_with_derived_glob( ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1871,6 +1897,20 @@ def test_from_files_with_derived_force_derivation( ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + force_derivation=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1942,6 +1982,20 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + force_derivation=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2001,6 +2055,19 @@ def test_from_files_with_derived_only_optional(siconca_file, pr_file, session): ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -3140,6 +3207,19 @@ def test_required_datasets_derivation(session): ), ] for expected_dataset in expected_datasets: + expected_dataset.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_dataset.session = dataset.session assert dataset.required_datasets == expected_datasets From 2226ebd65d5ba411cf86e2cf63568f13cb13be7a Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 17:55:10 +0100 Subject: [PATCH 79/85] Add test cases for derived variables with optional variable --- tests/unit/test_dataset.py | 297 +++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index ee9f125782..352ad46e4d 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -16,6 +16,7 @@ from esmvalcore.dataset import Dataset from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.io.esgf import ESGFFile +from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase if TYPE_CHECKING: from esmvalcore.typing import Facets @@ -1643,6 +1644,7 @@ def test_from_files_with_derived_glob( def test_from_files_with_derived_glob_differing_timerange( rlut_file_future, rlutcs_file, + pr_file, session, ): 
"""Test `from_files` with derived variable and supplementary.""" @@ -1651,6 +1653,7 @@ def test_from_files_with_derived_glob_differing_timerange( short_name="lwcre", derive=True, ) + dataset.add_supplementary(short_name="pr") dataset.session = session datasets = list(dataset.from_files()) @@ -1660,6 +1663,7 @@ def test_from_files_with_derived_glob_differing_timerange( short_name="lwcre", derive=True, ) + expected.add_supplementary(short_name="pr", timerange="1980/2000") expected.session = session assert datasets == [expected] assert datasets[0].files == [] @@ -1689,6 +1693,19 @@ def test_from_files_with_derived_glob_differing_timerange( ), ] for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2004,6 +2021,286 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 assert required_datasets[1].files == [rlutcs_file] +class DerivedVariable(DerivedVariableBase): + """Derivation of dummy variable.""" + + @staticmethod + def required(project): + """Declare the variables needed for derivation.""" + return [ + {"short_name": "rlut", "optional": True}, + {"short_name": "rlutcs"}, + {"short_name": "pr"}, + ] + + +def test_from_files_with_derived_optional( + monkeypatch, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + monkeypatch.setattr( + esmvalcore.preprocessor._derive, + "ALL_DERIVED_VARIABLES", + {"tas": DerivedVariable}, + ) + dataset = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + derive=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + optional=True, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == 
expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [rlutcs_file] + assert required_datasets[2].files == [pr_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_glob_optional( + timerange, + monkeypatch, + rlutcs_file, + pr_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + monkeypatch.setattr( + esmvalcore.preprocessor._derive, + "ALL_DERIVED_VARIABLES", + {"tas": DerivedVariable}, + ) + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="tas", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **OBS6_SAT_FACETS, + short_name="tas", + derive=True, + ) + expected.add_supplementary(short_name="pr") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="rlut", + derive=False, + optional=True, + frequency="mon", + long_name="TOA Outgoing Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.supplementaries = [ + Dataset( + **OBS6_SAT_FACETS, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [rlutcs_file] + assert required_datasets[2].files == [pr_file] + + +@pytest.mark.parametrize("timerange", ["1980/2000", "*"]) +def test_from_files_with_derived_glob_optional_missing( + timerange, + monkeypatch, + rlut_file, + session, +): + """Test `from_files` with derived variable and supplementary.""" + monkeypatch.setattr( + esmvalcore.preprocessor._derive, + "ALL_DERIVED_VARIABLES", + {"tas": DerivedVariable}, + ) + dataset = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="tas", + derive=True, + ) + dataset.add_supplementary(short_name="pr") + dataset.session = session + + datasets = list(dataset.from_files()) + + expected = Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="tas", + derive=True, + ) + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="rlut", + derive=False, + optional=True, + frequency="mon", + long_name="TOA Outgoing Longwave 
Radiation", + modeling_realm=["atmos"], + original_short_name="rlut", + standard_name="toa_outgoing_longwave_flux", + units="W m-2", + ), + Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="rlutcs", + derive=False, + frequency="mon", + long_name="TOA Outgoing Clear-Sky Longwave Radiation", + modeling_realm=["atmos"], + original_short_name="rlutcs", + standard_name="toa_outgoing_longwave_flux_assuming_clear_sky", + units="W m-2", + ), + Dataset( + **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange}, + short_name="pr", + derive=False, + frequency="mon", + long_name="Precipitation", + modeling_realm=["atmos"], + original_short_name="pr", + standard_name="precipitation_flux", + units="kg m-2 s-1", + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [] + assert required_datasets[2].files == [] + + def test_from_files_with_derived_only_optional(siconca_file, pr_file, session): """Test `from_files` with derived variable and supplementary.""" dataset = Dataset( From bb72b6ac049092286d8b91c6ecf193f69780f0e3 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Wed, 14 Jan 2026 22:03:30 +0100 Subject: [PATCH 80/85] Update esmvalcore/dataset.py Co-authored-by: Bouwe Andela --- esmvalcore/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 951565c817..0266850fcf 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -221,7 +221,7 @@ def _get_required_datasets(self) -> list[Dataset]: def required_datasets(self) -> list[Dataset]: """Get required datasets. - For non-derived variables (i.e., those with facet ``derive=False``), + For non-derived variables (i.e., those without a ``derive`` facet or with facet ``derive=False``), this will simply return the dataset itself in a list. For derived variables (i.e., those with facet ``derive=True``), this From 8dd2fdf5823672569359f8d555b0b9d9b533ed49 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 14 Jan 2026 22:04:38 +0100 Subject: [PATCH 81/85] Fix indentation --- esmvalcore/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 0266850fcf..5555f77c70 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -221,8 +221,9 @@ def _get_required_datasets(self) -> list[Dataset]: def required_datasets(self) -> list[Dataset]: """Get required datasets. - For non-derived variables (i.e., those without a ``derive`` facet or with facet ``derive=False``), - this will simply return the dataset itself in a list. + For non-derived variables (i.e., those without a ``derive`` facet or + with facet ``derive=False``), this will simply return the dataset + itself in a list. 
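+
+        A hypothetical sketch of the non-derived case (facets shortened for
+        illustration):
+
+        >>> tas = Dataset(short_name="tas", mip="Amon", project="CMIP6")
+        >>> tas.required_datasets  # doctest: +SKIP
+        [Dataset: {'short_name': 'tas', 'mip': 'Amon', 'project': 'CMIP6'}]
+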
For derived variables (i.e., those with facet ``derive=True``), this
         will return the datasets required for derivation if derivation is

From 9b28c0f1a6591f6ce54c7dbad5c12842d55bbfc2 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Fri, 16 Jan 2026 22:14:54 +0100
Subject: [PATCH 82/85] First update of notebook

---
 notebooks/discovering-data.ipynb | 180 ++++++++++++++++++------------
 1 file changed, 107 insertions(+), 73 deletions(-)

diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb
index 581e8ca249..e43df8b34e 100644
--- a/notebooks/discovering-data.ipynb
+++ b/notebooks/discovering-data.ipynb
@@ -13,14 +13,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "f0ccfe7f-c535-4606-99ce-be24960aece1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "ERROR 1: PROJ: proj_create_from_database: Open of /home/manuel/micromamba/envs/esm/share/proj failed\n"
+     ]
+    }
+   ],
    "source": [
     "from esmvalcore.config import CFG\n",
-    "from esmvalcore.dataset import Dataset\n",
-    "from esmvalcore.esgf import download"
+    "from esmvalcore.dataset import Dataset"
    ]
   },
   {
@@ -39,7 +46,32 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "CFG[\"search_esgf\"] = \"always\""
+    "CFG[\"search_data\"] = \"complete\"\n",
+    "CFG[\"projects\"].pop(\"CMIP6\", None)  # Clear existing CMIP6 configuration\n",
+    "CFG.nested_update(\n",
+    "    {\n",
+    "        \"projects\": {\n",
+    "            \"CMIP6\": {\n",
+    "                \"data\": {\n",
+    "                    \"intake-esgf\": {\n",
+    "                        \"type\": \"esmvalcore.io.intake_esgf.IntakeESGFDataSource\",\n",
+    "                        \"priority\": 2,\n",
+    "                        \"facets\": {\n",
+    "                            \"activity\": \"activity_drs\",\n",
+    "                            \"dataset\": \"source_id\",\n",
+    "                            \"ensemble\": \"member_id\",\n",
+    "                            \"exp\": \"experiment_id\",\n",
+    "                            \"institute\": \"institution_id\",\n",
+    "                            \"grid\": \"grid_label\",\n",
+    "                            \"mip\": \"table_id\",\n",
+    "                            \"project\": \"project\",\n",
+    "                            \"short_name\": \"variable_id\",\n",
+    "                        },\n",
+    "                    },\n",
+    "                },\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    ")"
    ]
   },
   {
@@ -89,7 +121,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Found 727 datasets, showing the first 10:\n"
+      "Found 906 datasets, showing the first 10:\n"
     ]
    },
    {
@@ -168,20 +200,20 @@
     " 'grid': 'gn',\n",
     " 'institute': 'AWI'},\n",
     " Dataset:\n",
-    " {'dataset': 'AWI-ESM-1-REcoM',\n",
+    " {'dataset': 'BCC-CSM2-MR',\n",
     " 'project': 'CMIP6',\n",
     " 'mip': 'Amon',\n",
     " 'short_name': 'tas',\n",
     " 'ensemble': 'r1i1p1f1',\n",
     " 'exp': 'historical',\n",
     " 'grid': 'gn',\n",
-    " 'institute': 'AWI'},\n",
+    " 'institute': 'BCC'},\n",
     " Dataset:\n",
     " {'dataset': 'BCC-CSM2-MR',\n",
     " 'project': 'CMIP6',\n",
     " 'mip': 'Amon',\n",
     " 'short_name': 'tas',\n",
-    " 'ensemble': 'r1i1p1f1',\n",
+    " 'ensemble': 'r2i1p1f1',\n",
     " 'exp': 'historical',\n",
     " 'grid': 'gn',\n",
     " 'institute': 'BCC'}]"
@@ -253,7 +285,7 @@
    {
     "data": {
      "text/plain": [
-      "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]"
+      "[IntakeESGFDataset(name='CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn')]"
      ]
     },
     "execution_count": 6,
@@ -270,7 +302,7 @@
    "id": "60d88a34-c886-4b9d-a9e9-a9d18fa97917",
    "metadata": {},
    "source": [
-    "A single file can be downloaded using its `download` method:"
+    "Load a single file as `iris.cube.CubeList`:"
    ]
   },
   {
@@ -280,37 +312,44 @@
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       
"LocalFile('/home/manuel/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", + " self.close()\n", + " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", + " self.disp(bar_style='danger', check_delay=False)\n", + "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", + "/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:712: UserWarning: We could not download your entire catalog, missed={'CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn'}\n", + " warnings.warn(f\"We could not download your entire catalog, {missed=}\")\n" + ] + }, + { + "ename": "DatasetLoadError", + "evalue": "We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. 
The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mDatasetLoadError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m cubes = \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_iris\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:163\u001b[39m, in \u001b[36mIntakeESGFDataset.to_iris\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mto_iris\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> iris.cube.CubeList:\n\u001b[32m 156\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load the data as Iris cubes.\u001b[39;00m\n\u001b[32m 157\u001b[39m \n\u001b[32m 158\u001b[39m \u001b[33;03m Returns\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 161\u001b[39m \u001b[33;03m The loaded data.\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m163\u001b[39m files = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcatalog\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 164\u001b[39m \u001b[43m \u001b[49m\u001b[43mminimal_keys\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 165\u001b[39m \u001b[43m \u001b[49m\u001b[43mquiet\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 166\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 167\u001b[39m dataset = \u001b[38;5;28mself\u001b[39m.catalog.to_dataset_dict(\n\u001b[32m 168\u001b[39m minimal_keys=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 169\u001b[39m add_measures=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 170\u001b[39m quiet=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m 171\u001b[39m )[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 172\u001b[39m \u001b[38;5;66;03m# Store the local paths in the attributes for easier debugging.\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:96\u001b[39m, in \u001b[36m_CachingCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 94\u001b[39m key = \u001b[38;5;28mtuple\u001b[39m((k, v) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m kwargs.items() \u001b[38;5;28;01mif\u001b[39;00m k != \u001b[33m\"\u001b[39m\u001b[33mquiet\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result:\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28mself\u001b[39m._result[key] = 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result[key]\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:714\u001b[39m, in \u001b[36mESGFCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 712\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mWe could not download your entire catalog, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmissed\u001b[38;5;132;01m=}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 713\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mbreak_on_error\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m--> \u001b[39m\u001b[32m714\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetLoadError(\n\u001b[32m 715\u001b[39m \u001b[38;5;28mlist\u001b[39m(missed),\n\u001b[32m 716\u001b[39m \u001b[38;5;28mself\u001b[39m.logger.read()\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mprint_log_on_error\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 719\u001b[39m )\n\u001b[32m 721\u001b[39m \u001b[38;5;66;03m# optionally simplify the keys\u001b[39;00m\n\u001b[32m 722\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m minimal_keys:\n", + "\u001b[31mDatasetLoadError\u001b[39m: We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'." + ] } ], "source": [ - "dataset.files[0].download(CFG[\"download_dir\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3821b594-3797-497b-a51d-1798d5b2fc80", - "metadata": {}, - "source": [ - "For downloading many files, the [esmvalcore.esgf.download](https://docs.esmvaltool.org/projects/esmvalcore/en/latest/api/esmvalcore.esgf.html#esmvalcore.esgf.download) function is recommended because it will download the files in parallel. The ESMValCore will try to guess the fastest host and download from there. If it is not available for some reason, it will automatically fall back to the next host." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9676ff81-232e-4ff8-b784-686f0d06c469", - "metadata": {}, - "outputs": [], - "source": [ - "download(dataset.files, CFG[\"download_dir\"])" + "cubes = dataset.files[0].to_iris()" ] }, { @@ -323,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "b75314e3", "metadata": {}, "outputs": [], @@ -344,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "b87c247f", "metadata": {}, "outputs": [ @@ -352,14 +391,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 36 datasets, showing the first 10:\n" + "Found 38 datasets, showing the first 10:\n" ] }, { "data": { "text/plain": [ "[Dataset:\n", - " {'dataset': 'TaiESM1',\n", + " {'dataset': 'SAM0-UNICON',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -368,9 +407,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AS-RCEC'},\n", + " 'institute': 'SNU'},\n", " Dataset:\n", - " {'dataset': 'AWI-CM-1-1-MR',\n", + " {'dataset': 'AWI-ESM-1-1-LR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -381,7 +420,7 @@ " 'grid': 'gn',\n", " 'institute': 'AWI'},\n", " Dataset:\n", - " {'dataset': 'AWI-ESM-1-1-LR',\n", + " {'dataset': 'CMCC-CM2-HR4',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -390,9 +429,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AWI'},\n", + " 'institute': 'CMCC'},\n", " Dataset:\n", - " {'dataset': 'AWI-ESM-1-REcoM',\n", + " {'dataset': 'CESM2-WACCM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -401,9 +440,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AWI'},\n", + " 'institute': 'NCAR'},\n", " Dataset:\n", - " {'dataset': 'BCC-CSM2-MR',\n", + " {'dataset': 'BCC-ESM1',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -414,7 +453,7 @@ " 'grid': 'gn',\n", " 'institute': 'BCC'},\n", " Dataset:\n", - " {'dataset': 'BCC-ESM1',\n", + " {'dataset': 'GISS-E2-1-H',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -423,9 +462,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'BCC'},\n", + " 'institute': 'NASA-GISS'},\n", " Dataset:\n", - " {'dataset': 'CAMS-CSM1-0',\n", + " {'dataset': 'MRI-ESM2-0',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -434,9 +473,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAMS'},\n", + " 'institute': 'MRI'},\n", " Dataset:\n", - " {'dataset': 'CAS-ESM2-0',\n", + " {'dataset': 'TaiESM1',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -445,9 +484,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAS'},\n", + " 'institute': 'AS-RCEC'},\n", " Dataset:\n", - " {'dataset': 'FGOALS-g3',\n", + " {'dataset': 'CAMS-CSM1-0',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -456,9 +495,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAS'},\n", + " 'institute': 'CAMS'},\n", " Dataset:\n", - " {'dataset': 'IITM-ESM',\n", + " {'dataset': 'MPI-ESM-1-2-HAM',\n", " 'project': 'CMIP6',\n", " 
'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -467,10 +506,10 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CCCR-IITM'}]" + " 'institute': 'HAMMOZ-Consortium'}]" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -499,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "c5edfa65", "metadata": {}, "outputs": [ @@ -509,7 +548,7 @@ "[]" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -530,14 +569,14 @@ "output_type": "stream", "text": [ "rlut\n", - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlut/gn/v20200623/rlut_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf3.dkrz.de']]\n", + "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), 
LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlut.gn')]\n", "rlutcs\n", - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/rlutcs/gn/v20200623/rlutcs_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de']]\n" + "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" ] } ], "source": [ - "for d in dataset.input_datasets:\n", + "for d in dataset.required_datasets:\n", " print(d[\"short_name\"])\n", " print(d.files)" ] @@ -545,7 +584,7 @@ ], "metadata": { "kernelspec": { - 
"display_name": "esm", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -559,12 +598,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "17e81e49408864327be43d3caebcb8eca32ff92a01becb15aa27be73c37f0517" - } + "version": "3.13.11" } }, "nbformat": 4, From 6d8ba227d205fdcfd0538e1c93e973593ab490d4 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 19 Jan 2026 10:46:25 +0100 Subject: [PATCH 83/85] Update example notebook --- notebooks/discovering-data.ipynb | 498 ++++++++++++++++++++++++++----- 1 file changed, 430 insertions(+), 68 deletions(-) diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index e43df8b34e..10676a8b72 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,18 +13,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR 1: PROJ: proj_create_from_database: Open of /home/manuel/micromamba/envs/esm/share/proj failed\n" - ] - } - ], + "outputs": [], "source": [ "from esmvalcore.config import CFG\n", "from esmvalcore.dataset import Dataset" @@ -47,6 +39,7 @@ "outputs": [], "source": [ "CFG[\"search_data\"] = \"complete\"\n", + "CFG[\"projects\"].pop(\"CMIP6\", None) # Clear existing CMIP6 configuration\n", "CFG.nested_update(\n", " {\n", " \"projects\": {\n", @@ -312,44 +305,413 @@ "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Exception ignored in: \n", - "Traceback (most recent call last):\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", - " self.close()\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", - " self.disp(bar_style='danger', check_delay=False)\n", - "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", - "Exception ignored in: \n", - "Traceback (most recent call last):\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/std.py\", line 1148, in __del__\n", - " self.close()\n", - " File \"/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/tqdm/notebook.py\", line 279, in close\n", - " self.disp(bar_style='danger', check_delay=False)\n", - "AttributeError: 'tqdm_notebook' object has no attribute 'disp'\n", - "/home/manuel/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:712: UserWarning: We could not download your entire catalog, missed={'CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn'}\n", - " warnings.warn(f\"We could not download your entire catalog, {missed=}\")\n" - ] - }, - { - "ename": "DatasetLoadError", - "evalue": "We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. 
The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'.", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mDatasetLoadError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m cubes = \u001b[43mdataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_iris\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:163\u001b[39m, in \u001b[36mIntakeESGFDataset.to_iris\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mto_iris\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> iris.cube.CubeList:\n\u001b[32m 156\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load the data as Iris cubes.\u001b[39;00m\n\u001b[32m 157\u001b[39m \n\u001b[32m 158\u001b[39m \u001b[33;03m Returns\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 161\u001b[39m \u001b[33;03m The loaded data.\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m163\u001b[39m files = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcatalog\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 164\u001b[39m \u001b[43m \u001b[49m\u001b[43mminimal_keys\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 165\u001b[39m \u001b[43m \u001b[49m\u001b[43mquiet\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 166\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 167\u001b[39m dataset = \u001b[38;5;28mself\u001b[39m.catalog.to_dataset_dict(\n\u001b[32m 168\u001b[39m minimal_keys=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 169\u001b[39m add_measures=\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[32m 170\u001b[39m quiet=\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[32m 171\u001b[39m )[\u001b[38;5;28mself\u001b[39m.name]\n\u001b[32m 172\u001b[39m \u001b[38;5;66;03m# Store the local paths in the attributes for easier debugging.\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/ESMValCore/esmvalcore/io/intake_esgf.py:96\u001b[39m, in \u001b[36m_CachingCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 94\u001b[39m key = \u001b[38;5;28mtuple\u001b[39m((k, v) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m kwargs.items() \u001b[38;5;28;01mif\u001b[39;00m k != \u001b[33m\"\u001b[39m\u001b[33mquiet\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result:\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28mself\u001b[39m._result[key] = 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_path_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._result[key]\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/micromamba/envs/esm/lib/python3.13/site-packages/intake_esgf/catalog.py:714\u001b[39m, in \u001b[36mESGFCatalog.to_path_dict\u001b[39m\u001b[34m(self, prefer_streaming, globus_endpoint, globus_path, minimal_keys, ignore_facets, separator, quiet)\u001b[39m\n\u001b[32m 712\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mWe could not download your entire catalog, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmissed\u001b[38;5;132;01m=}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 713\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mbreak_on_error\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m--> \u001b[39m\u001b[32m714\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetLoadError(\n\u001b[32m 715\u001b[39m \u001b[38;5;28mlist\u001b[39m(missed),\n\u001b[32m 716\u001b[39m \u001b[38;5;28mself\u001b[39m.logger.read()\n\u001b[32m 717\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m intake_esgf.conf[\u001b[33m\"\u001b[39m\u001b[33mprint_log_on_error\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 718\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 719\u001b[39m )\n\u001b[32m 721\u001b[39m \u001b[38;5;66;03m# optionally simplify the keys\u001b[39;00m\n\u001b[32m 722\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m minimal_keys:\n", - "\u001b[31mDatasetLoadError\u001b[39m: We were unable to load data for these keys:\n- CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn\nThis could be for a few reasons:\n- We failed to find file access information for these datasets.\n- All the access links we found failed. The data nodes may be offline.\nFor more information, consult the session log 'print(cat.session_log())'." - ] + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n",
+       "<p><em>[iris HTML summary of the loaded cube: Air Temperature / K, shape (time: 1980; latitude: 192; longitude: 288), scalar coordinate height 2.0 m, cell method 'area: time: mean'; the full CMIP6 attribute table is omitted here]</em></p>\n",
+       "
\n", + " \n", + " " + ], + "text/plain": [ + "[]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "cubes = dataset.files[0].to_iris()" + "cubes = dataset.files[0].to_iris()\n", + "cubes" ] }, { @@ -391,14 +753,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 38 datasets, showing the first 10:\n" + "Found 37 datasets, showing the first 10:\n" ] }, { "data": { "text/plain": [ "[Dataset:\n", - " {'dataset': 'SAM0-UNICON',\n", + " {'dataset': 'GISS-E2-2-G',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -407,9 +769,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'SNU'},\n", + " 'institute': 'NASA-GISS'},\n", " Dataset:\n", - " {'dataset': 'AWI-ESM-1-1-LR',\n", + " {'dataset': 'FGOALS-g3',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -418,9 +780,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AWI'},\n", + " 'institute': 'CAS'},\n", " Dataset:\n", - " {'dataset': 'CMCC-CM2-HR4',\n", + " {'dataset': 'CESM2-WACCM-FV2',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -429,9 +791,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CMCC'},\n", + " 'institute': 'NCAR'},\n", " Dataset:\n", - " {'dataset': 'CESM2-WACCM',\n", + " {'dataset': 'GISS-E2-1-H',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -440,9 +802,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'NCAR'},\n", + " 'institute': 'NASA-GISS'},\n", " Dataset:\n", - " {'dataset': 'BCC-ESM1',\n", + " {'dataset': 'BCC-CSM2-MR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -453,7 +815,7 @@ " 'grid': 'gn',\n", " 'institute': 'BCC'},\n", " Dataset:\n", - " {'dataset': 'GISS-E2-1-H',\n", + " {'dataset': 'CAS-ESM2-0',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -462,9 +824,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'NASA-GISS'},\n", + " 'institute': 'CAS'},\n", " Dataset:\n", - " {'dataset': 'MRI-ESM2-0',\n", + " {'dataset': 'MPI-ESM-1-2-HAM',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -473,9 +835,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'MRI'},\n", + " 'institute': 'HAMMOZ-Consortium'},\n", " Dataset:\n", - " {'dataset': 'TaiESM1',\n", + " {'dataset': 'CESM2-FV2',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -484,9 +846,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'AS-RCEC'},\n", + " 'institute': 'NCAR'},\n", " Dataset:\n", - " {'dataset': 'CAMS-CSM1-0',\n", + " {'dataset': 'BCC-ESM1',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -495,9 +857,9 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 'CAMS'},\n", + " 'institute': 'BCC'},\n", " Dataset:\n", - " {'dataset': 'MPI-ESM-1-2-HAM',\n", + " {'dataset': 'ICON-ESM-LR',\n", " 'project': 'CMIP6',\n", " 'mip': 'Amon',\n", " 'short_name': 'lwcre',\n", @@ -506,7 +868,7 @@ " 'exp': 'historical',\n", " 'force_derivation': True,\n", " 'grid': 'gn',\n", - " 'institute': 
'HAMMOZ-Consortium'}]" + " 'institute': 'MPI-M'}]" ] }, "execution_count": 9, @@ -560,7 +922,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "97cdf12d", "metadata": {}, "outputs": [ @@ -569,9 +931,9 @@ "output_type": "stream", "text": [ "rlut\n", - "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlut/gn/v20190323/rlut_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlut.gn')]\n", + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlut.gn')]\n", "rlutcs\n", - "[LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_185001-185912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_186001-186912.nc'), 
LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_187001-187912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_188001-188912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_189001-189912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_190001-190912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_191001-191912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_192001-192912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_193001-193912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_194001-194912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_195001-195912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_196001-196912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_197001-197912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_198001-198912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_199001-199912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_200001-200912.nc'), LocalFile('/mnt/d/data/CMIP6/CMIP/SNU/SAM0-UNICON/historical/r1i1p1f1/Amon/rlutcs/gn/v20190323/rlutcs_Amon_SAM0-UNICON_historical_r1i1p1f1_gn_201001-201412.nc'), IntakeESGFDataset(name='CMIP6.CMIP.SNU.SAM0-UNICON.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" ] } ], From d3adbce48e7fafdc5b983a4e78f7a8a0a3cf416e Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Mon, 19 Jan 2026 19:04:22 +0100 Subject: [PATCH 84/85] Required datasets don't need supplementaries --- esmvalcore/dataset.py | 2 - tests/unit/test_dataset.py | 119 ------------------------------------- 2 files changed, 121 deletions(-) diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index d9c781b794..853e86bdaa 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -212,8 +212,6 @@ def _get_required_datasets(self) -> list[Dataset]: for required_facets in required_vars_facets: required_dataset = self._copy(derive=False, force_derivation=False) - for supplementary in self.supplementaries: - required_dataset.supplementaries.append(supplementary.copy()) keep = {"alias", "recipe_dataset_index", *self.minimal_facets} required_dataset.facets = { k: v for k, v in required_dataset.facets.items() if k 
in keep diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index e605124de7..f70fd551f0 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1622,19 +1622,6 @@ def test_from_files_with_derived(rlut_file, rlutcs_file, session): ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1754,19 +1741,6 @@ def test_from_files_with_derived_glob( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -1827,19 +1801,6 @@ def test_from_files_with_derived_glob_differing_timerange( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2048,20 +2009,6 @@ def test_from_files_with_derived_force_derivation( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - force_derivation=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2133,20 +2080,6 @@ def test_from_files_with_derived_force_derivation_glob( # noqa: PLR0913 ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - force_derivation=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2239,19 +2172,6 @@ def test_from_files_with_derived_optional( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = session required_datasets = datasets[0].required_datasets @@ -2334,19 +2254,6 @@ def test_from_files_with_derived_glob_optional( ), ] for expected_ds in expected_required_datasets: - expected_ds.supplementaries = [ - Dataset( - **OBS6_SAT_FACETS, - short_name="pr", - derive=False, - frequency="mon", - long_name="Precipitation", - modeling_realm=["atmos"], - original_short_name="pr", - standard_name="precipitation_flux", - units="kg m-2 s-1", - ), - ] expected_ds.session = 
session

     required_datasets = datasets[0].required_datasets
@@ -2486,19 +2393,6 @@ def test_from_files_with_derived_only_optional(siconca_file, pr_file, session):
         ),
     ]
     for expected_ds in expected_required_datasets:
-        expected_ds.supplementaries = [
-            Dataset(
-                **OBS6_SAT_FACETS,
-                short_name="pr",
-                derive=False,
-                frequency="mon",
-                long_name="Precipitation",
-                modeling_realm=["atmos"],
-                original_short_name="pr",
-                standard_name="precipitation_flux",
-                units="kg m-2 s-1",
-            ),
-        ]
         expected_ds.session = session
 
     required_datasets = datasets[0].required_datasets
@@ -3658,19 +3552,6 @@ def test_required_datasets_derivation(session):
         ),
     ]
     for expected_dataset in expected_datasets:
-        expected_dataset.supplementaries = [
-            Dataset(
-                **OBS6_SAT_FACETS,
-                short_name="pr",
-                derive=False,
-                frequency="mon",
-                long_name="Precipitation",
-                modeling_realm=["atmos"],
-                original_short_name="pr",
-                standard_name="precipitation_flux",
-                units="kg m-2 s-1",
-            ),
-        ]
         expected_dataset.session = dataset.session
 
     assert dataset.required_datasets == expected_datasets

From 8c055a5f4372b444e7fade80d20dce0ba153d405 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 20 Jan 2026 10:07:20 +0100
Subject: [PATCH 85/85] Make _derivation_necessary faster by avoiding extra
 calls to dataset.files

---
 esmvalcore/dataset.py | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py
index 853e86bdaa..6d826b7ca2 100644
--- a/esmvalcore/dataset.py
+++ b/esmvalcore/dataset.py
@@ -182,25 +182,9 @@ def _is_force_derived(self) -> bool:
 
     def _derivation_necessary(self) -> bool:
         """Return ``True`` if derivation is necessary, ``False`` otherwise."""
-        # If variable cannot be derived, derivation is not necessary
-        if not self._is_derived():
-            return False
-
-        # If forced derivation is requested, derivation is necessary
-        if self._is_force_derived():
-            return True
-
-        # Otherwise, derivation is necessary if no files for the self dataset
-        # are found
-        ds_copy = self.copy()
-        ds_copy.supplementaries = []
-
-        # Avoid potential errors from missing data during timerange glob
-        # expansion
-        if _isglob(ds_copy.facets.get("timerange", "")):
-            ds_copy.facets.pop("timerange", None)
-
-        return not ds_copy.files
+        return not (
+            self.required_datasets and self.required_datasets[0] is self
+        )
 
     def _get_required_datasets(self) -> list[Dataset]:
         """Get required datasets for derivation."""
@@ -242,7 +226,29 @@ def required_datasets(self) -> list[Dataset]:
         if self._required_datasets is not None:
             return self._required_datasets
 
-        if not self._derivation_necessary():
+        def _derivation_needed(dataset: Dataset) -> bool:
+            """Check if derivation is needed."""
+            # If variable cannot be derived, derivation is not necessary
+            if not dataset._is_derived():
+                return False
+
+            # If forced derivation is requested, derivation is necessary
+            if dataset._is_force_derived():
+                return True
+
+            # Otherwise, derivation is necessary if no files are found for
+            # the given dataset
+            ds_copy = dataset.copy()
+            ds_copy.supplementaries = []
+
+            # Avoid potential errors from missing data during timerange glob
+            # expansion
+            if _isglob(ds_copy.facets.get("timerange", "")):
+                ds_copy.facets.pop("timerange", None)
+
+            return not ds_copy.files
+
+        if not _derivation_needed(self):
            self._required_datasets = [self]
        else:
            self._required_datasets = self._get_required_datasets()
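
Editor's note: to make the optimization in PATCH 85/85 easier to follow, here is a minimal, self-contained sketch of the pattern it introduces — run the expensive file lookup at most once, inside the cached required_datasets property, and answer "is derivation necessary?" from that cache via an identity check. This is an illustration only, not ESMValCore's actual API: the ToyDataset class and the _find_files helper are invented stand-ins for Dataset and the Dataset.files glob/ESGF search.

from __future__ import annotations


class ToyDataset:
    def __init__(self, short_name: str, derivable: bool, has_files: bool):
        self.short_name = short_name
        self._derivable = derivable
        self._has_files = has_files
        self._required_datasets: list[ToyDataset] | None = None

    def _find_files(self) -> list[str]:
        # Stand-in for the expensive file lookup (local glob / ESGF search).
        return ["file.nc"] if self._has_files else []

    @property
    def required_datasets(self) -> list[ToyDataset]:
        # The lookup runs at most once; the result is cached.
        if self._required_datasets is None:
            if self._derivable and not self._find_files():
                # Derivation needed: required datasets are the input variables.
                self._required_datasets = [
                    ToyDataset("rlut", derivable=False, has_files=True),
                    ToyDataset("rlutcs", derivable=False, has_files=True),
                ]
            else:
                # No derivation needed: the dataset itself is required.
                self._required_datasets = [self]
        return self._required_datasets

    def derivation_necessary(self) -> bool:
        # Answered from the cache: derivation is necessary exactly when the
        # first required dataset is not the dataset itself.
        rds = self.required_datasets
        return not (rds and rds[0] is self)


lwcre = ToyDataset("lwcre", derivable=True, has_files=False)
assert lwcre.derivation_necessary()  # triggers the single file lookup
assert lwcre.derivation_necessary()  # cache hit, no further lookup
print([ds.short_name for ds in lwcre.required_datasets])  # ['rlut', 'rlutcs']

The design point mirrored from the patch: before, _derivation_necessary and required_datasets each triggered their own files lookup; after, the check is derived from the already-cached required_datasets list, so repeated calls cost nothing.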