diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 1aae4b6aef..b630994682 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -37,7 +37,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -504,20 +503,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(_format_years(date), timerange_list) -def differing_timeranges( - timeranges: set[str], - required_vars: list[Facets], -) -> None: - """Log error if required variables have differing timeranges.""" - if len(timeranges) > 1: - msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 38f48fc663..d54d35fba5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -52,8 +52,7 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, - _get_input_datasets, + _get_required_datasets, _representative_datasets, ) @@ -251,7 +250,7 @@ def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], @@ -622,21 +621,26 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() def _get_preprocessor_products( @@ -662,28 +666,29 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) - input_datasets = _get_input_datasets(dataset) - missing = _check_input_files(input_datasets) + required_datasets = _get_required_datasets(dataset) + missing = _check_input_files(required_datasets) if missing: if _allow_skipping(dataset): 
logger.info("Skipping: %s", missing) else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) - _schedule_for_download(input_datasets) - _log_input_files(input_datasets) + _schedule_for_download(required_datasets) + _log_input_files(required_datasets) logger.info("Found input files for %s", dataset.summary(shorten=True)) filename = _get_preprocessor_filename(dataset) product = PreprocessorFile( filename=filename, attributes=dataset.facets, settings=settings, - datasets=input_datasets, + datasets=required_datasets, ) products.add(product) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 460b42fca5..baf0cb7246 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.exceptions import RecipeError from esmvalcore.io.esgf.facets import FACETS from esmvalcore.io.local import _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -189,28 +188,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], - dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -434,9 +411,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -447,53 +422,25 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue - - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries - - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in 
all_datasets[1:]] - if all(updated_facets in facets for facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, + # The magic happens in Dataset.from_files. Here, we simply check if any + # wildcards have not been expanded and raise proper errors if necessary. + for expanded_ds in dataset.from_files(): + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key not in expanded_ds.facets or _isglob(expanded_ds[key]): + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + unexpanded_globs, ) + errors.append(msg) + continue + + result.append(expanded_ds) if errors: raise RecipeError("\n".join(errors)) @@ -538,66 +485,33 @@ def _report_unexpanded_globs( return msg -def _derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files +def _get_required_datasets(dataset: Dataset) -> list[Dataset]: + """Determine the datasets required for deriving `dataset`.""" + if not dataset._derivation_necessary(): # noqa: SLF001 + return dataset.required_datasets - -def _get_input_datasets(dataset: Dataset) -> list[Dataset]: - """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) - return [dataset] - - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + required_datasets: list[Dataset] = [] + for required_dataset in dataset.required_datasets: + if ( + required_dataset.facets.get("optional") + and not required_dataset.files + ): logger.info( "Skipping: no data found for %s which is marked as 'optional'", - input_dataset, + required_dataset, ) else: - datasets.append(input_dataset) + required_datasets.append(required_dataset) - # Check timeranges of available input data. 
- timeranges: set[str] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) # type: ignore - check.differing_timeranges(timeranges, required_vars) - - return datasets + return required_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: """Find representative datasets for all input variables.""" copy = dataset.copy() copy.supplementaries = [] - representative_datasets = _get_input_datasets(copy) + representative_datasets = _get_required_datasets(copy) for representative_dataset in representative_datasets: representative_dataset.supplementaries = dataset.supplementaries return representative_datasets diff --git a/esmvalcore/config/_validated_config.py b/esmvalcore/config/_validated_config.py index 505434c419..7a5af93224 100644 --- a/esmvalcore/config/_validated_config.py +++ b/esmvalcore/config/_validated_config.py @@ -60,9 +60,9 @@ class ValidatedConfig(MutableMapping): """ # validate values on the way in - def __init__(self, *args, **kwargs): + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__() - self._mapping = {} + self._mapping: dict[str, Any] = {} self.update(*args, **kwargs) def __setitem__(self, key, val): diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 41690dd8a9..6d826b7ca2 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -33,6 +33,7 @@ from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.io.local import _dates_to_timerange from esmvalcore.preprocessor import _get_preprocessor_filename, preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -104,7 +105,7 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. 
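The hunks below introduce the `Dataset.required_datasets` property that replaces the derivation helpers removed from `esmvalcore/_recipe/to_datasets.py` above. A minimal usage sketch, assembled from the property docstring and the lwcre notebook example later in this diff (the facet values are illustrative only, not prescriptive):

    from esmvalcore.dataset import Dataset

    # lwcre (longwave cloud radiative effect) is a derived variable whose
    # derivation requires rlut and rlutcs. With force_derivation=True, the
    # property is expected to return one Dataset per required input variable.
    dataset = Dataset(
        project="CMIP6",
        dataset="CESM2",
        exp="historical",
        ensemble="r1i1p1f1",
        grid="gn",
        mip="Amon",
        short_name="lwcre",
        derive=True,
        force_derivation=True,
    )
    for required in dataset.required_datasets:
        print(required["short_name"])  # expected: rlut, rlutcs

For a non-derived variable, or when files for the variable itself are found and `force_derivation` is not set, `required_datasets` is simply `[dataset]`.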
@@ -136,6 +137,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[DataElement] | None = None self._used_data_sources: Sequence[DataSource] = [] + self._required_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -180,43 +182,103 @@ def _is_force_derived(self) -> bool: def _derivation_necessary(self) -> bool: """Return ``True`` if derivation is necessary, ``False`` otherwise.""" - # If variable cannot be derived, derivation is not necessary - if not self._is_derived(): - return False + return not ( + self.required_datasets and self.required_datasets[0] is self + ) - # If forced derivation is requested, derivation is necessary - if self._is_force_derived(): - return True + def _get_required_datasets(self) -> list[Dataset]: + """Get required datasets for derivation.""" + required_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], # type: ignore + self.facets["project"], # type: ignore + ) + + for required_facets in required_vars_facets: + required_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + required_dataset.facets = { + k: v for k, v in required_dataset.facets.items() if k in keep + } + required_dataset.facets.update(required_facets) + required_dataset.augment_facets() + required_datasets.append(required_dataset) + + return required_datasets + + @property + def required_datasets(self) -> list[Dataset]: + """Get required datasets. + + For non-derived variables (i.e., those without a ``derive`` facet or + with facet ``derive=False``), this will simply return the dataset + itself in a list. + + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. - # Otherwise, derivation is necessary of no files for the self dataset - # are found - ds_copy = self.copy() - ds_copy.supplementaries = [] - return not ds_copy.files + See also :func:`esmvalcore.preprocessor.derive` for an example usage. 
+ + """ + if self._required_datasets is not None: + return self._required_datasets + + def _derivation_needed(dataset: Dataset) -> bool: + """Check if derivation is nedeed.""" + # If variable cannot be derived, derivation is not necessary + if not dataset._is_derived(): + return False + + # If forced derivation is requested, derivation is necessary + if dataset._is_force_derived(): + return True + + # Otherwise, derivation is necessary if no files for the self + # dataset are found + ds_copy = dataset.copy() + ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + + return not ds_copy.files + + if not _derivation_needed(self): + self._required_datasets = [self] + else: + self._required_datasets = self._get_required_datasets() + return self._required_datasets + + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: DataElement, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. - unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = { "project", "mip", @@ -225,60 +287,139 @@ def _file_to_dataset( "dataset", } if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) - return dataset + return new_dataset - def _get_available_datasets(self) -> Iterator[Dataset]: + @staticmethod + def _get_expanded_globs( + dataset_with_globs: Dataset, + dataset_with_expanded_globs: Dataset, + ) -> tuple[tuple[str, FacetValue], ...]: + """Get facets that have been updated by expanding globs.""" + expanded_globs: dict[str, FacetValue] = {} + for key, value in dataset_with_globs.facets.items(): + if ( + _isglob(value) + and key in dataset_with_expanded_globs.facets + and not _isglob(dataset_with_expanded_globs[key]) + ): + expanded_globs[key] = dataset_with_expanded_globs[key] + return tuple(expanded_globs.items()) + + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: + """Yield datasets based on the available files. + + This function requires that dataset.facets['mip'] is not a glob + pattern. + + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. 
+ + """ + if not dataset._derivation_necessary(): + yield from Dataset._get_available_datasets(dataset) + return + + # Since we are in full control of the derived variables (the module is + # private; no custom derivation functions are possible), we can be sure + # that the following list is never empty + non_optional_datasets = [ + d + for d in dataset.required_datasets + if not d.facets.get("optional", False) + ] + if not non_optional_datasets: + msg = ( + f"Using wildcards to derive {dataset.summary(shorten=True)} " + f"is not possible, derivation function only requires optional " + f"variables" + ) + raise RecipeError(msg) + + # Record all expanded globs from first non-optional required dataset + # (called "reference_dataset" hereafter) + reference_dataset = non_optional_datasets[0] + reference_expanded_globs = { + Dataset._get_expanded_globs(dataset, ds) + for ds in Dataset._get_available_datasets(reference_dataset) + } + + # Iterate through all other non-optional required datasets and only + # keep those expanded globs which are present for all other + # non-optional required datasets + for required_dataset in non_optional_datasets: + if required_dataset is reference_dataset: + continue + new_expanded_globs = { + Dataset._get_expanded_globs(dataset, ds) + for ds in Dataset._get_available_datasets(required_dataset) + } + reference_expanded_globs &= new_expanded_globs + + # Use the final expanded globs to create new dataset(s) + for expanded_globs in reference_expanded_globs: + new_ds = dataset.copy() + new_ds.facets.update(dict(expanded_globs)) + yield new_ds + + @staticmethod + def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. + + Does not take variable derivation into account, i.e., datasets + potentially available through variable derivation are ignored. To + consider derived variables properly, use the function + :func:`_get_all_available_datasets`. + """ - dataset_template = self.copy() + dataset_template = dataset.copy() dataset_template.supplementaries = [] seen = set() partially_defined = [] expanded = False for file in dataset_template.files: - dataset = self._file_to_dataset(file) - # Do not use the timerange facet from the file because there may be multiple - # files per dataset. - dataset.facets.pop("timerange", None) + new_dataset = Dataset._file_to_dataset(dataset, file) + # Do not use the timerange facet from the file because there may be + # multiple files per dataset. + new_dataset.facets.pop("timerange", None) # Restore the original timerange facet if it was specified. 
- if "timerange" in self.facets: - dataset.facets["timerange"] = self.facets["timerange"] + if "timerange" in dataset.facets: + new_dataset.facets["timerange"] = dataset.facets["timerange"] # Filter out identical datasets facetset = frozenset( (f, frozenset(v) if isinstance(v, list) else v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() ) if facetset not in seen: seen.add(facetset) if any( _isglob(v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() if f != "timerange" ): - partially_defined.append((dataset, file)) + partially_defined.append((new_dataset, file)) else: - dataset._update_timerange() # noqa: SLF001 - dataset._supplementaries_from_files() # noqa: SLF001 + new_dataset._update_timerange() # noqa: SLF001 expanded = True - yield dataset + yield new_dataset # Only yield datasets with globs if there is no better alternative - for dataset, file in partially_defined: + for new_dataset, file in partially_defined: msg = ( - f"{dataset} with unexpanded wildcards, created from file " + f"{new_dataset} with unexpanded wildcards, created from file " f"{file} with facets {file.facets}. Please check why " "the missing facets are not available for the file." "This will depend on the data source they come from, e.g. can " @@ -293,7 +434,7 @@ def _get_available_datasets(self) -> Iterator[Dataset]: "because it still contains wildcards.", msg, ) - yield dataset + yield new_dataset def from_files(self) -> Iterator[Dataset]: """Create datasets based on the available files. @@ -317,6 +458,10 @@ def from_files(self) -> Iterator[Dataset]: Supplementary datasets will in inherit the facet values from the main dataset for those facets listed in :obj:`INHERITED_FACETS`. + This also works for :ref:`derived variables `. The + datasets required for derivation can be accessed via + :attr:`Dataset.required_datasets`. + Examples -------- See :doc:`/notebooks/discovering-data` notebook for example use cases. @@ -326,52 +471,66 @@ def from_files(self) -> Iterator[Dataset]: Dataset Datasets representing the available files. 
""" + # No wildcards present -> simply return self with expanded + # supplementaries + if not any(_isglob(v) for v in self.facets.values()): + self._supplementaries_from_files() + yield self + return + + # Wildcards present -> expand them expanded = False - if any(_isglob(v) for v in self.facets.values()): - if _isglob(self.facets["mip"]): - available_mips = _get_mips( - self.facets["project"], # type: ignore - self.facets["short_name"], # type: ignore + if _isglob(self.facets["mip"]): + available_mips = _get_mips( + self.facets["project"], # type: ignore + self.facets["short_name"], # type: ignore + ) + mips = [ + mip + for mip in available_mips + if _ismatch(mip, self.facets["mip"]) + ] + else: + mips = [self.facets["mip"]] # type: ignore + + for mip in mips: + if _isglob(self.facets.get("branding_suffix", "")): + available_branding_suffixes = _get_branding_suffixes( + project=self.facets["project"], # type: ignore[arg-type] + mip=mip, + short_name=self.facets["short_name"], # type: ignore[arg-type] ) - mips = [ - mip - for mip in available_mips - if _ismatch(mip, self.facets["mip"]) + branding_suffixes = [ + branding_suffix + for branding_suffix in available_branding_suffixes + if _ismatch( + branding_suffix, + self.facets["branding_suffix"], + ) + ] + dataset_templates = [ + self.copy(mip=mip, branding_suffix=branding_suffix) + for branding_suffix in branding_suffixes ] else: - mips = [self.facets["mip"]] # type: ignore - - for mip in mips: - if _isglob(self.facets.get("branding_suffix", "")): - available_branding_suffixes = _get_branding_suffixes( - project=self.facets["project"], # type: ignore[arg-type] - mip=mip, - short_name=self.facets["short_name"], # type: ignore[arg-type] - ) - branding_suffixes = [ - branding_suffix - for branding_suffix in available_branding_suffixes - if _ismatch( - branding_suffix, - self.facets["branding_suffix"], - ) - ] - dataset_templates = [ - self.copy(mip=mip, branding_suffix=branding_suffix) - for branding_suffix in branding_suffixes - ] - else: - dataset_templates = [self.copy(mip=mip)] - for dataset_template in dataset_templates: - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 - expanded = True - yield dataset + dataset_templates = [self.copy(mip=mip)] + for dataset_template in dataset_templates: + for dataset in self._get_all_available_datasets( + dataset_template, + ): + dataset._supplementaries_from_files() # noqa: SLF001 + expanded = True + yield dataset + # If files were found, or the file facets didn't match the + # specification, yield the original, but do expand any supplementary + # globs. For derived variables, make sure to purge any files found for + # required variables; those won't match in their facets. if not expanded: - # If the definition contains no wildcards, no files were found, - # or the file facets didn't match the specification, yield the - # original, but do expand any supplementary globs. 
self._supplementaries_from_files() + if self._derivation_necessary(): + for required_dataset in self.required_datasets: + required_dataset.files = [] yield self def _supplementaries_from_files(self) -> None: @@ -638,15 +797,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for required_dataset in self.required_datasets: + version = self._get_version(required_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() @@ -1007,8 +1180,9 @@ def _update_timerange(self) -> None: dataset = self.copy() dataset.facets.pop("timerange") dataset.supplementaries = [] - check.data_availability(dataset) - if all("timerange" in f.facets for f in dataset.files): + if dataset.files and all( + "timerange" in f.facets for f in dataset.files + ): # "timerange" can only be reliably computed when all DataElements # provide it. intervals = [ diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 3a25d1f9a4..3817b86bc7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -13,6 +13,8 @@ from esmvalcore.preprocessor._units import convert_units if TYPE_CHECKING: + from collections.abc import Sequence + from cf_units import Unit from iris.cube import Cube @@ -77,7 +79,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -88,8 +90,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -103,6 +104,38 @@ def derive( ------- iris.cube.Cube The new derived variable. + + Examples + -------- + Required variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.required_datasets`. + + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: + + >>> from esmvalcore.dataset import Dataset + >>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.required_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... 
) + >>> print(cube.var_name) + lwcre # doctest: +SKIP + """ if short_name == cubes[0].var_name: return cubes[0] diff --git a/esmvalcore/preprocessor/_derive/amoc.py b/esmvalcore/preprocessor/_derive/amoc.py index 3607aa1d62..67b179f0dd 100644 --- a/esmvalcore/preprocessor/_derive/amoc.py +++ b/esmvalcore/preprocessor/_derive/amoc.py @@ -72,9 +72,7 @@ def calculate(cubes): f"Amoc calculation: {cube_orig} doesn't contain" f" atlantic_arctic_ocean." ) - raise ValueError( - msg, - ) + raise ValueError(msg) # 2: Remove the shallowest 500m to avoid wind driven mixed layer. depth_constraint = iris.Constraint(depth=lambda d: d >= 500.0) diff --git a/esmvalcore/preprocessor/_derive/siextent.py b/esmvalcore/preprocessor/_derive/siextent.py index 27aee25aec..5bd2ca82f1 100644 --- a/esmvalcore/preprocessor/_derive/siextent.py +++ b/esmvalcore/preprocessor/_derive/siextent.py @@ -20,8 +20,8 @@ class DerivedVariable(DerivedVariableBase): def required(project): # noqa: ARG004 """Declare the variables needed for derivation.""" return [ - {"short_name": "sic", "optional": "true"}, - {"short_name": "siconca", "optional": "true"}, + {"short_name": "sic", "optional": True}, + {"short_name": "siconca", "optional": True}, ] @staticmethod @@ -53,9 +53,7 @@ def calculate(cubes): "Derivation of siextent failed due to missing variables " "sic and siconca." ) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc ones = da.ones_like(sic) siextent_data = da.ma.masked_where(sic.lazy_data() < 15.0, ones) diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..10676a8b72 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,14 +13,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], "source": [ "from esmvalcore.config import CFG\n", - "from esmvalcore.dataset import Dataset\n", - "from esmvalcore.esgf import download" + "from esmvalcore.dataset import Dataset" ] }, { @@ -39,7 +38,33 @@ "metadata": {}, "outputs": [], "source": [ - "CFG[\"search_esgf\"] = \"always\"" + "CFG[\"search_data\"] = \"complete\"\n", + "CFG[\"projects\"].pop(\"CMIP6\", None) # Clear existing CMIP6 configuration\n", + "CFG.nested_update(\n", + " {\n", + " \"projects\": {\n", + " \"CMIP6\": {\n", + " \"data\": {\n", + " \"intake-esgf\": {\n", + " \"type\": \"esmvalcore.io.intake_esgf.IntakeESGFDataSource\",\n", + " \"priority\": 2,\n", + " \"facets\": {\n", + " \"activity\": \"activity_drs\",\n", + " \"dataset\": \"source_id\",\n", + " \"ensemble\": \"member_id\",\n", + " \"exp\": \"experiment_id\",\n", + " \"institute\": \"institution_id\",\n", + " \"grid\": \"grid_label\",\n", + " \"mip\": \"table_id\",\n", + " \"project\": \"project\",\n", + " \"short_name\": \"variable_id\",\n", + " },\n", + " },\n", + " },\n", + " },\n", + " },\n", + " },\n", + ")" ] }, { @@ -89,7 +114,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 906 datasets, showing the first 10:\n" ] }, { @@ -253,7 +278,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + 
"[IntakeESGFDataset(name='CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn')]" ] }, "execution_count": 6, @@ -270,7 +295,7 @@ "id": "60d88a34-c886-4b9d-a9e9-a9d18fa97917", "metadata": {}, "source": [ - "A single file can be downloaded using its `download` method:" + "Load a single file as `iris.cube.CubeList`:" ] }, { @@ -281,8 +306,402 @@ "outputs": [ { "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "
Air Temperature (K)timelatitudelongitude
Shape1980192288
Dimension coordinates
\ttimex--
\tlatitude-x-
\tlongitude--x
Scalar coordinates
\theight2.0 m
Cell methods
\t0area: time: mean
Attributes
\tConventions'CF-1.7 CMIP-6.2'
\tactivity_drs'CMIP'
\tactivity_id'CMIP'
\tbranch_method'Hybrid-restart from year 0671-01-01 of piControl'
\tbranch_timenp.float64(0.0)
\tbranch_time_in_childnp.float64(0.0)
\tbranch_time_in_parentnp.float64(171550.0)
\tcmor_version'3.5.0'
\tcomment'near-surface (usually, 2 meter) air temperature'
\tcontact'Dr. Wei-Liang Lee (leelupin@gate.sinica.edu.tw)'
\tcreation_date'2020-06-08T08:53:23Z'
\tdata_specs_version'01.00.31'
\texperiment'all-forcing simulation of the recent past'
\texperiment_id'historical'
\texternal_variables'areacella'
\tforcing_indexnp.int32(1)
\tfrequency'mon'
\tfurther_info_url'https://furtherinfo.es-doc.org/CMIP6.AS-RCEC.TaiESM1.historical.none.r ...'
\tgrid'finite-volume grid with 0.9x1.25 degree lat/lon resolution'
\tgrid_label'gn'
\thistory"2020-06-08T08:53:23Z altered by CMOR: Treated scalar dimension: 'height'. ..."
\tinitialization_indexnp.int32(1)
\tinstitution'Research Center for Environmental Changes, Academia Sinica, Nankang, Taipei ...'
\tinstitution_id'AS-RCEC'
\tlicense'CMIP6 model data produced by NCC is licensed under a Creative Commons Attribution ...'
\tmember_id'r1i1p1f1'
\tmip_era'CMIP6'
\tmodel_id'TaiESM1'
\tnominal_resolution'100 km'
\toriginal_name'TREFHT'
\tparent_activity_id'CMIP'
\tparent_experiment_id'piControl'
\tparent_mip_era'CMIP6'
\tparent_source_id'TaiESM1'
\tparent_sub_experiment_id'none'
\tparent_time_units'days since 1850-01-01'
\tparent_variant_label'r1i1p1f1'
\tphysics_indexnp.int32(1)
\tproduct'model-output'
\trealization_indexnp.int32(1)
\trealm'atmos'
\treferences'10.5194/gmd-2019-377'
\trun_variant'N/A'
\tsource'TaiESM 1.0 (2018): \\naerosol: SNAP (same grid as atmos)\\natmos: TaiAM1 ...'
\tsource_file'/mnt/d/data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn ...'
\tsource_id'TaiESM1'
\tsource_type'AOGCM AER BGC'
\tsub_experiment'none'
\tsub_experiment_id'none'
\ttable_id'Amon'
\ttable_info'Creation Date:(24 July 2019) MD5:0bb394a356ef9d214d027f1aca45853e'
\ttitle'TaiESM1 output prepared for CMIP6'
\ttracking_id'hdl:21.14100/997cf563-6411-4a78-a9c4-7369ae27d698'
\tvariable_id'tas'
\tvariant_label'r1i1p1f1'
\n", + "

\n", + "
\n", + " \n", + " " + ], "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "[]" ] }, "execution_count": 7, @@ -291,32 +710,243 @@ } ], "source": [ - "dataset.files[0].download(CFG[\"download_dir\"])" + "cubes = dataset.files[0].to_iris()\n", + "cubes" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "3821b594-3797-497b-a51d-1798d5b2fc80", + "id": "d3006d90", "metadata": {}, "source": [ - "For downloading many files, the [esmvalcore.esgf.download](https://docs.esmvaltool.org/projects/esmvalcore/en/latest/api/esmvalcore.esgf.html#esmvalcore.esgf.download) function is recommended because it will download the files in parallel. The ESMValCore will try to guess the fastest host and download from there. If it is not available for some reason, it will automatically fall back to the next host." + "`Dataset.from_files` can also handle derived variables properly:" ] }, { "cell_type": "code", "execution_count": 8, - "id": "9676ff81-232e-4ff8-b784-686f0d06c469", + "id": "b75314e3", "metadata": {}, "outputs": [], "source": [ - "download(dataset.files, CFG[\"download_dir\"])" + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 37 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'GISS-E2-2-G',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NASA-GISS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'CESM2-WACCM-FV2',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NCAR'},\n", + " Dataset:\n", + " {'dataset': 'GISS-E2-1-H',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NASA-GISS'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 
'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'MPI-ESM-1-2-HAM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'HAMMOZ-Consortium'},\n", + " Dataset:\n", + " {'dataset': 'CESM2-FV2',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NCAR'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'ICON-ESM-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'MPI-M'}]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. 
In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlut.gn')]\n", + "rlutcs\n", + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" + ] + } + ], + "source": [ + "for d in dataset.required_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" ] } ], "metadata": { "kernelspec": { - "display_name": "esm", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -330,12 +960,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "17e81e49408864327be43d3caebcb8eca32ff92a01becb15aa27be73c37f0517" - } + "version": "3.13.11" } }, "nbformat": 4, diff --git a/tests/conftest.py b/tests/conftest.py index 46cabf58f9..3c19e4c4df 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import warnings from copy import deepcopy from functools import lru_cache @@ -7,6 +8,7 @@ import numpy as np import pytest +import yaml from cf_units import Unit from iris.coords import ( AncillaryVariable, @@ -17,6 +19,7 @@ ) from iris.cube import Cube +import esmvalcore from esmvalcore.config import CFG, Config if TYPE_CHECKING: @@ -55,6 +58,33 @@ def ignore_existing_user_config( monkeypatch.setattr(CFG, "_mapping", cfg_default._mapping) +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + @pytest.fixture def session( tmp_path: Path, @@ -63,7 +93,15 @@ def session( ) -> Session: """Session object with default settings.""" monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") - return CFG.start_session("recipe_test") + session = CFG.start_session("recipe_test") + projects = _load_default_data_sources()["projects"] + for project in projects: + print(project) + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session @pytest.fixture diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 44171a76ad..e801789e70 100644 --- a/tests/integration/recipe/test_check.py +++ 
b/tests/integration/recipe/test_check.py @@ -272,27 +272,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, - ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." - ) - - assert expected_log in str(exc.value) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index b87a696387..565f347b3b 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -22,7 +22,7 @@ import esmvalcore.io.esgf import esmvalcore.io.local from esmvalcore._recipe.recipe import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, read_recipe_file, ) @@ -182,7 +182,7 @@ def get_required(short_name, _): ] monkeypatch.setattr( - esmvalcore._recipe.to_datasets, + esmvalcore.dataset, "get_required", get_required, ) @@ -1707,7 +1707,7 @@ def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, assert dataset["alias"] == "CORDEX_ICHEC-EC-EARTH" else: assert dataset["alias"] == "CORDEX_MIROC-MIROC5" - elif dataset["version"] == 1: + elif dataset["version"] == "1": assert dataset["alias"] == "OBS_1" else: assert dataset["alias"] == "OBS_2" @@ -2599,9 +2599,7 @@ def test_representative_dataset_derived_var( expected_facets: Facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2611,6 +2609,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2665,9 +2666,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2680,6 +2679,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2689,9 +2691,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2704,11 +2704,14 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by 
Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session - alb_derive_input = _get_input_datasets(alb) + alb_derive_input = _get_required_datasets(alb) assert alb_derive_input == [rsdscs, rsuscs] diff --git a/tests/unit/preprocessor/_derive/test_siextent.py b/tests/unit/preprocessor/_derive/test_siextent.py index ae9f5d1c8f..416c9ac17b 100644 --- a/tests/unit/preprocessor/_derive/test_siextent.py +++ b/tests/unit/preprocessor/_derive/test_siextent.py @@ -113,6 +113,6 @@ def test_siextent_required(): derived_var = siextent.DerivedVariable() output = derived_var.required(None) assert output == [ - {"short_name": "sic", "optional": "true"}, - {"short_name": "siconca", "optional": "true"}, + {"short_name": "sic", "optional": True}, + {"short_name": "siconca", "optional": True}, ] diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 6ed350c34d..fd19bc21a3 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -865,28 +865,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, @@ -956,3 +934,23 @@ def test_special_name_to_dataset_invalid_special_name_type(): ) with pytest.raises(RecipeError, match=msg): _recipe._special_name_to_dataset(facets, "reference_dataset") + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index c1834d002e..877d7681e8 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import textwrap from pathlib import Path from typing import TYPE_CHECKING @@ -284,7 +285,7 @@ def test_merge_supplementaries_missing_short_name_fails(session): Dataset.from_recipe(recipe_txt, session) -def test_get_input_datasets_derive(session): +def test_get_required_datasets_derive(session): dataset = Dataset( dataset="ERA5", project="native6", @@ -299,7 +300,7 @@ def test_get_input_datasets_derive(session): type="reanaly", version="v1", ) - rlds, rlns = to_datasets._get_input_datasets(dataset) + rlds, rlns = to_datasets._get_required_datasets(dataset) assert rlds["short_name"] == "rlds" assert rlds["long_name"] == "Surface Downwelling Longwave Radiation" assert rlds["frequency"] == "1hr" @@ -308,6 +309,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def 
test_get_required_datasets_optional(caplog, tmp_path, session):
+    facets = {
+        "project": "OBS6",
+        "dataset": "SAT",
+        "mip": "SImon",
+        "short_name": "siextent",
+        "tier": 2,
+        "type": "sat",
+        "timerange": "1980/2000",
+        "derive": True,
+    }
+
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    sic_file = LocalFile(
+        input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc",
+    )
+    sic_file.touch()
+
+    dataset = Dataset(**facets)
+    dataset.files = []
+    dataset.session = session
+
+    with caplog.at_level(logging.INFO):
+        datasets = to_datasets._get_required_datasets(dataset)
+
+    expected = Dataset(
+        dataset="SAT",
+        project="OBS6",
+        mip="SImon",
+        short_name="siconca",
+        derive=False,
+        frequency="mon",
+        long_name="Sea-Ice Area Percentage (Atmospheric Grid)",
+        modeling_realm=["seaIce"],
+        optional=True,
+        original_short_name="siconca",
+        standard_name="sea_ice_area_fraction",
+        tier=2,
+        timerange="1980/2000",
+        type="sat",
+        units="%",
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+
+    logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"]
+    assert "which is marked as 'optional'" in logger_infos[-1]
+
+
 def test_max_years(session):
     recipe_txt = textwrap.dedent("""
         diagnostics:
@@ -355,26 +407,6 @@ def from_files(_):
     to_datasets._dataset_from_files(dataset)
 
 
-def test_fix_cmip5_fx_ensemble(monkeypatch):
-    def find_files(self):
-        if self.facets["ensemble"] == "r0i0p0":
-            self._files = ["file1.nc"]
-
-    monkeypatch.setattr(Dataset, "find_files", find_files)
-
-    dataset = Dataset(
-        dataset="dataset1",
-        short_name="orog",
-        mip="fx",
-        project="CMIP5",
-        ensemble="r1i1p1",
-    )
-
-    to_datasets._fix_cmip5_fx_ensemble(dataset)
-
-    assert dataset["ensemble"] == "r0i0p0"
-
-
 def test_get_supplementary_short_names(monkeypatch):
     def _update_cmor_facets(facets):
         facets["modeling_realm"] = "atmos"
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index 90e5ff50fb..f70fd551f0 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -1,15 +1,12 @@
 from __future__ import annotations
 
-import importlib.resources
 import textwrap
 from collections import defaultdict
-from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 import pyesgf
 import pytest
-import yaml
 
 import esmvalcore.dataset
 import esmvalcore.io.esgf
@@ -19,6 +16,7 @@
 from esmvalcore.dataset import Dataset
 from esmvalcore.exceptions import InputFilesNotFound, RecipeError
 from esmvalcore.io.esgf import ESGFFile
+from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase
 
 if TYPE_CHECKING:
     from pytest_mock import MockerFixture
@@ -26,45 +24,6 @@
     from esmvalcore.typing import Facets
 
 
-@lru_cache
-def _load_default_data_sources() -> dict[
-    str,
-    dict[str, dict[str, dict[str, dict[str, str]]]],
-]:
-    """Load default data sources for local users."""
-    cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = {
-        "projects": {},
-    }
-    for file in (
-        "data-local.yml",
-        "data-local-esmvaltool.yml",
-        "data-native-cesm.yml",
-        "data-native-emac.yml",
-        "data-native-icon.yml",
-        "data-native-ipslcm.yml",
-    ):
-        with importlib.resources.as_file(
-            importlib.resources.files(esmvalcore.config)
-            / "configurations"
-            / file,
-        ) as config_file:
-            content = config_file.read_text(encoding="utf-8")
-            cfg["projects"].update(yaml.safe_load(content)["projects"])
-    return cfg
-
-
-@pytest.fixture
-def session(tmp_path: Path, session: Session) -> Session:
-    """Session fixture with default local data sources."""
-    projects = _load_default_data_sources()["projects"]
-    for project in projects:
-        data_sources = projects[project]["data"]
-        for data_source in data_sources.values():
-            data_source["rootpath"] = str(tmp_path)
-        session["projects"][project]["data"] = data_sources
-    return session
-
-
 def test_repr():
     ds = Dataset(short_name="tas", dataset="dataset1")
@@ -1365,211 +1324,1303 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session):
     assert datasets == [expected]
 
 
-def test_match():
-    dataset1 = Dataset(
-        short_name="areacella",
-        ensemble=["r1i1p1f1"],
-        exp="historical",
-        modeling_realm=["atmos", "land"],
-    )
-    dataset2 = Dataset(
-        short_name="tas",
-        ensemble="r1i1p1f1",
-        exp=["historical", "ssp585"],
-        modeling_realm=["atmos"],
-    )
+OBS6_SAT_FACETS: Facets = {
+    "project": "OBS6",
+    "dataset": "SAT",
+    "mip": "Amon",
+    "tier": 2,
+    "type": "sat",
+    "timerange": "1980/2000",
+}
 
-    score = dataset1._match(dataset2)
-    assert score == 3
 
+def test_from_files_no_files_glob(session):
+    dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas")
+    datasets = list(dataset.from_files())
+    assert datasets == [dataset]
 
-def test_remove_duplicate_supplementaries():
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_files_glob(timerange, session):
     dataset = Dataset(
-        dataset="dataset1",
-        short_name="tas",
-        mip="Amon",
-        project="CMIP6",
-        exp="historical",
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
     )
-    supplementary1 = dataset.copy(short_name="areacella")
-    supplementary2 = supplementary1.copy()
-    supplementary1.facets["exp"] = "1pctCO2"
-    dataset.supplementaries = [supplementary1, supplementary2]
+    datasets = list(dataset.from_files())
+    assert datasets == [dataset]
 
-    dataset._remove_duplicate_supplementaries()
 
-    assert len(dataset.supplementaries) == 1
-    assert dataset.supplementaries[0] == supplementary2
+@pytest.fixture
+def lwcre_file(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    lwcre = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc",
+    )
+    lwcre.touch()
+    return lwcre
 
 
-def test_remove_not_found_supplementaries():
-    dataset = Dataset(
-        dataset="dataset1",
-        short_name="tas",
-        mip="Amon",
-        project="CMIP6",
-        exp="historical",
+@pytest.fixture
+def lwcre_file_ground(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    lwcre = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc",
     )
-    dataset.add_supplementary(short_name="areacella", mip="fx", exp="*")
-    dataset._remove_unexpanded_supplementaries()
+    lwcre.touch()
+    return lwcre
 
-    assert len(dataset.supplementaries) == 0
 
+@pytest.fixture
+def rlut_file(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    rlut = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc",
+    )
+    rlut.touch()
+    return rlut
 
-def test_concatenating_historical_and_future_exps(mocker):
-    mocker.patch.object(Dataset, "files", True)
-    dataset = Dataset(
-        dataset="dataset1",
-        short_name="tas",
-        mip="Amon",
-        frequency="mon",
-        project="CMIP6",
-        exp=["historical", "ssp585"],
+
+@pytest.fixture
+def rlut_file_future(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    rlut = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc",
"OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc", ) - dataset.add_supplementary(short_name="areacella", mip="fx", frequency="fx") - dataset._fix_fx_exp() + rlut.touch() + return rlut - assert len(dataset.supplementaries) == 1 - assert dataset.facets["exp"] == ["historical", "ssp585"] - assert dataset.supplementaries[0].facets["exp"] == "historical" +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut -def test_from_recipe_with_glob(tmp_path: Path, session: Session) -> None: - recipe_txt = textwrap.dedent(""" - diagnostics: - diagnostic1: - variables: - tas: - project: CMIP5 - mip: Amon - exp: rcp85 - ensemble: r1i1p1 - additional_datasets: - - {dataset: '*', institute: '*'} - """) - recipe = tmp_path / "recipe_test.yml" - recipe.write_text(recipe_txt, encoding="utf-8") +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs - filenames = [ - "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" - "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", - "cmip5/output1/NIMR-KMA/HadGEM2-AO/rcp85/mon/atmos/Amon/r1i1p1/" - "v20130815/tas_Amon_HadGEM2-AO_rcp85_r1i1p1_186001-200512.nc", - ] - for filename in filenames: - path = tmp_path / filename - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("") - definitions: list[Facets] = [ - { - "diagnostic": "diagnostic1", - "variable_group": "tas", - "dataset": "CSIRO-Mk3-6-0", - "project": "CMIP5", - "mip": "Amon", - "short_name": "tas", - "alias": "CSIRO-Mk3-6-0", - "recipe_dataset_index": 0, - "exp": "rcp85", - "ensemble": "r1i1p1", - "institute": "CSIRO-QCCCE", - }, - { - "diagnostic": "diagnostic1", - "variable_group": "tas", - "dataset": "HadGEM2-AO", - "project": "CMIP5", - "mip": "Amon", - "short_name": "tas", - "alias": "HadGEM2-AO", - "recipe_dataset_index": 1, - "exp": "rcp85", - "ensemble": "r1i1p1", - "institute": "NIMR-KMA", - }, - ] - expected = [] - for facets in definitions: - dataset = Dataset(**facets) - dataset.session = session - expected.append(dataset) +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr - datasets = Dataset.from_recipe(recipe, session) - print("Expected:", expected) - print("Got:", datasets) - assert all(ds.session == session for ds in datasets) - assert datasets == expected +@pytest.fixture +def siconca_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + rlut.touch() + return rlut -def test_from_ranges(): - dataset = Dataset(ensemble="r(1:2)i1p1f1") - expected = [ - Dataset(ensemble="r1i1p1f1"), - Dataset(ensemble="r2i1p1f1"), - ] - assert dataset.from_ranges() == expected +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + 
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
 
-def test_from_ranges():
-    dataset = Dataset(ensemble="r(1:2)i1p1f1")
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
 
-    expected = [
-        Dataset(ensemble="r1i1p1f1"),
-        Dataset(ensemble="r2i1p1f1"),
-    ]
-    assert dataset.from_ranges() == expected
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
 
-def test_expand_ensemble():
-    dataset = Dataset(ensemble="r(1:2)i(2:3)p(3:4)")
+    expected_required_dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        frequency="mon",
+        long_name="TOA Longwave Cloud Radiative Effect",
+        modeling_realm=["atmos"],
+        original_short_name="lwcre",
+        standard_name="",
+        units="W m-2",
+    )
 
-    expanded = dataset._expand_range("ensemble")
+    expected_required_dataset.supplementaries = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
-    ensembles = [
-        "r1i2p3",
-        "r1i2p4",
-        "r1i3p3",
-        "r1i3p4",
-        "r2i2p3",
-        "r2i2p4",
-        "r2i3p3",
-        "r2i3p4",
     ]
-    assert expanded == ensembles
+    expected_required_dataset.session = session
 
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == [expected_required_dataset]
+    assert required_datasets[0].files == [lwcre_file]
 
-def test_expand_subexperiment():
-    dataset = Dataset(sub_experiment="s(1998:2005)")
-    expanded = dataset._expand_range("sub_experiment")
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_derivation_glob(
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when files of the derived variable exist."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
 
-    subexperiments = [
-        "s1998",
-        "s1999",
-        "s2000",
-        "s2001",
-        "s2002",
-        "s2003",
-        "s2004",
-        "s2005",
-    ]
+    datasets = list(dataset.from_files())
 
-    assert expanded == subexperiments
+    expected_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+        ),
+        Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True),
+    ]
+    for expected_ds in expected_datasets:
+        expected_ds.add_supplementary(short_name="pr", type="sat")
+        expected_ds.session = session
+    assert datasets == expected_datasets
+    assert datasets[0].files == [lwcre_file_ground]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+    assert datasets[1].files == [lwcre_file]
+    assert datasets[1].supplementaries[0].files == [pr_file]
 
-def test_expand_ensemble_list_ok():
-    dataset = Dataset(ensemble=["r0i0p0", "r1i1p1"])
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+    ]
-    expected = [["r0i0p0", "r1i1p1"]]
+    for expected_ds in expected_required_datasets:
+        expected_ds.supplementaries = [
+            Dataset(
+                **OBS6_SAT_FACETS,
+                short_name="pr",
+                derive=False,
+                frequency="mon",
+                long_name="Precipitation",
+                modeling_realm=["atmos"],
+                original_short_name="pr",
+                standard_name="precipitation_flux",
+                units="kg m-2 s-1",
+            ),
+        ]
+        expected_ds.session = session
 
-    assert dataset._expand_range("ensemble") == expected
+    for dataset, expected in zip(
+        datasets,
+        expected_required_datasets,
+        strict=True,
+    ):
+        assert dataset.required_datasets == [expected]
+    assert datasets[0].required_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].required_datasets[0].files == [lwcre_file]
 
-def test_expand_ensemble_nolist():
-    dataset = Dataset(
-        dataset="XYZ",
-        ensemble=["r1i1p1", "r(1:2)i1p1"],
-    )
+def test_from_files_with_derived(rlut_file, rlutcs_file, session):
+    """Test `from_files` when only the input variables for derivation exist."""
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
 
-    with pytest.raises(RecipeError):
-        dataset._expand_range("ensemble")
+    datasets = list(dataset.from_files())
 
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == []
 
-def create_esgf_file(timerange, version):
-    """Prepare some fake ESGF search results."""
-    json = {
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_unavailable_years(
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` when no input files exist for the requested years."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob(
+    timerange,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when only the input variables exist."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_glob_differing_timerange(
+    rlut_file_future,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with a timerange glob and differing input timeranges."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "*"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "*"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr", timerange="1980/2000")
+    expected.session = session
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "*"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "*"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+
+
+def test_from_files_with_derived_no_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` when the derived variable and its inputs all exist."""
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        frequency="mon",
+        long_name="TOA Longwave Cloud Radiative Effect",
+        modeling_realm=["atmos"],
+        original_short_name="lwcre",
+        standard_name="",
+        units="W m-2",
+    )
+    expected_required_dataset.supplementaries = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    expected_required_dataset.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == [expected_required_dataset]
+    assert required_datasets[0].files == [lwcre_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when the derived variable and its inputs exist."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+        ),
+        Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True),
+    ]
+    for expected_ds in expected_datasets:
+        expected_ds.add_supplementary(short_name="pr", type="sat")
+        expected_ds.session = session
+
+    assert datasets == expected_datasets
+    assert datasets[0].files == [lwcre_file_ground]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+    assert datasets[1].files == [lwcre_file]
+    assert datasets[1].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.supplementaries = [
+            Dataset(
+                **OBS6_SAT_FACETS,
+                short_name="pr",
+                derive=False,
+                frequency="mon",
+                long_name="Precipitation",
+                modeling_realm=["atmos"],
+                original_short_name="pr",
+                standard_name="precipitation_flux",
+                units="kg m-2 s-1",
+            ),
+        ]
+        expected_ds.session = session
+
+    for dataset, expected in zip(
+        datasets,
+        expected_required_datasets,
+        strict=True,
+    ):
+        assert dataset.required_datasets == [expected]
+    assert datasets[0].required_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].required_datasets[0].files == [lwcre_file]
+
+
+def test_from_files_with_derived_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` when derivation is forced despite existing files."""
+    dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when derivation is forced."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+class DerivedVariable(DerivedVariableBase):
+    """Derivation of dummy variable."""
+
+    @staticmethod
+    def required(project):
+        """Declare the variables needed for derivation."""
+        return [
+            {"short_name": "rlut", "optional": True},
+            {"short_name": "rlutcs"},
+            {"short_name": "pr"},
+        ]
+
+
+def test_from_files_with_derived_optional(
+    monkeypatch,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` when an optional input variable has no files."""
+    monkeypatch.setattr(
+        esmvalcore.preprocessor._derive,
+        "ALL_DERIVED_VARIABLES",
+        {"tas": DerivedVariable},
+    )
+    dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="tas",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="tas",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            optional=True,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == [rlutcs_file]
+    assert required_datasets[2].files == [pr_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob_optional(
+    timerange,
+    monkeypatch,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when an optional input variable has no files."""
+    monkeypatch.setattr(
+        esmvalcore.preprocessor._derive,
+        "ALL_DERIVED_VARIABLES",
+        {"tas": DerivedVariable},
+    )
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="tas",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="tas",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            optional=True,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == [rlutcs_file]
+    assert required_datasets[2].files == [pr_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob_optional_missing(
+    timerange,
+    monkeypatch,
+    rlut_file,
+    session,
+):
+    """Test `from_files` with globs when all input variables are missing."""
+    monkeypatch.setattr(
+        esmvalcore.preprocessor._derive,
+        "ALL_DERIVED_VARIABLES",
+        {"tas": DerivedVariable},
+    )
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="tas",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="tas",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+            short_name="rlut",
+            derive=False,
+            optional=True,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+    assert required_datasets[2].files == []
+
+
+def test_from_files_with_derived_only_optional(siconca_file, pr_file, session):
+    """Test `from_files` for a variable derived from purely optional inputs."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "mip": "SImon"},
+        short_name="siextent",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr", mip="Amon")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "mip": "SImon"},
+        short_name="siextent",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr", mip="Amon")
expected.add_supplementary(short_name="pr", mip="Amon") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="sic", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Ocean Grid)", + modeling_realm=["seaIce"], + original_short_name="siconc", + standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [siconca_file] + + +def test_from_files_with_derived_only_optional_glob_fail(session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon", "type": "*"}, + short_name="siextent", + derive=True, + ) + dataset.add_supplementary(short_name="pr", mip="Amon") + dataset.session = session + + msg = r"Using wildcards to derive .* is not possible" + with pytest.raises(RecipeError, match=msg): + next(dataset.from_files()) + + +def test_match(): + dataset1 = Dataset( + short_name="areacella", + ensemble=["r1i1p1f1"], + exp="historical", + modeling_realm=["atmos", "land"], + ) + dataset2 = Dataset( + short_name="tas", + ensemble="r1i1p1f1", + exp=["historical", "ssp585"], + modeling_realm=["atmos"], + ) + + score = dataset1._match(dataset2) + assert score == 3 + + +def test_remove_duplicate_supplementaries(): + dataset = Dataset( + dataset="dataset1", + short_name="tas", + mip="Amon", + project="CMIP6", + exp="historical", + ) + supplementary1 = dataset.copy(short_name="areacella") + supplementary2 = supplementary1.copy() + supplementary1.facets["exp"] = "1pctCO2" + dataset.supplementaries = [supplementary1, supplementary2] + + dataset._remove_duplicate_supplementaries() + + assert len(dataset.supplementaries) == 1 + assert dataset.supplementaries[0] == supplementary2 + + +def test_remove_not_found_supplementaries(): + dataset = Dataset( + dataset="dataset1", + short_name="tas", + mip="Amon", + project="CMIP6", + exp="historical", + ) + dataset.add_supplementary(short_name="areacella", mip="fx", exp="*") + dataset._remove_unexpanded_supplementaries() + + assert len(dataset.supplementaries) == 0 + + +def test_concatenating_historical_and_future_exps(mocker): + mocker.patch.object(Dataset, "files", True) + dataset = Dataset( + dataset="dataset1", + short_name="tas", + mip="Amon", + frequency="mon", + project="CMIP6", + exp=["historical", "ssp585"], + ) + dataset.add_supplementary(short_name="areacella", mip="fx", frequency="fx") + dataset._fix_fx_exp() + + assert len(dataset.supplementaries) == 1 + assert dataset.facets["exp"] == ["historical", "ssp585"] + assert dataset.supplementaries[0].facets["exp"] == "historical" + + +def test_from_recipe_with_glob(tmp_path: Path, session: Session) -> None: + recipe_txt = textwrap.dedent(""" + + diagnostics: + diagnostic1: + variables: + tas: + project: CMIP5 + mip: Amon + exp: rcp85 + 
ensemble: r1i1p1 + additional_datasets: + - {dataset: '*', institute: '*'} + """) + recipe = tmp_path / "recipe_test.yml" + recipe.write_text(recipe_txt, encoding="utf-8") + + filenames = [ + "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" + "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", + "cmip5/output1/NIMR-KMA/HadGEM2-AO/rcp85/mon/atmos/Amon/r1i1p1/" + "v20130815/tas_Amon_HadGEM2-AO_rcp85_r1i1p1_186001-200512.nc", + ] + for filename in filenames: + path = tmp_path / filename + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("") + + definitions: list[Facets] = [ + { + "diagnostic": "diagnostic1", + "variable_group": "tas", + "dataset": "CSIRO-Mk3-6-0", + "project": "CMIP5", + "mip": "Amon", + "short_name": "tas", + "alias": "CSIRO-Mk3-6-0", + "recipe_dataset_index": 0, + "exp": "rcp85", + "ensemble": "r1i1p1", + "institute": "CSIRO-QCCCE", + }, + { + "diagnostic": "diagnostic1", + "variable_group": "tas", + "dataset": "HadGEM2-AO", + "project": "CMIP5", + "mip": "Amon", + "short_name": "tas", + "alias": "HadGEM2-AO", + "recipe_dataset_index": 1, + "exp": "rcp85", + "ensemble": "r1i1p1", + "institute": "NIMR-KMA", + }, + ] + expected = [] + for facets in definitions: + dataset = Dataset(**facets) + dataset.session = session + expected.append(dataset) + + datasets = Dataset.from_recipe(recipe, session) + print("Expected:", expected) + print("Got:", datasets) + assert all(ds.session == session for ds in datasets) + assert datasets == expected + + +def test_from_ranges(): + dataset = Dataset(ensemble="r(1:2)i1p1f1") + expected = [ + Dataset(ensemble="r1i1p1f1"), + Dataset(ensemble="r2i1p1f1"), + ] + assert dataset.from_ranges() == expected + + +def test_expand_ensemble(): + dataset = Dataset(ensemble="r(1:2)i(2:3)p(3:4)") + + expanded = dataset._expand_range("ensemble") + + ensembles = [ + "r1i2p3", + "r1i2p4", + "r1i3p3", + "r1i3p4", + "r2i2p3", + "r2i2p4", + "r2i3p3", + "r2i3p4", + ] + assert expanded == ensembles + + +def test_expand_subexperiment(): + dataset = Dataset(sub_experiment="s(1998:2005)") + + expanded = dataset._expand_range("sub_experiment") + + subexperiments = [ + "s1998", + "s1999", + "s2000", + "s2001", + "s2002", + "s2003", + "s2004", + "s2005", + ] + + assert expanded == subexperiments + + +def test_expand_ensemble_list_ok(): + dataset = Dataset(ensemble=["r0i0p0", "r1i1p1"]) + + expected = [["r0i0p0", "r1i1p1"]] + + assert dataset._expand_range("ensemble") == expected + + +def test_expand_ensemble_nolist(): + dataset = Dataset( + dataset="XYZ", + ensemble=["r1i1p1", "r(1:2)i1p1"], + ) + + with pytest.raises(RecipeError): + dataset._expand_range("ensemble") + + +def create_esgf_file(timerange, version): + """Prepare some fake ESGF search results.""" + json = { "dataset_id": "CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3.historical" f".r1i1p1f1.Amon.tas.gr.{version}|esgf-data1.llnl.gov", "dataset_id_template_": [ @@ -1766,7 +2817,7 @@ def test_find_files_outdated_local( assert dataset.files == expected -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.io.local.LocalFile("/path/to/v1/tas.nc") @@ -1782,6 +2833,47 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +def test_set_version_derived_var(monkeypatch, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + 
+    areacella_file.facets["version"] = "v4"
+    dataset.supplementaries[0].files = [areacella_file]
+
+    def _get_required_datasets():
+        rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc")
+        rlut_file.facets["version"] = "v1"
+        rlut_dataset = Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+        )
+        rlut_dataset.files = [rlut_file]
+        rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc")
+        rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc")
+        rlutcs_file_1.facets["version"] = "v2"
+        rlutcs_file_2.facets["version"] = "v3"
+        rlutcs_dataset = Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+        )
+        rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2]
+        return [rlut_dataset, rlutcs_dataset]
+
+    monkeypatch.setattr(
+        dataset,
+        "_get_required_datasets",
+        _get_required_datasets,
+    )
+
+    dataset.set_version()
+
+    assert dataset.facets["version"] == ["v1", "v2", "v3"]
+    assert dataset.supplementaries[0].facets["version"] == "v4"
+
+
 @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"])
 def test_update_timerange_from_esgf(mocker, timerange):
     esgf_files = [
@@ -1853,9 +2945,8 @@ def test_update_timerange_no_files(session, search_data):
     }
     dataset = Dataset(**variable)
     dataset.files = []
-    msg = r"Missing data for Dataset: tas, Amon, CMIP6, HadGEM3-GC31-LL.*"
-    with pytest.raises(InputFilesNotFound, match=msg):
-        dataset._update_timerange()
+    dataset._update_timerange()
+    assert "timerange" not in dataset.facets
 
 
 def test_update_timerange_typeerror():
@@ -2296,16 +3387,6 @@ def test_get_extra_facets_native6():
     }
 
 
-OBS6_SAT_FACETS: Facets = {
-    "project": "OBS6",
-    "dataset": "SAT",
-    "mip": "Amon",
-    "tier": 2,
-    "type": "sat",
-    "timerange": "1980/2000",
-}
-
-
 def test_is_derived_no_derivation():
     dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas")
     assert dataset._is_derived() is False
@@ -2358,6 +3439,15 @@
     assert dataset._derivation_necessary() is True
 
 
+def test_derivation_necessary_no_force_derivation_no_files_glob(session):
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "*"},
+        short_name="lwcre",
+        derive=True,
+    )
+    assert dataset._derivation_necessary() is True
+
+
 def test_derivation_necessary_no_force_derivation(tmp_path, session):
     dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
     dataset.session = session
@@ -2431,3 +3521,67 @@
         force_derivation=True,
     )
     assert dataset.supplementaries[0] == expected_supplementary
+
+
+def test_required_datasets_derivation(session):
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+
+    expected_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_dataset in expected_datasets:
+        expected_dataset.session = dataset.session
+
+    assert dataset.required_datasets == expected_datasets
+
+
+def test_required_datasets_no_derivation():
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas")
+    dataset.add_supplementary(short_name="pr")
+
+    assert dataset.required_datasets == [dataset]
+
+
+def test_required_datasets_no_force_derivation(tmp_path, session):
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    lwcre_file = esmvalcore.local.LocalFile(
+        input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc",
+    )
+    lwcre_file.touch()
+
+    assert dataset.required_datasets == [dataset]
+
+
+def test_required_datasets_no_derivation_available(session):
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True)
+
+    msg = r"Cannot derive variable 'tas': no derivation script available"
+    with pytest.raises(NotImplementedError, match=msg):
+        dataset.required_datasets  # noqa: B018