diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 1aae4b6aef..b630994682 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -37,7 +37,6 @@ from esmvalcore._task import TaskSet from esmvalcore.dataset import Dataset - from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -504,20 +503,6 @@ def valid_time_selection(timerange: str) -> None: _check_timerange_values(_format_years(date), timerange_list) -def differing_timeranges( - timeranges: set[str], - required_vars: list[Facets], -) -> None: - """Log error if required variables have differing timeranges.""" - if len(timeranges) > 1: - msg = ( - f"Differing timeranges with values {timeranges} " - f"found for required variables {required_vars}. " - "Set `timerange` to a common value." - ) - raise ValueError(msg) - - def _check_literal( settings: dict, *, diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 38f48fc663..d54d35fba5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -52,8 +52,7 @@ from . import check from .from_datasets import datasets_to_recipe from .to_datasets import ( - _derive_needed, - _get_input_datasets, + _get_required_datasets, _representative_datasets, ) @@ -251,7 +250,7 @@ def _get_default_settings(dataset: Dataset) -> PreprocessorSettings: settings = {} - if _derive_needed(dataset): + if dataset._derivation_necessary(): # noqa: SLF001 (will be replaced soon) settings["derive"] = { "short_name": facets["short_name"], "standard_name": facets["standard_name"], @@ -622,21 +621,26 @@ def _allow_skipping(dataset: Dataset) -> bool: ) -def _set_version(dataset: Dataset, input_datasets: list[Dataset]) -> None: - """Set the 'version' facet based on derivation input datasets.""" - versions = set() - for in_dataset in input_datasets: - in_dataset.set_version() - if version := in_dataset.facets.get("version"): - if isinstance(version, list): - versions.update(version) - else: - versions.add(version) - if versions: - version = versions.pop() if len(versions) == 1 else sorted(versions) - dataset.set_facet("version", version) - for supplementary_ds in dataset.supplementaries: - supplementary_ds.set_version() +def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: + """Automatically correct the wrong ensemble for CMIP5 fx variables.""" + if ( + dataset.facets.get("project") == "CMIP5" + and dataset.facets.get("mip") == "fx" + and dataset.facets.get("ensemble") != "r0i0p0" + and not dataset.files + ): + original_ensemble = dataset["ensemble"] + copy = dataset.copy() + copy.facets["ensemble"] = "r0i0p0" + if copy.files: + dataset.facets["ensemble"] = "r0i0p0" + logger.info( + "Corrected wrong 'ensemble' from '%s' to '%s' for %s", + original_ensemble, + dataset["ensemble"], + dataset.summary(shorten=True), + ) + dataset.find_files() def _get_preprocessor_products( @@ -662,28 +666,29 @@ def _get_preprocessor_products( settings = _get_default_settings(dataset) _apply_preprocessor_profile(settings, profile) _update_multi_dataset_settings(dataset.facets, settings) + _fix_cmip5_fx_ensemble(dataset) _update_preproc_functions(settings, dataset, datasets, missing_vars) _add_dataset_specific_settings(dataset, settings) check.preprocessor_supplementaries(dataset, settings) - input_datasets = _get_input_datasets(dataset) - missing = _check_input_files(input_datasets) + required_datasets = _get_required_datasets(dataset) + missing = _check_input_files(required_datasets) if missing: if _allow_skipping(dataset): 
logger.info("Skipping: %s", missing) else: missing_vars.update(missing) continue - _set_version(dataset, input_datasets) + dataset.set_version() USED_DATASETS.append(dataset) - _schedule_for_download(input_datasets) - _log_input_files(input_datasets) + _schedule_for_download(required_datasets) + _log_input_files(required_datasets) logger.info("Found input files for %s", dataset.summary(shorten=True)) filename = _get_preprocessor_filename(dataset) product = PreprocessorFile( filename=filename, attributes=dataset.facets, settings=settings, - datasets=input_datasets, + datasets=required_datasets, ) products.add(product) diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 460b42fca5..baf0cb7246 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -13,7 +13,6 @@ from esmvalcore.exceptions import RecipeError from esmvalcore.io.esgf.facets import FACETS from esmvalcore.io.local import _replace_years_with_timerange -from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( PREPROCESSOR_SUPPLEMENTARIES, @@ -189,28 +188,6 @@ def _merge_supplementary_dicts( return list(merged.values()) -def _fix_cmip5_fx_ensemble(dataset: Dataset) -> None: - """Automatically correct the wrong ensemble for CMIP5 fx variables.""" - if ( - dataset.facets.get("project") == "CMIP5" - and dataset.facets.get("mip") == "fx" - and dataset.facets.get("ensemble") != "r0i0p0" - and not dataset.files - ): - original_ensemble = dataset["ensemble"] - copy = dataset.copy() - copy.facets["ensemble"] = "r0i0p0" - if copy.files: - dataset.facets["ensemble"] = "r0i0p0" - logger.info( - "Corrected wrong 'ensemble' from '%s' to '%s' for %s", - original_ensemble, - dataset["ensemble"], - dataset.summary(shorten=True), - ) - dataset.find_files() - - def _get_supplementary_short_names( facets: Facets, step: str, @@ -434,9 +411,7 @@ def datasets_from_recipe( return datasets -def _dataset_from_files( # noqa: C901 - dataset: Dataset, -) -> list[Dataset]: +def _dataset_from_files(dataset: Dataset) -> list[Dataset]: """Replace facet values of '*' based on available files.""" result: list[Dataset] = [] errors: list[str] = [] @@ -447,53 +422,25 @@ def _dataset_from_files( # noqa: C901 dataset.summary(shorten=True), ) - representative_datasets = _representative_datasets(dataset) - - # For derived variables, representative_datasets might contain more than - # one element - all_datasets: list[list[tuple[dict, Dataset]]] = [] - for representative_dataset in representative_datasets: - all_datasets.append([]) - for expanded_ds in representative_dataset.from_files(): - updated_facets = {} - unexpanded_globs = {} - for key, value in dataset.facets.items(): - if _isglob(value): - if key in expanded_ds.facets and not _isglob( - expanded_ds[key], - ): - updated_facets[key] = expanded_ds.facets[key] - else: - unexpanded_globs[key] = value - - if unexpanded_globs: - msg = _report_unexpanded_globs( - dataset, - expanded_ds, - unexpanded_globs, - ) - errors.append(msg) - continue - - new_ds = dataset.copy() - new_ds.facets.update(updated_facets) - new_ds.supplementaries = expanded_ds.supplementaries - - all_datasets[-1].append((updated_facets, new_ds)) - - # If globs have been expanded, only consider those datasets that contain - # all necessary input variables if derivation is necessary - for updated_facets, new_ds in all_datasets[0]: - other_facets = [[d[0] for d in ds] for ds in 
all_datasets[1:]] - if all(updated_facets in facets for facets in other_facets): - result.append(new_ds) - else: - logger.debug( - "Not all necessary input variables to derive '%s' are " - "available for dataset %s", - dataset["short_name"], - updated_facets, + # The magic happens in Dataset.from_files. Here, we simply check if any + # wildcards have not been expanded and raise proper errors if necessary. + for expanded_ds in dataset.from_files(): + unexpanded_globs = {} + for key, value in dataset.facets.items(): + if _isglob(value): + if key not in expanded_ds.facets or _isglob(expanded_ds[key]): + unexpanded_globs[key] = value + + if unexpanded_globs: + msg = _report_unexpanded_globs( + dataset, + expanded_ds, + unexpanded_globs, ) + errors.append(msg) + continue + + result.append(expanded_ds) if errors: raise RecipeError("\n".join(errors)) @@ -538,66 +485,33 @@ def _report_unexpanded_globs( return msg -def _derive_needed(dataset: Dataset) -> bool: - """Check if dataset needs to be derived from other datasets.""" - if not dataset.facets.get("derive"): - return False - if dataset.facets.get("force_derivation"): - return True - if _isglob(dataset.facets.get("timerange", "")): - # Our file finding routines are not able to handle globs. - dataset = dataset.copy() - dataset.facets.pop("timerange") - - copy = dataset.copy() - copy.supplementaries = [] - return not copy.files +def _get_required_datasets(dataset: Dataset) -> list[Dataset]: + """Determine the datasets required for deriving `dataset`.""" + if not dataset._derivation_necessary(): # noqa: SLF001 + return dataset.required_datasets - -def _get_input_datasets(dataset: Dataset) -> list[Dataset]: - """Determine the input datasets needed for deriving `dataset`.""" - facets = dataset.facets - if not _derive_needed(dataset): - _fix_cmip5_fx_ensemble(dataset) - return [dataset] - - # Configure input datasets needed to derive variable - datasets = [] - required_vars = get_required(facets["short_name"], facets["project"]) # type: ignore - # idea: add option to specify facets in list of dicts that is value of - # 'derive' in the recipe and use that instead of get_required? - for input_facets in required_vars: - input_dataset = dataset.copy() - keep = {"alias", "recipe_dataset_index", *dataset.minimal_facets} - input_dataset.facets = { - k: v for k, v in input_dataset.facets.items() if k in keep - } - input_dataset.facets.update(input_facets) - input_dataset.augment_facets() - _fix_cmip5_fx_ensemble(input_dataset) - if input_facets.get("optional") and not input_dataset.files: + # Skip optional datasets if no data is available + required_datasets: list[Dataset] = [] + for required_dataset in dataset.required_datasets: + if ( + required_dataset.facets.get("optional") + and not required_dataset.files + ): logger.info( "Skipping: no data found for %s which is marked as 'optional'", - input_dataset, + required_dataset, ) else: - datasets.append(input_dataset) + required_datasets.append(required_dataset) - # Check timeranges of available input data. 
- timeranges: set[str] = set() - for input_dataset in datasets: - if "timerange" in input_dataset.facets: - timeranges.add(input_dataset.facets["timerange"]) # type: ignore - check.differing_timeranges(timeranges, required_vars) - - return datasets + return required_datasets def _representative_datasets(dataset: Dataset) -> list[Dataset]: """Find representative datasets for all input variables.""" copy = dataset.copy() copy.supplementaries = [] - representative_datasets = _get_input_datasets(copy) + representative_datasets = _get_required_datasets(copy) for representative_dataset in representative_datasets: representative_dataset.supplementaries = dataset.supplementaries return representative_datasets diff --git a/esmvalcore/config/_validated_config.py b/esmvalcore/config/_validated_config.py index 505434c419..7a5af93224 100644 --- a/esmvalcore/config/_validated_config.py +++ b/esmvalcore/config/_validated_config.py @@ -60,9 +60,9 @@ class ValidatedConfig(MutableMapping): """ # validate values on the way in - def __init__(self, *args, **kwargs): + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__() - self._mapping = {} + self._mapping: dict[str, Any] = {} self.update(*args, **kwargs) def __setitem__(self, key, val): diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 41690dd8a9..6d826b7ca2 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -33,6 +33,7 @@ from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.io.local import _dates_to_timerange from esmvalcore.preprocessor import _get_preprocessor_filename, preprocess +from esmvalcore.preprocessor._derive import get_required if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence @@ -104,7 +105,7 @@ class Dataset: Attributes ---------- - supplementaries : list[Dataset] + supplementaries: list[Dataset] List of supplementary datasets. facets: :obj:`esmvalcore.typing.Facets` Facets describing the dataset. 
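The hunks below introduce the `Dataset.required_datasets` property that replaces the derivation helpers removed from `esmvalcore/_recipe/to_datasets.py` above. A minimal usage sketch, assembled from the property docstring and the lwcre notebook example later in this diff (the facet values are illustrative only, not prescriptive):

    from esmvalcore.dataset import Dataset

    # lwcre (longwave cloud radiative effect) is a derived variable whose
    # derivation requires rlut and rlutcs. With force_derivation=True, the
    # property is expected to return one Dataset per required input variable.
    dataset = Dataset(
        project="CMIP6",
        dataset="CESM2",
        exp="historical",
        ensemble="r1i1p1f1",
        grid="gn",
        mip="Amon",
        short_name="lwcre",
        derive=True,
        force_derivation=True,
    )
    for required in dataset.required_datasets:
        print(required["short_name"])  # expected: rlut, rlutcs

For a non-derived variable, or when files for the variable itself are found and `force_derivation` is not set, `required_datasets` is simply `[dataset]`.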
@@ -136,6 +137,7 @@ def __init__(self, **facets: FacetValue) -> None: self._session: Session | None = None self._files: Sequence[DataElement] | None = None self._used_data_sources: Sequence[DataSource] = [] + self._required_datasets: list[Dataset] | None = None for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -180,43 +182,103 @@ def _is_force_derived(self) -> bool: def _derivation_necessary(self) -> bool: """Return ``True`` if derivation is necessary, ``False`` otherwise.""" - # If variable cannot be derived, derivation is not necessary - if not self._is_derived(): - return False + return not ( + self.required_datasets and self.required_datasets[0] is self + ) - # If forced derivation is requested, derivation is necessary - if self._is_force_derived(): - return True + def _get_required_datasets(self) -> list[Dataset]: + """Get required datasets for derivation.""" + required_datasets: list[Dataset] = [] + required_vars_facets = get_required( + self.facets["short_name"], # type: ignore + self.facets["project"], # type: ignore + ) + + for required_facets in required_vars_facets: + required_dataset = self._copy(derive=False, force_derivation=False) + keep = {"alias", "recipe_dataset_index", *self.minimal_facets} + required_dataset.facets = { + k: v for k, v in required_dataset.facets.items() if k in keep + } + required_dataset.facets.update(required_facets) + required_dataset.augment_facets() + required_datasets.append(required_dataset) + + return required_datasets + + @property + def required_datasets(self) -> list[Dataset]: + """Get required datasets. + + For non-derived variables (i.e., those without a ``derive`` facet or + with facet ``derive=False``), this will simply return the dataset + itself in a list. + + For derived variables (i.e., those with facet ``derive=True``), this + will return the datasets required for derivation if derivation is + necessary, and the dataset itself if derivation is not necessary. + Derivation is necessary if the facet ``force_derivation=True`` is set + or no files for the dataset itself are available. - # Otherwise, derivation is necessary of no files for the self dataset - # are found - ds_copy = self.copy() - ds_copy.supplementaries = [] - return not ds_copy.files + See also :func:`esmvalcore.preprocessor.derive` for an example usage. 
+ + """ + if self._required_datasets is not None: + return self._required_datasets + + def _derivation_needed(dataset: Dataset) -> bool: + """Check if derivation is nedeed.""" + # If variable cannot be derived, derivation is not necessary + if not dataset._is_derived(): + return False + + # If forced derivation is requested, derivation is necessary + if dataset._is_force_derived(): + return True + + # Otherwise, derivation is necessary if no files for the self + # dataset are found + ds_copy = dataset.copy() + ds_copy.supplementaries = [] + + # Avoid potential errors from missing data during timerange glob + # expansion + if _isglob(ds_copy.facets.get("timerange", "")): + ds_copy.facets.pop("timerange", None) + + return not ds_copy.files + + if not _derivation_needed(self): + self._required_datasets = [self] + else: + self._required_datasets = self._get_required_datasets() + return self._required_datasets + + @staticmethod def _file_to_dataset( - self, + dataset: Dataset, file: DataElement, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) - if "version" not in self.facets: + if "version" not in dataset.facets: # Remove version facet if no specific version requested facets.pop("version", None) updated_facets = { f: v for f, v in facets.items() - if f in self.facets - and _isglob(self.facets[f]) - and _ismatch(v, self.facets[f]) + if f in dataset.facets + and _isglob(dataset.facets[f]) + and _ismatch(v, dataset.facets[f]) } - dataset = self.copy() - dataset.facets.update(updated_facets) + new_dataset = dataset.copy() + new_dataset.facets.update(updated_facets) # If possible, remove unexpanded facets that can be automatically # populated. - unexpanded = {f for f, v in dataset.facets.items() if _isglob(v)} + unexpanded = {f for f, v in new_dataset.facets.items() if _isglob(v)} required_for_augment = { "project", "mip", @@ -225,60 +287,139 @@ def _file_to_dataset( "dataset", } if unexpanded and not unexpanded & required_for_augment: - copy = dataset.copy() + copy = new_dataset.copy() copy.supplementaries = [] for facet in unexpanded: copy.facets.pop(facet) copy.augment_facets() for facet in unexpanded: if facet in copy.facets: - dataset.facets.pop(facet) + new_dataset.facets.pop(facet) - return dataset + return new_dataset - def _get_available_datasets(self) -> Iterator[Dataset]: + @staticmethod + def _get_expanded_globs( + dataset_with_globs: Dataset, + dataset_with_expanded_globs: Dataset, + ) -> tuple[tuple[str, FacetValue], ...]: + """Get facets that have been updated by expanding globs.""" + expanded_globs: dict[str, FacetValue] = {} + for key, value in dataset_with_globs.facets.items(): + if ( + _isglob(value) + and key in dataset_with_expanded_globs.facets + and not _isglob(dataset_with_expanded_globs[key]) + ): + expanded_globs[key] = dataset_with_expanded_globs[key] + return tuple(expanded_globs.items()) + + @staticmethod + def _get_all_available_datasets(dataset: Dataset) -> Iterator[Dataset]: + """Yield datasets based on the available files. + + This function requires that dataset.facets['mip'] is not a glob + pattern. + + Does take variable derivation into account, i.e., datasets available + through variable derivation are returned. 
+ + """ + if not dataset._derivation_necessary(): + yield from Dataset._get_available_datasets(dataset) + return + + # Since we are in full control of the derived variables (the module is + # private; no custom derivation functions are possible), we can be sure + # that the following list is never empty + non_optional_datasets = [ + d + for d in dataset.required_datasets + if not d.facets.get("optional", False) + ] + if not non_optional_datasets: + msg = ( + f"Using wildcards to derive {dataset.summary(shorten=True)} " + f"is not possible, derivation function only requires optional " + f"variables" + ) + raise RecipeError(msg) + + # Record all expanded globs from first non-optional required dataset + # (called "reference_dataset" hereafter) + reference_dataset = non_optional_datasets[0] + reference_expanded_globs = { + Dataset._get_expanded_globs(dataset, ds) + for ds in Dataset._get_available_datasets(reference_dataset) + } + + # Iterate through all other non-optional required datasets and only + # keep those expanded globs which are present for all other + # non-optional required datasets + for required_dataset in non_optional_datasets: + if required_dataset is reference_dataset: + continue + new_expanded_globs = { + Dataset._get_expanded_globs(dataset, ds) + for ds in Dataset._get_available_datasets(required_dataset) + } + reference_expanded_globs &= new_expanded_globs + + # Use the final expanded globs to create new dataset(s) + for expanded_globs in reference_expanded_globs: + new_ds = dataset.copy() + new_ds.facets.update(dict(expanded_globs)) + yield new_ds + + @staticmethod + def _get_available_datasets(dataset: Dataset) -> Iterator[Dataset]: """Yield datasets based on the available files. This function requires that self.facets['mip'] is not a glob pattern. + + Does not take variable derivation into account, i.e., datasets + potentially available through variable derivation are ignored. To + consider derived variables properly, use the function + :func:`_get_all_available_datasets`. + """ - dataset_template = self.copy() + dataset_template = dataset.copy() dataset_template.supplementaries = [] seen = set() partially_defined = [] expanded = False for file in dataset_template.files: - dataset = self._file_to_dataset(file) - # Do not use the timerange facet from the file because there may be multiple - # files per dataset. - dataset.facets.pop("timerange", None) + new_dataset = Dataset._file_to_dataset(dataset, file) + # Do not use the timerange facet from the file because there may be + # multiple files per dataset. + new_dataset.facets.pop("timerange", None) # Restore the original timerange facet if it was specified. 
- if "timerange" in self.facets: - dataset.facets["timerange"] = self.facets["timerange"] + if "timerange" in dataset.facets: + new_dataset.facets["timerange"] = dataset.facets["timerange"] # Filter out identical datasets facetset = frozenset( (f, frozenset(v) if isinstance(v, list) else v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() ) if facetset not in seen: seen.add(facetset) if any( _isglob(v) - for f, v in dataset.facets.items() + for f, v in new_dataset.facets.items() if f != "timerange" ): - partially_defined.append((dataset, file)) + partially_defined.append((new_dataset, file)) else: - dataset._update_timerange() # noqa: SLF001 - dataset._supplementaries_from_files() # noqa: SLF001 + new_dataset._update_timerange() # noqa: SLF001 expanded = True - yield dataset + yield new_dataset # Only yield datasets with globs if there is no better alternative - for dataset, file in partially_defined: + for new_dataset, file in partially_defined: msg = ( - f"{dataset} with unexpanded wildcards, created from file " + f"{new_dataset} with unexpanded wildcards, created from file " f"{file} with facets {file.facets}. Please check why " "the missing facets are not available for the file." "This will depend on the data source they come from, e.g. can " @@ -293,7 +434,7 @@ def _get_available_datasets(self) -> Iterator[Dataset]: "because it still contains wildcards.", msg, ) - yield dataset + yield new_dataset def from_files(self) -> Iterator[Dataset]: """Create datasets based on the available files. @@ -317,6 +458,10 @@ def from_files(self) -> Iterator[Dataset]: Supplementary datasets will in inherit the facet values from the main dataset for those facets listed in :obj:`INHERITED_FACETS`. + This also works for :ref:`derived variables `. The + datasets required for derivation can be accessed via + :attr:`Dataset.required_datasets`. + Examples -------- See :doc:`/notebooks/discovering-data` notebook for example use cases. @@ -326,52 +471,66 @@ def from_files(self) -> Iterator[Dataset]: Dataset Datasets representing the available files. 
""" + # No wildcards present -> simply return self with expanded + # supplementaries + if not any(_isglob(v) for v in self.facets.values()): + self._supplementaries_from_files() + yield self + return + + # Wildcards present -> expand them expanded = False - if any(_isglob(v) for v in self.facets.values()): - if _isglob(self.facets["mip"]): - available_mips = _get_mips( - self.facets["project"], # type: ignore - self.facets["short_name"], # type: ignore + if _isglob(self.facets["mip"]): + available_mips = _get_mips( + self.facets["project"], # type: ignore + self.facets["short_name"], # type: ignore + ) + mips = [ + mip + for mip in available_mips + if _ismatch(mip, self.facets["mip"]) + ] + else: + mips = [self.facets["mip"]] # type: ignore + + for mip in mips: + if _isglob(self.facets.get("branding_suffix", "")): + available_branding_suffixes = _get_branding_suffixes( + project=self.facets["project"], # type: ignore[arg-type] + mip=mip, + short_name=self.facets["short_name"], # type: ignore[arg-type] ) - mips = [ - mip - for mip in available_mips - if _ismatch(mip, self.facets["mip"]) + branding_suffixes = [ + branding_suffix + for branding_suffix in available_branding_suffixes + if _ismatch( + branding_suffix, + self.facets["branding_suffix"], + ) + ] + dataset_templates = [ + self.copy(mip=mip, branding_suffix=branding_suffix) + for branding_suffix in branding_suffixes ] else: - mips = [self.facets["mip"]] # type: ignore - - for mip in mips: - if _isglob(self.facets.get("branding_suffix", "")): - available_branding_suffixes = _get_branding_suffixes( - project=self.facets["project"], # type: ignore[arg-type] - mip=mip, - short_name=self.facets["short_name"], # type: ignore[arg-type] - ) - branding_suffixes = [ - branding_suffix - for branding_suffix in available_branding_suffixes - if _ismatch( - branding_suffix, - self.facets["branding_suffix"], - ) - ] - dataset_templates = [ - self.copy(mip=mip, branding_suffix=branding_suffix) - for branding_suffix in branding_suffixes - ] - else: - dataset_templates = [self.copy(mip=mip)] - for dataset_template in dataset_templates: - for dataset in dataset_template._get_available_datasets(): # noqa: SLF001 - expanded = True - yield dataset + dataset_templates = [self.copy(mip=mip)] + for dataset_template in dataset_templates: + for dataset in self._get_all_available_datasets( + dataset_template, + ): + dataset._supplementaries_from_files() # noqa: SLF001 + expanded = True + yield dataset + # If files were found, or the file facets didn't match the + # specification, yield the original, but do expand any supplementary + # globs. For derived variables, make sure to purge any files found for + # required variables; those won't match in their facets. if not expanded: - # If the definition contains no wildcards, no files were found, - # or the file facets didn't match the specification, yield the - # original, but do expand any supplementary globs. 
self._supplementaries_from_files() + if self._derivation_necessary(): + for required_dataset in self.required_datasets: + required_dataset.files = [] yield self def _supplementaries_from_files(self) -> None: @@ -638,15 +797,29 @@ def minimal_facets(self) -> Facets: """Return a dictionary with the persistent facets.""" return {k: v for k, v in self.facets.items() if k in self._persist} + @staticmethod + def _get_version(dataset: Dataset) -> str | list[str]: + """Get available version(s) of dataset.""" + versions: set[str] = set() + for file in dataset.files: + if "version" in file.facets: + versions.add(str(file.facets["version"])) + return versions.pop() if len(versions) == 1 else sorted(versions) + def set_version(self) -> None: """Set the ``'version'`` facet based on the available data.""" versions: set[str] = set() - for file in self.files: - if "version" in file.facets: - versions.add(file.facets["version"]) # type: ignore + for required_dataset in self.required_datasets: + version = self._get_version(required_dataset) + if version: + if isinstance(version, list): + versions.update(version) + else: + versions.add(version) version = versions.pop() if len(versions) == 1 else sorted(versions) if version: self.set_facet("version", version) + for supplementary_ds in self.supplementaries: supplementary_ds.set_version() @@ -1007,8 +1180,9 @@ def _update_timerange(self) -> None: dataset = self.copy() dataset.facets.pop("timerange") dataset.supplementaries = [] - check.data_availability(dataset) - if all("timerange" in f.facets for f in dataset.files): + if dataset.files and all( + "timerange" in f.facets for f in dataset.files + ): # "timerange" can only be reliably computed when all DataElements # provide it. intervals = [ diff --git a/esmvalcore/preprocessor/_derive/__init__.py b/esmvalcore/preprocessor/_derive/__init__.py index 3a25d1f9a4..3817b86bc7 100644 --- a/esmvalcore/preprocessor/_derive/__init__.py +++ b/esmvalcore/preprocessor/_derive/__init__.py @@ -13,6 +13,8 @@ from esmvalcore.preprocessor._units import convert_units if TYPE_CHECKING: + from collections.abc import Sequence + from cf_units import Unit from iris.cube import Cube @@ -77,7 +79,7 @@ def get_required(short_name: str, project: str) -> list[Facets]: def derive( - cubes: CubeList, + cubes: Sequence[Cube], short_name: str, long_name: str, units: str | Unit, @@ -88,8 +90,7 @@ def derive( Parameters ---------- cubes: - Includes all the needed variables for derivation defined in - :func:`get_required`. + Includes all the needed variables for derivation. short_name: short_name long_name: @@ -103,6 +104,38 @@ def derive( ------- iris.cube.Cube The new derived variable. + + Examples + -------- + Required variables for derivation can be obtained via + :attr:`esmvalcore.dataset.Dataset.required_datasets`. + + For example, to derive the longwave cloud radiative effect (LWCRE) for the + model CESM2, you can use: + + >>> from esmvalcore.dataset import Dataset + >>> from esmvalcore.preprocessor import derive + >>> dataset = Dataset( + ... project="CMIP6", + ... dataset="CESM2", + ... exp="historical", + ... ensemble="r1i1p1f1", + ... grid="gn", + ... timerange="2000/2014", + ... short_name="lwcre", + ... mip="Amon", + ... derive=True, + ... ) + >>> cubes = [d.load() for d in dataset.required_datasets] + >>> cube = derive( + ... cubes, + ... short_name="lwcre", + ... long_name="TOA Longwave Cloud Radiative Effect", + ... units="W m-2", + ... 
) + >>> print(cube.var_name) + lwcre # doctest: +SKIP + """ if short_name == cubes[0].var_name: return cubes[0] diff --git a/esmvalcore/preprocessor/_derive/amoc.py b/esmvalcore/preprocessor/_derive/amoc.py index 3607aa1d62..67b179f0dd 100644 --- a/esmvalcore/preprocessor/_derive/amoc.py +++ b/esmvalcore/preprocessor/_derive/amoc.py @@ -72,9 +72,7 @@ def calculate(cubes): f"Amoc calculation: {cube_orig} doesn't contain" f" atlantic_arctic_ocean." ) - raise ValueError( - msg, - ) + raise ValueError(msg) # 2: Remove the shallowest 500m to avoid wind driven mixed layer. depth_constraint = iris.Constraint(depth=lambda d: d >= 500.0) diff --git a/esmvalcore/preprocessor/_derive/siextent.py b/esmvalcore/preprocessor/_derive/siextent.py index 27aee25aec..5bd2ca82f1 100644 --- a/esmvalcore/preprocessor/_derive/siextent.py +++ b/esmvalcore/preprocessor/_derive/siextent.py @@ -20,8 +20,8 @@ class DerivedVariable(DerivedVariableBase): def required(project): # noqa: ARG004 """Declare the variables needed for derivation.""" return [ - {"short_name": "sic", "optional": "true"}, - {"short_name": "siconca", "optional": "true"}, + {"short_name": "sic", "optional": True}, + {"short_name": "siconca", "optional": True}, ] @staticmethod @@ -53,9 +53,7 @@ def calculate(cubes): "Derivation of siextent failed due to missing variables " "sic and siconca." ) - raise RecipeError( - msg, - ) from exc + raise RecipeError(msg) from exc ones = da.ones_like(sic) siextent_data = da.ma.masked_where(sic.lazy_data() < 15.0, ones) diff --git a/notebooks/discovering-data.ipynb b/notebooks/discovering-data.ipynb index d6c9001ef2..10676a8b72 100644 --- a/notebooks/discovering-data.ipynb +++ b/notebooks/discovering-data.ipynb @@ -13,14 +13,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "f0ccfe7f-c535-4606-99ce-be24960aece1", "metadata": {}, "outputs": [], "source": [ "from esmvalcore.config import CFG\n", - "from esmvalcore.dataset import Dataset\n", - "from esmvalcore.esgf import download" + "from esmvalcore.dataset import Dataset" ] }, { @@ -39,7 +38,33 @@ "metadata": {}, "outputs": [], "source": [ - "CFG[\"search_esgf\"] = \"always\"" + "CFG[\"search_data\"] = \"complete\"\n", + "CFG[\"projects\"].pop(\"CMIP6\", None) # Clear existing CMIP6 configuration\n", + "CFG.nested_update(\n", + " {\n", + " \"projects\": {\n", + " \"CMIP6\": {\n", + " \"data\": {\n", + " \"intake-esgf\": {\n", + " \"type\": \"esmvalcore.io.intake_esgf.IntakeESGFDataSource\",\n", + " \"priority\": 2,\n", + " \"facets\": {\n", + " \"activity\": \"activity_drs\",\n", + " \"dataset\": \"source_id\",\n", + " \"ensemble\": \"member_id\",\n", + " \"exp\": \"experiment_id\",\n", + " \"institute\": \"institution_id\",\n", + " \"grid\": \"grid_label\",\n", + " \"mip\": \"table_id\",\n", + " \"project\": \"project\",\n", + " \"short_name\": \"variable_id\",\n", + " },\n", + " },\n", + " },\n", + " },\n", + " },\n", + " },\n", + ")" ] }, { @@ -89,7 +114,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found 778 datasets, showing the first 10:\n" + "Found 906 datasets, showing the first 10:\n" ] }, { @@ -253,7 +278,7 @@ { "data": { "text/plain": [ - "[ESGFFile:CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc on hosts ['esgf-data1.llnl.gov', 'esgf.ceda.ac.uk', 'esgf.rcec.sinica.edu.tw', 'esgf3.dkrz.de', 'esgf-data04.diasjp.net', 'esgf.nci.org.au', 'esgf3.dkrz.de']]" + 
"[IntakeESGFDataset(name='CMIP6.CMIP.AS-RCEC.TaiESM1.historical.r1i1p1f1.Amon.tas.gn')]" ] }, "execution_count": 6, @@ -270,7 +295,7 @@ "id": "60d88a34-c886-4b9d-a9e9-a9d18fa97917", "metadata": {}, "source": [ - "A single file can be downloaded using its `download` method:" + "Load a single file as `iris.cube.CubeList`:" ] }, { @@ -281,8 +306,402 @@ "outputs": [ { "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "
Air Temperature (K)timelatitudelongitude
Shape1980192288
Dimension coordinates
\ttimex--
\tlatitude-x-
\tlongitude--x
Scalar coordinates
\theight2.0 m
Cell methods
\t0area: time: mean
Attributes
\tConventions'CF-1.7 CMIP-6.2'
\tactivity_drs'CMIP'
\tactivity_id'CMIP'
\tbranch_method'Hybrid-restart from year 0671-01-01 of piControl'
\tbranch_timenp.float64(0.0)
\tbranch_time_in_childnp.float64(0.0)
\tbranch_time_in_parentnp.float64(171550.0)
\tcmor_version'3.5.0'
\tcomment'near-surface (usually, 2 meter) air temperature'
\tcontact'Dr. Wei-Liang Lee (leelupin@gate.sinica.edu.tw)'
\tcreation_date'2020-06-08T08:53:23Z'
\tdata_specs_version'01.00.31'
\texperiment'all-forcing simulation of the recent past'
\texperiment_id'historical'
\texternal_variables'areacella'
\tforcing_indexnp.int32(1)
\tfrequency'mon'
\tfurther_info_url'https://furtherinfo.es-doc.org/CMIP6.AS-RCEC.TaiESM1.historical.none.r ...'
\tgrid'finite-volume grid with 0.9x1.25 degree lat/lon resolution'
\tgrid_label'gn'
\thistory"2020-06-08T08:53:23Z altered by CMOR: Treated scalar dimension: 'height'. ..."
\tinitialization_indexnp.int32(1)
\tinstitution'Research Center for Environmental Changes, Academia Sinica, Nankang, Taipei ...'
\tinstitution_id'AS-RCEC'
\tlicense'CMIP6 model data produced by NCC is licensed under a Creative Commons Attribution ...'
\tmember_id'r1i1p1f1'
\tmip_era'CMIP6'
\tmodel_id'TaiESM1'
\tnominal_resolution'100 km'
\toriginal_name'TREFHT'
\tparent_activity_id'CMIP'
\tparent_experiment_id'piControl'
\tparent_mip_era'CMIP6'
\tparent_source_id'TaiESM1'
\tparent_sub_experiment_id'none'
\tparent_time_units'days since 1850-01-01'
\tparent_variant_label'r1i1p1f1'
\tphysics_indexnp.int32(1)
\tproduct'model-output'
\trealization_indexnp.int32(1)
\trealm'atmos'
\treferences'10.5194/gmd-2019-377'
\trun_variant'N/A'
\tsource'TaiESM 1.0 (2018): \\naerosol: SNAP (same grid as atmos)\\natmos: TaiAM1 ...'
\tsource_file'/mnt/d/data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn ...'
\tsource_id'TaiESM1'
\tsource_type'AOGCM AER BGC'
\tsub_experiment'none'
\tsub_experiment_id'none'
\ttable_id'Amon'
\ttable_info'Creation Date:(24 July 2019) MD5:0bb394a356ef9d214d027f1aca45853e'
\ttitle'TaiESM1 output prepared for CMIP6'
\ttracking_id'hdl:21.14100/997cf563-6411-4a78-a9c4-7369ae27d698'
\tvariable_id'tas'
\tvariant_label'r1i1p1f1'
\n", + "

\n", + "
\n", + " \n", + " " + ], "text/plain": [ - "LocalFile('~/climate_data/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/Amon/tas/gn/v20200623/tas_Amon_TaiESM1_historical_r1i1p1f1_gn_185001-201412.nc')" + "[]" ] }, "execution_count": 7, @@ -291,32 +710,243 @@ } ], "source": [ - "dataset.files[0].download(CFG[\"download_dir\"])" + "cubes = dataset.files[0].to_iris()\n", + "cubes" ] }, { - "attachments": {}, "cell_type": "markdown", - "id": "3821b594-3797-497b-a51d-1798d5b2fc80", + "id": "d3006d90", "metadata": {}, "source": [ - "For downloading many files, the [esmvalcore.esgf.download](https://docs.esmvaltool.org/projects/esmvalcore/en/latest/api/esmvalcore.esgf.html#esmvalcore.esgf.download) function is recommended because it will download the files in parallel. The ESMValCore will try to guess the fastest host and download from there. If it is not available for some reason, it will automatically fall back to the next host." + "`Dataset.from_files` can also handle derived variables properly:" ] }, { "cell_type": "code", "execution_count": 8, - "id": "9676ff81-232e-4ff8-b784-686f0d06c469", + "id": "b75314e3", "metadata": {}, "outputs": [], "source": [ - "download(dataset.files, CFG[\"download_dir\"])" + "dataset_template = Dataset(\n", + " short_name=\"lwcre\",\n", + " mip=\"Amon\",\n", + " project=\"CMIP6\",\n", + " exp=\"historical\",\n", + " dataset=\"*\",\n", + " institute=\"*\",\n", + " ensemble=\"r1i1p1f1\",\n", + " grid=\"gn\",\n", + " derive=True,\n", + " force_derivation=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b87c247f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 37 datasets, showing the first 10:\n" + ] + }, + { + "data": { + "text/plain": [ + "[Dataset:\n", + " {'dataset': 'GISS-E2-2-G',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NASA-GISS'},\n", + " Dataset:\n", + " {'dataset': 'FGOALS-g3',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'CESM2-WACCM-FV2',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NCAR'},\n", + " Dataset:\n", + " {'dataset': 'GISS-E2-1-H',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NASA-GISS'},\n", + " Dataset:\n", + " {'dataset': 'BCC-CSM2-MR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'CAS-ESM2-0',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 
'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'CAS'},\n", + " Dataset:\n", + " {'dataset': 'MPI-ESM-1-2-HAM',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'HAMMOZ-Consortium'},\n", + " Dataset:\n", + " {'dataset': 'CESM2-FV2',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'NCAR'},\n", + " Dataset:\n", + " {'dataset': 'BCC-ESM1',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'BCC'},\n", + " Dataset:\n", + " {'dataset': 'ICON-ESM-LR',\n", + " 'project': 'CMIP6',\n", + " 'mip': 'Amon',\n", + " 'short_name': 'lwcre',\n", + " 'derive': True,\n", + " 'ensemble': 'r1i1p1f1',\n", + " 'exp': 'historical',\n", + " 'force_derivation': True,\n", + " 'grid': 'gn',\n", + " 'institute': 'MPI-M'}]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets = list(dataset_template.from_files())\n", + "print(f\"Found {len(datasets)} datasets, showing the first 10:\")\n", + "datasets[:10]" + ] + }, + { + "cell_type": "markdown", + "id": "18e3a0b7", + "metadata": {}, + "source": [ + "The facet `force_derivation=True` ensures variable derivation. If omitted and files that provide the variable `lwcre` without derivation are present, only those are returned." + ] + }, + { + "cell_type": "markdown", + "id": "f00a886f", + "metadata": {}, + "source": [ + "If variable derivation is necessary (this will always be the case if `force_derivation=True` is used), the `files` attribute of the datasets may be empty. 
In this case, the input files of the input variables necessary for derivation can be accessed via the `Dataset.input_datasets` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c5edfa65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = datasets[0]\n", + "dataset.files" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "97cdf12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rlut\n", + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlut.gn')]\n", + "rlutcs\n", + "[IntakeESGFDataset(name='CMIP6.CMIP.NASA-GISS.GISS-E2-2-G.historical.r1i1p1f1.Amon.rlutcs.gn')]\n" + ] + } + ], + "source": [ + "for d in dataset.required_datasets:\n", + " print(d[\"short_name\"])\n", + " print(d.files)" ] } ], "metadata": { "kernelspec": { - "display_name": "esm", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -330,12 +960,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "17e81e49408864327be43d3caebcb8eca32ff92a01becb15aa27be73c37f0517" - } + "version": "3.13.11" } }, "nbformat": 4, diff --git a/tests/conftest.py b/tests/conftest.py index 46cabf58f9..3c19e4c4df 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib import warnings from copy import deepcopy from functools import lru_cache @@ -7,6 +8,7 @@ import numpy as np import pytest +import yaml from cf_units import Unit from iris.coords import ( AncillaryVariable, @@ -17,6 +19,7 @@ ) from iris.cube import Cube +import esmvalcore from esmvalcore.config import CFG, Config if TYPE_CHECKING: @@ -55,6 +58,33 @@ def ignore_existing_user_config( monkeypatch.setattr(CFG, "_mapping", cfg_default._mapping) +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + @pytest.fixture def session( tmp_path: Path, @@ -63,7 +93,15 @@ def session( ) -> Session: """Session object with default settings.""" monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") - return CFG.start_session("recipe_test") + session = CFG.start_session("recipe_test") + projects = _load_default_data_sources()["projects"] + for project in projects: + print(project) + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session @pytest.fixture diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py index 44171a76ad..e801789e70 100644 --- a/tests/integration/recipe/test_check.py +++ 
b/tests/integration/recipe/test_check.py @@ -272,27 +272,6 @@ def test_valid_time_selection_rejections(timerange, message): assert str(rec_err.value) == message -def test_differing_timeranges(caplog): - timeranges = set() - timeranges.add("1950/1951") - timeranges.add("1950/1952") - required_variables = [ - {"short_name": "rsdscs", "timerange": "1950/1951"}, - {"short_name": "rsuscs", "timerange": "1950/1952"}, - ] - with pytest.raises(ValueError) as exc: - check.differing_timeranges(timeranges, required_variables) - expected_log = ( - f"Differing timeranges with values {timeranges} " - "found for required variables " - "[{'short_name': 'rsdscs', 'timerange': '1950/1951'}, " - "{'short_name': 'rsuscs', 'timerange': '1950/1952'}]. " - "Set `timerange` to a common value." - ) - - assert expected_log in str(exc.value) - - def test_data_availability_nonexistent(tmp_path): var = { "dataset": "ABC", diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index b87a696387..565f347b3b 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -22,7 +22,7 @@ import esmvalcore.io.esgf import esmvalcore.io.local from esmvalcore._recipe.recipe import ( - _get_input_datasets, + _get_required_datasets, _representative_datasets, read_recipe_file, ) @@ -182,7 +182,7 @@ def get_required(short_name, _): ] monkeypatch.setattr( - esmvalcore._recipe.to_datasets, + esmvalcore.dataset, "get_required", get_required, ) @@ -1707,7 +1707,7 @@ def test_alias_generation(tmp_path, patched_datafinder, session): # noqa: C901, assert dataset["alias"] == "CORDEX_ICHEC-EC-EARTH" else: assert dataset["alias"] == "CORDEX_MIROC-MIROC5" - elif dataset["version"] == 1: + elif dataset["version"] == "1": assert dataset["alias"] == "OBS_1" else: assert dataset["alias"] == "OBS_2" @@ -2599,9 +2599,7 @@ def test_representative_dataset_derived_var( expected_facets: Facets = { # Already present in variable "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": force_derivation, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2611,6 +2609,9 @@ def test_representative_dataset_derived_var( "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } if force_derivation: expected_datasets = [ @@ -2665,9 +2666,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsdscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2680,6 +2679,9 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsdscs = Dataset(**rsdscs_facets) rsdscs.session = session @@ -2689,9 +2691,7 @@ def test_get_derive_input_variables(patched_datafinder, session): "short_name": "rsuscs", # Already present in variables "dataset": "ICON", - "derive": True, "exp": "atm_amip-rad_R2B4_r1i1p1f1", - "force_derivation": True, "frequency": "mon", "mip": "Amon", "project": "ICON", @@ -2704,11 +2704,14 @@ def test_get_derive_input_variables(patched_datafinder, session): "units": "W m-2", # Added by _add_extra_facets "var_type": "atm_2d_ml", + # Added/changed by 
Dataset._get_required_datasets() + "derive": False, + "force_derivation": False, } rsuscs = Dataset(**rsuscs_facets) rsuscs.session = session - alb_derive_input = _get_input_datasets(alb) + alb_derive_input = _get_required_datasets(alb) assert alb_derive_input == [rsdscs, rsuscs] diff --git a/tests/unit/preprocessor/_derive/test_siextent.py b/tests/unit/preprocessor/_derive/test_siextent.py index ae9f5d1c8f..416c9ac17b 100644 --- a/tests/unit/preprocessor/_derive/test_siextent.py +++ b/tests/unit/preprocessor/_derive/test_siextent.py @@ -113,6 +113,6 @@ def test_siextent_required(): derived_var = siextent.DerivedVariable() output = derived_var.required(None) assert output == [ - {"short_name": "sic", "optional": "true"}, - {"short_name": "siconca", "optional": "true"}, + {"short_name": "sic", "optional": True}, + {"short_name": "siconca", "optional": True}, ] diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py index 6ed350c34d..fd19bc21a3 100644 --- a/tests/unit/recipe/test_recipe.py +++ b/tests/unit/recipe/test_recipe.py @@ -865,28 +865,6 @@ def test_get_default_settings(mocker): } -def test_set_version(mocker): - dataset = Dataset(short_name="tas") - supplementary = Dataset(short_name="areacella") - dataset.supplementaries = [supplementary] - - input_dataset = Dataset(short_name="tas") - file1 = mocker.Mock() - file1.facets = {"version": "v1"} - file2 = mocker.Mock() - file2.facets = {"version": "v2"} - input_dataset.files = [file1, file2] - - file3 = mocker.Mock() - file3.facets = {"version": "v3"} - supplementary.files = [file3] - - _recipe._set_version(dataset, [input_dataset]) - print(dataset) - assert dataset.facets["version"] == ["v1", "v2"] - assert dataset.supplementaries[0].facets["version"] == "v3" - - def test_extract_preprocessor_order(): profile = { "custom_order": True, @@ -956,3 +934,23 @@ def test_special_name_to_dataset_invalid_special_name_type(): ) with pytest.raises(RecipeError, match=msg): _recipe._special_name_to_dataset(facets, "reference_dataset") + + +def test_fix_cmip5_fx_ensemble(monkeypatch): + def find_files(self): + if self.facets["ensemble"] == "r0i0p0": + self._files = ["file1.nc"] + + monkeypatch.setattr(Dataset, "find_files", find_files) + + dataset = Dataset( + dataset="dataset1", + short_name="orog", + mip="fx", + project="CMIP5", + ensemble="r1i1p1", + ) + + _recipe._fix_cmip5_fx_ensemble(dataset) + + assert dataset["ensemble"] == "r0i0p0" diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index c1834d002e..877d7681e8 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import textwrap from pathlib import Path from typing import TYPE_CHECKING @@ -284,7 +285,7 @@ def test_merge_supplementaries_missing_short_name_fails(session): Dataset.from_recipe(recipe_txt, session) -def test_get_input_datasets_derive(session): +def test_get_required_datasets_derive(session): dataset = Dataset( dataset="ERA5", project="native6", @@ -299,7 +300,7 @@ def test_get_input_datasets_derive(session): type="reanaly", version="v1", ) - rlds, rlns = to_datasets._get_input_datasets(dataset) + rlds, rlns = to_datasets._get_required_datasets(dataset) assert rlds["short_name"] == "rlds" assert rlds["long_name"] == "Surface Downwelling Longwave Radiation" assert rlds["frequency"] == "1hr" @@ -308,6 +309,57 @@ def test_get_input_datasets_derive(session): assert rlns["frequency"] == "1hr" +def 
test_get_required_datasets_optional(caplog, tmp_path, session):
+    facets = {
+        "project": "OBS6",
+        "dataset": "SAT",
+        "mip": "SImon",
+        "short_name": "siextent",
+        "tier": 2,
+        "type": "sat",
+        "timerange": "1980/2000",
+        "derive": True,
+    }
+
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    sic_file = LocalFile(
+        input_dir / "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc",
+    )
+    sic_file.touch()
+
+    dataset = Dataset(**facets)
+    dataset.files = []
+    dataset.session = session
+
+    with caplog.at_level(logging.INFO):
+        datasets = to_datasets._get_required_datasets(dataset)
+
+    expected = Dataset(
+        dataset="SAT",
+        project="OBS6",
+        mip="SImon",
+        short_name="siconca",
+        derive=False,
+        frequency="mon",
+        long_name="Sea-Ice Area Percentage (Atmospheric Grid)",
+        modeling_realm=["seaIce"],
+        optional=True,
+        original_short_name="siconca",
+        standard_name="sea_ice_area_fraction",
+        tier=2,
+        timerange="1980/2000",
+        type="sat",
+        units="%",
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+
+    logger_infos = [r.message for r in caplog.records if r.levelname == "INFO"]
+    assert "which is marked as 'optional'" in logger_infos[-1]
+
+
 def test_max_years(session):
     recipe_txt = textwrap.dedent("""
         diagnostics:
@@ -355,26 +407,6 @@ def from_files(_):
     to_datasets._dataset_from_files(dataset)
 
 
-def test_fix_cmip5_fx_ensemble(monkeypatch):
-    def find_files(self):
-        if self.facets["ensemble"] == "r0i0p0":
-            self._files = ["file1.nc"]
-
-    monkeypatch.setattr(Dataset, "find_files", find_files)
-
-    dataset = Dataset(
-        dataset="dataset1",
-        short_name="orog",
-        mip="fx",
-        project="CMIP5",
-        ensemble="r1i1p1",
-    )
-
-    to_datasets._fix_cmip5_fx_ensemble(dataset)
-
-    assert dataset["ensemble"] == "r0i0p0"
-
-
 def test_get_supplementary_short_names(monkeypatch):
     def _update_cmor_facets(facets):
         facets["modeling_realm"] = "atmos"
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index 90e5ff50fb..f70fd551f0 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -1,15 +1,12 @@
 from __future__ import annotations
 
-import importlib.resources
 import textwrap
 from collections import defaultdict
-from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 import pyesgf
 import pytest
-import yaml
 
 import esmvalcore.dataset
 import esmvalcore.io.esgf
@@ -19,6 +16,7 @@
 from esmvalcore.dataset import Dataset
 from esmvalcore.exceptions import InputFilesNotFound, RecipeError
 from esmvalcore.io.esgf import ESGFFile
+from esmvalcore.preprocessor._derive._baseclass import DerivedVariableBase
 
 if TYPE_CHECKING:
     from pytest_mock import MockerFixture
@@ -26,45 +24,6 @@
     from esmvalcore.typing import Facets
 
 
-@lru_cache
-def _load_default_data_sources() -> dict[
-    str,
-    dict[str, dict[str, dict[str, dict[str, str]]]],
-]:
-    """Load default data sources for local users."""
-    cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = {
-        "projects": {},
-    }
-    for file in (
-        "data-local.yml",
-        "data-local-esmvaltool.yml",
-        "data-native-cesm.yml",
-        "data-native-emac.yml",
-        "data-native-icon.yml",
-        "data-native-ipslcm.yml",
-    ):
-        with importlib.resources.as_file(
-            importlib.resources.files(esmvalcore.config)
-            / "configurations"
-            / file,
-        ) as config_file:
-            content = config_file.read_text(encoding="utf-8")
-            cfg["projects"].update(yaml.safe_load(content)["projects"])
-    return cfg
-
-
-@pytest.fixture
-def session(tmp_path: Path, session: Session) -> Session:
-    """Session fixture with default local data sources."""
-    projects = _load_default_data_sources()["projects"]
-    for project in projects:
-        data_sources = projects[project]["data"]
-        for data_source in data_sources.values():
-            data_source["rootpath"] = str(tmp_path)
-        session["projects"][project]["data"] = data_sources
-    return session
-
-
 def test_repr():
     ds = Dataset(short_name="tas", dataset="dataset1")
@@ -1365,211 +1324,1303 @@ def test_from_files_with_globs_and_only_missing_facets(monkeypatch, session):
     assert datasets == [expected]
 
 
-def test_match():
-    dataset1 = Dataset(
-        short_name="areacella",
-        ensemble=["r1i1p1f1"],
-        exp="historical",
-        modeling_realm=["atmos", "land"],
-    )
-    dataset2 = Dataset(
-        short_name="tas",
-        ensemble="r1i1p1f1",
-        exp=["historical", "ssp585"],
-        modeling_realm=["atmos"],
-    )
+OBS6_SAT_FACETS: Facets = {
+    "project": "OBS6",
+    "dataset": "SAT",
+    "mip": "Amon",
+    "tier": 2,
+    "type": "sat",
+    "timerange": "1980/2000",
+}
 
-    score = dataset1._match(dataset2)
-    assert score == 3
 
+def test_from_files_no_files_glob(session):
+    dataset = Dataset(**{**OBS6_SAT_FACETS, "type": "*"}, short_name="tas")
+    datasets = list(dataset.from_files())
+    assert datasets == [dataset]
 
-def test_remove_duplicate_supplementaries():
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_files_glob(timerange, session):
     dataset = Dataset(
-        dataset="dataset1",
-        short_name="tas",
-        mip="Amon",
-        project="CMIP6",
-        exp="historical",
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
     )
-    supplementary1 = dataset.copy(short_name="areacella")
-    supplementary2 = supplementary1.copy()
-    supplementary1.facets["exp"] = "1pctCO2"
-    dataset.supplementaries = [supplementary1, supplementary2]
+    datasets = list(dataset.from_files())
+    assert datasets == [dataset]
 
-    dataset._remove_duplicate_supplementaries()
 
-    assert len(dataset.supplementaries) == 1
-    assert dataset.supplementaries[0] == supplementary2
+@pytest.fixture
+def lwcre_file(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    lwcre = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc",
+    )
+    lwcre.touch()
+    return lwcre
 
 
-def test_remove_not_found_supplementaries():
-    dataset = Dataset(
-        dataset="dataset1",
-        short_name="tas",
-        mip="Amon",
-        project="CMIP6",
-        exp="historical",
+@pytest.fixture
+def lwcre_file_ground(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    lwcre = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_ground_1_Amon_lwcre_1980-2000.nc",
     )
-    dataset.add_supplementary(short_name="areacella", mip="fx", exp="*")
-    dataset._remove_unexpanded_supplementaries()
+    lwcre.touch()
+    return lwcre
 
-    assert len(dataset.supplementaries) == 0
 
+@pytest.fixture
+def rlut_file(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    rlut = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_sat_1_Amon_rlut_1980-2000.nc",
+    )
+    rlut.touch()
+    return rlut
 
-def test_concatenating_historical_and_future_exps(mocker):
-    mocker.patch.object(Dataset, "files", True)
-    dataset = Dataset(
-        dataset="dataset1",
-        short_name="tas",
-        mip="Amon",
-        frequency="mon",
-        project="CMIP6",
-        exp=["historical", "ssp585"],
+
+@pytest.fixture
+def rlut_file_future(tmp_path):
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    rlut = esmvalcore.local.LocalFile(
+        input_dir,
+        "OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc",
"OBS6_SAT_sat_1_Amon_rlut_2100-2101.nc", ) - dataset.add_supplementary(short_name="areacella", mip="fx", frequency="fx") - dataset._fix_fx_exp() + rlut.touch() + return rlut - assert len(dataset.supplementaries) == 1 - assert dataset.facets["exp"] == ["historical", "ssp585"] - assert dataset.supplementaries[0].facets["exp"] == "historical" +@pytest.fixture +def rlut_file_ground(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_ground_1_Amon_rlut_1980-2000.nc", + ) + rlut.touch() + return rlut -def test_from_recipe_with_glob(tmp_path: Path, session: Session) -> None: - recipe_txt = textwrap.dedent(""" - diagnostics: - diagnostic1: - variables: - tas: - project: CMIP5 - mip: Amon - exp: rcp85 - ensemble: r1i1p1 - additional_datasets: - - {dataset: '*', institute: '*'} - """) - recipe = tmp_path / "recipe_test.yml" - recipe.write_text(recipe_txt, encoding="utf-8") +@pytest.fixture +def rlutcs_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlutcs = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_rlutcs_1980-2000.nc", + ) + rlutcs.touch() + return rlutcs - filenames = [ - "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" - "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", - "cmip5/output1/NIMR-KMA/HadGEM2-AO/rcp85/mon/atmos/Amon/r1i1p1/" - "v20130815/tas_Amon_HadGEM2-AO_rcp85_r1i1p1_186001-200512.nc", - ] - for filename in filenames: - path = tmp_path / filename - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text("") - definitions: list[Facets] = [ - { - "diagnostic": "diagnostic1", - "variable_group": "tas", - "dataset": "CSIRO-Mk3-6-0", - "project": "CMIP5", - "mip": "Amon", - "short_name": "tas", - "alias": "CSIRO-Mk3-6-0", - "recipe_dataset_index": 0, - "exp": "rcp85", - "ensemble": "r1i1p1", - "institute": "CSIRO-QCCCE", - }, - { - "diagnostic": "diagnostic1", - "variable_group": "tas", - "dataset": "HadGEM2-AO", - "project": "CMIP5", - "mip": "Amon", - "short_name": "tas", - "alias": "HadGEM2-AO", - "recipe_dataset_index": 1, - "exp": "rcp85", - "ensemble": "r1i1p1", - "institute": "NIMR-KMA", - }, - ] - expected = [] - for facets in definitions: - dataset = Dataset(**facets) - dataset.session = session - expected.append(dataset) +@pytest.fixture +def pr_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + pr = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_Amon_pr_1980-2000.nc", + ) + pr.touch() + return pr - datasets = Dataset.from_recipe(recipe, session) - print("Expected:", expected) - print("Got:", datasets) - assert all(ds.session == session for ds in datasets) - assert datasets == expected +@pytest.fixture +def siconca_file(tmp_path): + input_dir = tmp_path / "Tier2" / "SAT" + input_dir.mkdir(parents=True, exist_ok=True) + rlut = esmvalcore.local.LocalFile( + input_dir, + "OBS6_SAT_sat_1_SImon_siconca_1980-2000.nc", + ) + rlut.touch() + return rlut -def test_from_ranges(): - dataset = Dataset(ensemble="r(1:2)i1p1f1") - expected = [ - Dataset(ensemble="r1i1p1f1"), - Dataset(ensemble="r2i1p1f1"), - ] - assert dataset.from_ranges() == expected +def test_from_files_with_derived_no_derivation(lwcre_file, session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + 
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
 
-def test_from_ranges():
-    dataset = Dataset(ensemble="r(1:2)i1p1f1")
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
 
-    expected = [
-        Dataset(ensemble="r1i1p1f1"),
-        Dataset(ensemble="r2i1p1f1"),
-    ]
-    assert dataset.from_ranges() == expected
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
 
-def test_expand_ensemble():
-    dataset = Dataset(ensemble="r(1:2)i(2:3)p(3:4)")
+    expected_required_dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        frequency="mon",
+        long_name="TOA Longwave Cloud Radiative Effect",
+        modeling_realm=["atmos"],
+        original_short_name="lwcre",
+        standard_name="",
+        units="W m-2",
+    )
 
-    expanded = dataset._expand_range("ensemble")
+    expected_required_dataset.supplementaries = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
-    ensembles = [
-        "r1i2p3",
-        "r1i2p4",
-        "r1i3p3",
-        "r1i3p4",
-        "r2i2p3",
-        "r2i2p4",
-        "r2i3p3",
-        "r2i3p4",
     ]
-    assert expanded == ensembles
+    expected_required_dataset.session = session
 
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == [expected_required_dataset]
+    assert required_datasets[0].files == [lwcre_file]
 
-def test_expand_subexperiment():
-    dataset = Dataset(sub_experiment="s(1998:2005)")
-    expanded = dataset._expand_range("sub_experiment")
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_derivation_glob(
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when files of the derived variable exist."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
 
-    subexperiments = [
-        "s1998",
-        "s1999",
-        "s2000",
-        "s2001",
-        "s2002",
-        "s2003",
-        "s2004",
-        "s2005",
-    ]
+    datasets = list(dataset.from_files())
 
-    assert expanded == subexperiments
+    expected_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+        ),
+        Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True),
+    ]
+    for expected_ds in expected_datasets:
+        expected_ds.add_supplementary(short_name="pr", type="sat")
+        expected_ds.session = session
+    assert datasets == expected_datasets
+    assert datasets[0].files == [lwcre_file_ground]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+    assert datasets[1].files == [lwcre_file]
+    assert datasets[1].supplementaries[0].files == [pr_file]
 
-def test_expand_ensemble_list_ok():
-    dataset = Dataset(ensemble=["r0i0p0", "r1i1p1"])
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+    ]
-    expected = [["r0i0p0", "r1i1p1"]]
+    for expected_ds in expected_required_datasets:
+        expected_ds.supplementaries = [
+            Dataset(
+                **OBS6_SAT_FACETS,
+                short_name="pr",
+                derive=False,
+                frequency="mon",
+                long_name="Precipitation",
+                modeling_realm=["atmos"],
+                original_short_name="pr",
+                standard_name="precipitation_flux",
+                units="kg m-2 s-1",
+            ),
+        ]
+        expected_ds.session = session
 
-    assert dataset._expand_range("ensemble") == expected
+    for dataset, expected in zip(
+        datasets,
+        expected_required_datasets,
+        strict=True,
+    ):
+        assert dataset.required_datasets == [expected]
+    assert datasets[0].required_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].required_datasets[0].files == [lwcre_file]
 
-def test_expand_ensemble_nolist():
-    dataset = Dataset(
-        dataset="XYZ",
-        ensemble=["r1i1p1", "r(1:2)i1p1"],
-    )
+def test_from_files_with_derived(rlut_file, rlutcs_file, session):
+    """Test `from_files` when only the input variables for derivation exist."""
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
 
-    with pytest.raises(RecipeError):
-        dataset._expand_range("ensemble")
+    datasets = list(dataset.from_files())
 
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == []
 
-def create_esgf_file(timerange, version):
-    """Prepare some fake ESGF search results."""
-    json = {
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_unavailable_years(
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` when no input files exist for the requested years."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "2010/2015"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob(
+    timerange,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when only the input variables exist."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+def test_from_files_with_derived_glob_differing_timerange(
+    rlut_file_future,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with a timerange glob and differing input timeranges."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "*"},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "*"},
+        short_name="lwcre",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr", timerange="1980/2000")
+    expected.session = session
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "*"},
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "timerange": "*"},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+
+
+def test_from_files_with_derived_no_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` when the derived variable and its inputs all exist."""
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        frequency="mon",
+        long_name="TOA Longwave Cloud Radiative Effect",
+        modeling_realm=["atmos"],
+        original_short_name="lwcre",
+        standard_name="",
+        units="W m-2",
+    )
+    expected_required_dataset.supplementaries = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    expected_required_dataset.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == [expected_required_dataset]
+    assert required_datasets[0].files == [lwcre_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_no_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when the derived variable and its inputs exist."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+        ),
+        Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True),
+    ]
+    for expected_ds in expected_datasets:
+        expected_ds.add_supplementary(short_name="pr", type="sat")
+        expected_ds.session = session
+
+    assert datasets == expected_datasets
+    assert datasets[0].files == [lwcre_file_ground]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+    assert datasets[1].files == [lwcre_file]
+    assert datasets[1].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "ground"},
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="lwcre",
+            derive=True,
+            frequency="mon",
+            long_name="TOA Longwave Cloud Radiative Effect",
+            modeling_realm=["atmos"],
+            original_short_name="lwcre",
+            standard_name="",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.supplementaries = [
+            Dataset(
+                **OBS6_SAT_FACETS,
+                short_name="pr",
+                derive=False,
+                frequency="mon",
+                long_name="Precipitation",
+                modeling_realm=["atmos"],
+                original_short_name="pr",
+                standard_name="precipitation_flux",
+                units="kg m-2 s-1",
+            ),
+        ]
+        expected_ds.session = session
+
+    for dataset, expected in zip(
+        datasets,
+        expected_required_datasets,
+        strict=True,
+    ):
+        assert dataset.required_datasets == [expected]
+    assert datasets[0].required_datasets[0].files == [lwcre_file_ground]
+    assert datasets[1].required_datasets[0].files == [lwcre_file]
+
+
+def test_from_files_with_derived_force_derivation(
+    lwcre_file,
+    rlut_file,
+    rlutcs_file,
+    session,
+):
+    """Test `from_files` when derivation is forced despite existing files."""
+    dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_force_derivation_glob(  # noqa: PLR0913
+    timerange,
+    lwcre_file,
+    lwcre_file_ground,
+    rlut_file,
+    rlut_file_ground,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when derivation is forced."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="lwcre",
+        derive=True,
+        force_derivation=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == [lwcre_file]
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            force_derivation=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == [rlut_file]
+    assert required_datasets[1].files == [rlutcs_file]
+
+
+class DerivedVariable(DerivedVariableBase):
+    """Derivation of dummy variable."""
+
+    @staticmethod
+    def required(project):
+        """Declare the variables needed for derivation."""
+        return [
+            {"short_name": "rlut", "optional": True},
+            {"short_name": "rlutcs"},
+            {"short_name": "pr"},
+        ]
+
+
+def test_from_files_with_derived_optional(
+    monkeypatch,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` when an optional input variable has no files."""
+    monkeypatch.setattr(
+        esmvalcore.preprocessor._derive,
+        "ALL_DERIVED_VARIABLES",
+        {"tas": DerivedVariable},
+    )
+    dataset = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="tas",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="tas",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            optional=True,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == [rlutcs_file]
+    assert required_datasets[2].files == [pr_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob_optional(
+    timerange,
+    monkeypatch,
+    rlutcs_file,
+    pr_file,
+    session,
+):
+    """Test `from_files` with globs when an optional input variable has no files."""
+    monkeypatch.setattr(
+        esmvalcore.preprocessor._derive,
+        "ALL_DERIVED_VARIABLES",
+        {"tas": DerivedVariable},
+    )
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="tas",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **OBS6_SAT_FACETS,
+        short_name="tas",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr")
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+    assert datasets[0].supplementaries[0].files == [pr_file]
+
+    expected_required_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            optional=True,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == [rlutcs_file]
+    assert required_datasets[2].files == [pr_file]
+
+
+@pytest.mark.parametrize("timerange", ["1980/2000", "*"])
+def test_from_files_with_derived_glob_optional_missing(
+    timerange,
+    monkeypatch,
+    rlut_file,
+    session,
+):
+    """Test `from_files` with globs when all input variables are missing."""
+    monkeypatch.setattr(
+        esmvalcore.preprocessor._derive,
+        "ALL_DERIVED_VARIABLES",
+        {"tas": DerivedVariable},
+    )
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="tas",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+        short_name="tas",
+        derive=True,
+    )
+    expected.session = session
+
+    assert datasets == [expected]
+    assert datasets[0].files == []
+
+    expected_required_datasets = [
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+            short_name="rlut",
+            derive=False,
+            optional=True,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+        Dataset(
+            **{**OBS6_SAT_FACETS, "type": "*", "timerange": timerange},
+            short_name="pr",
+            derive=False,
+            frequency="mon",
+            long_name="Precipitation",
+            modeling_realm=["atmos"],
+            original_short_name="pr",
+            standard_name="precipitation_flux",
+            units="kg m-2 s-1",
+        ),
+    ]
+    for expected_ds in expected_required_datasets:
+        expected_ds.session = session
+
+    required_datasets = datasets[0].required_datasets
+    assert required_datasets == expected_required_datasets
+    assert required_datasets[0].files == []
+    assert required_datasets[1].files == []
+    assert required_datasets[2].files == []
+
+
+def test_from_files_with_derived_only_optional(siconca_file, pr_file, session):
+    """Test `from_files` for a variable derived from purely optional inputs."""
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "mip": "SImon"},
+        short_name="siextent",
+        derive=True,
+    )
+    dataset.add_supplementary(short_name="pr", mip="Amon")
+    dataset.session = session
+
+    datasets = list(dataset.from_files())
+
+    expected = Dataset(
+        **{**OBS6_SAT_FACETS, "mip": "SImon"},
+        short_name="siextent",
+        derive=True,
+    )
+    expected.add_supplementary(short_name="pr", mip="Amon")
expected.add_supplementary(short_name="pr", mip="Amon") + expected.session = session + + assert datasets == [expected] + assert datasets[0].files == [] + assert datasets[0].supplementaries[0].files == [pr_file] + + expected_required_datasets = [ + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="sic", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Ocean Grid)", + modeling_realm=["seaIce"], + original_short_name="siconc", + standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon"}, + short_name="siconca", + derive=False, + frequency="mon", + long_name="Sea-Ice Area Percentage (Atmospheric Grid)", + modeling_realm=["seaIce"], + original_short_name="siconca", + standard_name="sea_ice_area_fraction", + units="%", + optional=True, + ), + ] + for expected_ds in expected_required_datasets: + expected_ds.session = session + + required_datasets = datasets[0].required_datasets + assert required_datasets == expected_required_datasets + assert required_datasets[0].files == [] + assert required_datasets[1].files == [siconca_file] + + +def test_from_files_with_derived_only_optional_glob_fail(session): + """Test `from_files` with derived variable and supplementary.""" + dataset = Dataset( + **{**OBS6_SAT_FACETS, "mip": "SImon", "type": "*"}, + short_name="siextent", + derive=True, + ) + dataset.add_supplementary(short_name="pr", mip="Amon") + dataset.session = session + + msg = r"Using wildcards to derive .* is not possible" + with pytest.raises(RecipeError, match=msg): + next(dataset.from_files()) + + +def test_match(): + dataset1 = Dataset( + short_name="areacella", + ensemble=["r1i1p1f1"], + exp="historical", + modeling_realm=["atmos", "land"], + ) + dataset2 = Dataset( + short_name="tas", + ensemble="r1i1p1f1", + exp=["historical", "ssp585"], + modeling_realm=["atmos"], + ) + + score = dataset1._match(dataset2) + assert score == 3 + + +def test_remove_duplicate_supplementaries(): + dataset = Dataset( + dataset="dataset1", + short_name="tas", + mip="Amon", + project="CMIP6", + exp="historical", + ) + supplementary1 = dataset.copy(short_name="areacella") + supplementary2 = supplementary1.copy() + supplementary1.facets["exp"] = "1pctCO2" + dataset.supplementaries = [supplementary1, supplementary2] + + dataset._remove_duplicate_supplementaries() + + assert len(dataset.supplementaries) == 1 + assert dataset.supplementaries[0] == supplementary2 + + +def test_remove_not_found_supplementaries(): + dataset = Dataset( + dataset="dataset1", + short_name="tas", + mip="Amon", + project="CMIP6", + exp="historical", + ) + dataset.add_supplementary(short_name="areacella", mip="fx", exp="*") + dataset._remove_unexpanded_supplementaries() + + assert len(dataset.supplementaries) == 0 + + +def test_concatenating_historical_and_future_exps(mocker): + mocker.patch.object(Dataset, "files", True) + dataset = Dataset( + dataset="dataset1", + short_name="tas", + mip="Amon", + frequency="mon", + project="CMIP6", + exp=["historical", "ssp585"], + ) + dataset.add_supplementary(short_name="areacella", mip="fx", frequency="fx") + dataset._fix_fx_exp() + + assert len(dataset.supplementaries) == 1 + assert dataset.facets["exp"] == ["historical", "ssp585"] + assert dataset.supplementaries[0].facets["exp"] == "historical" + + +def test_from_recipe_with_glob(tmp_path: Path, session: Session) -> None: + recipe_txt = textwrap.dedent(""" + + diagnostics: + diagnostic1: + variables: + tas: + project: CMIP5 + mip: Amon + exp: rcp85 + 
ensemble: r1i1p1 + additional_datasets: + - {dataset: '*', institute: '*'} + """) + recipe = tmp_path / "recipe_test.yml" + recipe.write_text(recipe_txt, encoding="utf-8") + + filenames = [ + "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" + "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", + "cmip5/output1/NIMR-KMA/HadGEM2-AO/rcp85/mon/atmos/Amon/r1i1p1/" + "v20130815/tas_Amon_HadGEM2-AO_rcp85_r1i1p1_186001-200512.nc", + ] + for filename in filenames: + path = tmp_path / filename + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("") + + definitions: list[Facets] = [ + { + "diagnostic": "diagnostic1", + "variable_group": "tas", + "dataset": "CSIRO-Mk3-6-0", + "project": "CMIP5", + "mip": "Amon", + "short_name": "tas", + "alias": "CSIRO-Mk3-6-0", + "recipe_dataset_index": 0, + "exp": "rcp85", + "ensemble": "r1i1p1", + "institute": "CSIRO-QCCCE", + }, + { + "diagnostic": "diagnostic1", + "variable_group": "tas", + "dataset": "HadGEM2-AO", + "project": "CMIP5", + "mip": "Amon", + "short_name": "tas", + "alias": "HadGEM2-AO", + "recipe_dataset_index": 1, + "exp": "rcp85", + "ensemble": "r1i1p1", + "institute": "NIMR-KMA", + }, + ] + expected = [] + for facets in definitions: + dataset = Dataset(**facets) + dataset.session = session + expected.append(dataset) + + datasets = Dataset.from_recipe(recipe, session) + print("Expected:", expected) + print("Got:", datasets) + assert all(ds.session == session for ds in datasets) + assert datasets == expected + + +def test_from_ranges(): + dataset = Dataset(ensemble="r(1:2)i1p1f1") + expected = [ + Dataset(ensemble="r1i1p1f1"), + Dataset(ensemble="r2i1p1f1"), + ] + assert dataset.from_ranges() == expected + + +def test_expand_ensemble(): + dataset = Dataset(ensemble="r(1:2)i(2:3)p(3:4)") + + expanded = dataset._expand_range("ensemble") + + ensembles = [ + "r1i2p3", + "r1i2p4", + "r1i3p3", + "r1i3p4", + "r2i2p3", + "r2i2p4", + "r2i3p3", + "r2i3p4", + ] + assert expanded == ensembles + + +def test_expand_subexperiment(): + dataset = Dataset(sub_experiment="s(1998:2005)") + + expanded = dataset._expand_range("sub_experiment") + + subexperiments = [ + "s1998", + "s1999", + "s2000", + "s2001", + "s2002", + "s2003", + "s2004", + "s2005", + ] + + assert expanded == subexperiments + + +def test_expand_ensemble_list_ok(): + dataset = Dataset(ensemble=["r0i0p0", "r1i1p1"]) + + expected = [["r0i0p0", "r1i1p1"]] + + assert dataset._expand_range("ensemble") == expected + + +def test_expand_ensemble_nolist(): + dataset = Dataset( + dataset="XYZ", + ensemble=["r1i1p1", "r(1:2)i1p1"], + ) + + with pytest.raises(RecipeError): + dataset._expand_range("ensemble") + + +def create_esgf_file(timerange, version): + """Prepare some fake ESGF search results.""" + json = { "dataset_id": "CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3.historical" f".r1i1p1f1.Amon.tas.gr.{version}|esgf-data1.llnl.gov", "dataset_id_template_": [ @@ -1766,7 +2817,7 @@ def test_find_files_outdated_local( assert dataset.files == expected -def test_set_version(): +def test_set_version_non_derived_var(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") file_v1 = esmvalcore.io.local.LocalFile("/path/to/v1/tas.nc") @@ -1782,6 +2833,47 @@ def test_set_version(): assert dataset.supplementaries[0].facets["version"] == "v3" +def test_set_version_derived_var(monkeypatch, session): + dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.add_supplementary(short_name="areacella") + dataset.files = [] + 
+    areacella_file.facets["version"] = "v4"
+    dataset.supplementaries[0].files = [areacella_file]
+
+    def _get_required_datasets():
+        rlut_file = esmvalcore.local.LocalFile("/path/to/rlut.nc")
+        rlut_file.facets["version"] = "v1"
+        rlut_dataset = Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+        )
+        rlut_dataset.files = [rlut_file]
+        rlutcs_file_1 = esmvalcore.local.LocalFile("/path/to/rlutcs_1.nc")
+        rlutcs_file_2 = esmvalcore.local.LocalFile("/path/to/rlutcs_2.nc")
+        rlutcs_file_1.facets["version"] = "v2"
+        rlutcs_file_2.facets["version"] = "v3"
+        rlutcs_dataset = Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+        )
+        rlutcs_dataset.files = [rlutcs_file_1, rlutcs_file_2]
+        return [rlut_dataset, rlutcs_dataset]
+
+    monkeypatch.setattr(
+        dataset,
+        "_get_required_datasets",
+        _get_required_datasets,
+    )
+
+    dataset.set_version()
+
+    assert dataset.facets["version"] == ["v1", "v2", "v3"]
+    assert dataset.supplementaries[0].facets["version"] == "v4"
+
+
 @pytest.mark.parametrize("timerange", ["*", "185001/*", "*/185112"])
 def test_update_timerange_from_esgf(mocker, timerange):
     esgf_files = [
@@ -1853,9 +2945,8 @@ def test_update_timerange_no_files(session, search_data):
     }
     dataset = Dataset(**variable)
     dataset.files = []
-    msg = r"Missing data for Dataset: tas, Amon, CMIP6, HadGEM3-GC31-LL.*"
-    with pytest.raises(InputFilesNotFound, match=msg):
-        dataset._update_timerange()
+    dataset._update_timerange()
+    assert "timerange" not in dataset.facets
 
 
 def test_update_timerange_typeerror():
@@ -2296,16 +3387,6 @@ def test_get_extra_facets_native6():
     }
 
 
-OBS6_SAT_FACETS: Facets = {
-    "project": "OBS6",
-    "dataset": "SAT",
-    "mip": "Amon",
-    "tier": 2,
-    "type": "sat",
-    "timerange": "1980/2000",
-}
-
-
 def test_is_derived_no_derivation():
     dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas")
     assert dataset._is_derived() is False
@@ -2358,6 +3439,15 @@
     assert dataset._derivation_necessary() is True
 
 
+def test_derivation_necessary_no_force_derivation_no_files_glob(session):
+    dataset = Dataset(
+        **{**OBS6_SAT_FACETS, "timerange": "*"},
+        short_name="lwcre",
+        derive=True,
+    )
+    assert dataset._derivation_necessary() is True
+
+
 def test_derivation_necessary_no_force_derivation(tmp_path, session):
     dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
     dataset.session = session
@@ -2431,3 +3521,67 @@
         force_derivation=True,
     )
     assert dataset.supplementaries[0] == expected_supplementary
+
+
+def test_required_datasets_derivation(session):
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+
+    expected_datasets = [
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlut",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlut",
+            standard_name="toa_outgoing_longwave_flux",
+            units="W m-2",
+        ),
+        Dataset(
+            **OBS6_SAT_FACETS,
+            short_name="rlutcs",
+            derive=False,
+            frequency="mon",
+            long_name="TOA Outgoing Clear-Sky Longwave Radiation",
+            modeling_realm=["atmos"],
+            original_short_name="rlutcs",
+            standard_name="toa_outgoing_longwave_flux_assuming_clear_sky",
+            units="W m-2",
+        ),
+    ]
+    for expected_dataset in expected_datasets:
+        expected_dataset.session = dataset.session
+
+    assert dataset.required_datasets == expected_datasets
+
+
+def test_required_datasets_no_derivation():
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas")
+    dataset.add_supplementary(short_name="pr")
+
+    assert dataset.required_datasets == [dataset]
+
+
+def test_required_datasets_no_force_derivation(tmp_path, session):
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True)
+    dataset.add_supplementary(short_name="pr")
+    dataset.session = session
+
+    input_dir = tmp_path / "Tier2" / "SAT"
+    input_dir.mkdir(parents=True, exist_ok=True)
+    lwcre_file = esmvalcore.local.LocalFile(
+        input_dir / "OBS6_SAT_sat_1_Amon_lwcre_1980-2000.nc",
+    )
+    lwcre_file.touch()
+
+    assert dataset.required_datasets == [dataset]
+
+
+def test_required_datasets_no_derivation_available(session):
+    dataset = Dataset(**OBS6_SAT_FACETS, short_name="tas", derive=True)
+
+    msg = r"Cannot derive variable 'tas': no derivation script available"
+    with pytest.raises(NotImplementedError, match=msg):
+        dataset.required_datasets  # noqa: B018