Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ from the examples given in that link.

## ref-sample-data 0.1.1 (2025-01-08)

No significant changes.
### Bug Fixes

- Correct the location of the datasets within the repository ([#1](https://github.com/CMIP-REF/ref-sample-data/pull/1))


## ref-sample-data 0.1.0 (2025-01-08)
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ virtual-environment: ## update virtual environment, create a new one if it does

.PHONY: fetch-test-data
fetch-test-data: ## Fetch test data
rm -rf data
uv run python ./scripts/fetch_test_data.py

registry.txt: data ## Generate a registry of all the packages
Expand Down
1 change: 0 additions & 1 deletion changelog/1.bug.md

This file was deleted.

2 changes: 2 additions & 0 deletions changelog/2.breaking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Use the dataset versions from ESGF instead of the values in the netCDF files.
Different files in the same dataset may contain different versions inside their netCDF files.
Binary file not shown.
Binary file not shown.
24 changes: 13 additions & 11 deletions registry.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 95341df80de95ddb0b45da11aed67db771414fff94508687fb30fce63b82c104
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 25e9e817a05ffab4a2b073078f6be0e52096fa9da8eb55f009d079842c708614
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 139c4c59d98c737ce2d7ca777e52e35e38d49fcb8b08dd98175ed0f1354f8e75
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc be4a191c75b3643aad34238970c0587128a3852694f2c61425b4bbda42e5ff08
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rlut/gn/v20210318/rlut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 1083d92079e9c40d3797ecc4235df1c86c99af7ca3b9458b21f1d34054351041
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 9f9ae50efc55f4e18dc174d7c3af10f4e67a391c84d81cdb6ba574fa8b61b276
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 2299e10eb6ccf190fe07f7b60aa40b8700f7f964ca68c989f3572abe39eb22c7
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 5141bb64d6f457550d8bf429a4233af1bd706ed8b2131fc2ef329bcb6db7a236
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 49fbd6c0d7b8c0d10a270e8d88191764c02ba651f80b464605dfa5b0221d622b
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc bff52adef26d48d4b747368816aff3712c606cafa92f6b78f4974f23efcba510
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc b61faa48540472be5b208a4ecf40873860c1d4cfb7f50a4dff4ac17ee2ba4f73
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 95341df80de95ddb0b45da11aed67db771414fff94508687fb30fce63b82c104
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 25e9e817a05ffab4a2b073078f6be0e52096fa9da8eb55f009d079842c708614
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 139c4c59d98c737ce2d7ca777e52e35e38d49fcb8b08dd98175ed0f1354f8e75
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc be4a191c75b3643aad34238970c0587128a3852694f2c61425b4bbda42e5ff08
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/areacella/gn/20191115/areacella_fx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc d9d07cacc65c196b9ec47d60cabcf86fd397b1e22063a32c3798a98ee3dfb16e
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rlut/gn/20210318/rlut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 1083d92079e9c40d3797ecc4235df1c86c99af7ca3b9458b21f1d34054351041
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 9f9ae50efc55f4e18dc174d7c3af10f4e67a391c84d81cdb6ba574fa8b61b276
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 2299e10eb6ccf190fe07f7b60aa40b8700f7f964ca68c989f3572abe39eb22c7
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 5141bb64d6f457550d8bf429a4233af1bd706ed8b2131fc2ef329bcb6db7a236
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 49fbd6c0d7b8c0d10a270e8d88191764c02ba651f80b464605dfa5b0221d622b
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc bff52adef26d48d4b747368816aff3712c606cafa92f6b78f4974f23efcba510
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc b61faa48540472be5b208a4ecf40873860c1d4cfb7f50a4dff4ac17ee2ba4f73
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc e8e3b873d9ba115974329c0f7785c9e30dcca66007fa973c22cd734efc46dcfd
59 changes: 36 additions & 23 deletions scripts/fetch_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
from pathlib import Path
from typing import Any

import pandas as pd
import xarray as xr
from intake_esgf import ESGFCatalog

OUTPUT_PATH = Path("data")


def fetch_datasets(search_facets: dict[str, Any], remove_ensembles: bool) -> list[Path]:
def fetch_datasets(search_facets: dict[str, Any], remove_ensembles: bool) -> pd.DataFrame:
"""
Fetch the datasets from ESGF.

Expand All @@ -37,13 +38,14 @@ def fetch_datasets(search_facets: dict[str, Any], remove_ensembles: bool) -> lis
if remove_ensembles:
cat.remove_ensembles()

path_dict = cat.to_path_dict(prefer_streaming=False)
path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False)

# Flatten list of lists into a single list
return [p for dataset_paths in path_dict.values() for p in dataset_paths]
merged_df = cat.df.merge(pd.Series(path_dict, name="files"), left_on="key", right_index=True)

return merged_df

def downscale_dataset(dataset: xr.Dataset) -> xr.Dataset:

def decimate_dataset(dataset: xr.Dataset) -> xr.Dataset:
"""
Downscale the dataset to a smaller size.

Expand All @@ -65,7 +67,7 @@ def downscale_dataset(dataset: xr.Dataset) -> xr.Dataset:
return spatial_downscale


def create_out_filename(ds: xr.Dataset) -> pathlib.Path:
def create_out_filename(metadata: pd.Series, ds: xr.Dataset) -> pathlib.Path:
"""
Create the output filename for the dataset.

Expand All @@ -80,11 +82,11 @@ def create_out_filename(ds: xr.Dataset) -> pathlib.Path:
"""
cmip6_path_items = [
"mip_era",
"activity_id",
"activity_drs",
"institution_id",
"source_id",
"experiment_id",
"variant_label",
"member_id",
"table_id",
"variable_id",
"grid_label",
Expand All @@ -96,43 +98,54 @@ def create_out_filename(ds: xr.Dataset) -> pathlib.Path:
"table_id",
"source_id",
"experiment_id",
"variant_label",
"member_id",
"grid_label",
]

output_path = Path(os.path.join(*[str(ds.attrs[item]) for item in cmip6_path_items]))
output_path = Path(os.path.join(*[metadata[item] for item in cmip6_path_items]))
filename_prefix = "_".join([metadata[item] for item in cmip6_filename_paths])

if "time" in ds.dims:
time_range = f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
filename = "_".join([str(ds.attrs[item]) for item in cmip6_filename_paths]) + f"_{time_range}.nc"
filename = f"{filename_prefix}_{time_range}.nc"
else:
filename = "_".join([str(ds.attrs[item]) for item in cmip6_filename_paths]) + ".nc"
filename = f"{filename_prefix}.nc"
return output_path / filename


if __name__ == "__main__":
datasets: list[Path] = []

facets_to_fetch = [
dict(
source_id="ACCESS-ESM1-5",
frequency="mon",
variable_id=["tas", "rsut", "rlut", "rsdt"],
experiment_id=["ssp119", "ssp126", "historical"],
experiment_id=["ssp126", "historical"],
remove_ensembles=True,
),
dict(
source_id="ACCESS-ESM1-5",
frequency="fx",
variable_id=["areacella"],
experiment_id=["ssp126", "historical"],
remove_ensembles=True,
),
]

dataset_metadata_collection: list[pd.DataFrame] = []
for facets in facets_to_fetch:
remove_ensembles = facets.pop("remove_ensembles", False)
datasets.extend(fetch_datasets(facets, remove_ensembles=remove_ensembles))

print(datasets)
for dataset_path in datasets:
ds_orig = xr.open_dataset(dataset_path)
dataset_metadata_collection.append(fetch_datasets(facets, remove_ensembles=remove_ensembles))

datasets = pd.concat(dataset_metadata_collection)

for _, dataset in datasets.iterrows():
print(dataset.key)
for ds_filename in dataset["files"]:
ds_orig = xr.open_dataset(ds_filename)

ds_downscaled = downscale_dataset(ds_orig)
ds_downscaled = decimate_dataset(ds_orig)

output_filename = OUTPUT_PATH / create_out_filename(ds_orig)
output_filename.parent.mkdir(parents=True, exist_ok=True)
ds_downscaled.to_netcdf(output_filename)
output_filename = OUTPUT_PATH / create_out_filename(dataset, ds_orig)
output_filename.parent.mkdir(parents=True, exist_ok=True)
ds_downscaled.to_netcdf(output_filename)
Loading