-
Notifications
You must be signed in to change notification settings - Fork 1
Add PyISTP netCDF driver #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9cdb298
89d5e2d
72c70a0
f090582
13000cc
598114e
ec4e33c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -94,12 +94,14 @@ def _load_data_var(master_cdf: Driver, cdf: Driver, var: str) -> DataVariable or | |
| class ISTPLoaderImpl: | ||
| cdf: Optional[Driver] = None | ||
|
|
||
| def __init__(self, file=None, buffer=None, master_file=None, master_buffer=None): | ||
| def __init__(self, file=None, buffer=None, master_file=None, master_buffer=None, driver_factory=None): | ||
| if driver_factory is None: | ||
| driver_factory = current_driver | ||
| if file is not None: | ||
| log.debug(f"Loading {file}") | ||
| self.cdf = current_driver(file or buffer) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As mentioned in another comment, I would like to take into account the fact that the driver used to read the master file may differ from the one used to read the data file. |
||
| self.cdf = driver_factory(file or buffer) | ||
| if master_file or master_buffer: | ||
| self.master_cdf = current_driver(master_file or master_buffer) | ||
| self.master_cdf = driver_factory(master_file or master_buffer) | ||
| else: | ||
| self.master_cdf = self.cdf | ||
| self.data_variables = [] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| import netCDF4 | ||
| import numpy as np | ||
| from typing import Any | ||
|
|
||
|
|
||
class Driver:
    """NetCDF4 driver implementing the PyISTP Driver protocol.

    Wraps a :class:`netCDF4.Dataset` opened from a file path or from an
    in-memory ``bytes`` buffer, and exposes the variable/attribute
    accessors the PyISTP loader expects, translating NetCDF notions into
    their CDF equivalents (type names, time representations).
    """

    # Mapping from normalized numpy dtype strings (byte-order prefix and
    # fixed-string width removed) to CDF type names.
    _DTYPE_TO_CDF = {
        'f4': 'CDF_FLOAT',
        'f8': 'CDF_DOUBLE',
        'i1': 'CDF_INT1',
        'i2': 'CDF_INT2',
        'i4': 'CDF_INT4',
        'i8': 'CDF_INT8',
        'u1': 'CDF_UINT1',
        'u2': 'CDF_UINT2',
        'u4': 'CDF_UINT4',
        'S': 'CDF_CHAR',
    }

    # Milliseconds between CDF epoch (year 0000) and Unix epoch (1970-01-01)
    _CDF_EPOCH_OFFSET_MS = 62_167_219_200_000

    def __init__(self, file):
        """Open *file*: either a path (anything ``str()``-able) or a raw
        ``bytes`` buffer containing the NetCDF content."""
        if isinstance(file, bytes):
            # The filename is a placeholder; netCDF4 reads from `memory`.
            self._ds = netCDF4.Dataset("in_memory.nc", memory=file)
        else:
            self._ds = netCDF4.Dataset(str(file), "r")

    def variables(self):
        """Return the list of variable names in the dataset."""
        return list(self._ds.variables.keys())

    def has_variable(self, name):
        """Return True if the dataset contains a variable *name*."""
        return name in self._ds.variables

    def variable_attributes(self, var):
        """Return the attribute names of *var* ([] when *var* is unknown)."""
        if var not in self._ds.variables:
            return []
        return list(self._ds[var].ncattrs())

    def variable_attribute_value(self, var, attr):
        """Return the value of attribute *attr* on *var*, or None when
        either the variable or the attribute does not exist."""
        if var not in self._ds.variables:
            return None
        try:
            return self._ds[var].getncattr(attr)
        except AttributeError:
            return None

    def is_char(self, var):
        """Return True if *var* holds NetCDF4 variable-length strings.

        NOTE(review): fixed-width char data (numpy kind 'S') is reported
        as CDF_CHAR by cdf_type() but not detected here — confirm whether
        that asymmetry is intended.
        """
        if var not in self._ds.variables:
            return False
        return self._ds[var].dtype == str

    def is_nrv(self, var):  # NOSONAR
        """Always False: the NRV (non-record-varying) concept does not
        exist in NetCDF4."""
        return False

    def shape(self, var):
        """Return the shape of *var* as a tuple."""
        return tuple(self._ds[var].shape)

    def attributes(self):
        """Return the list of global attribute names."""
        return list(self._ds.ncattrs())

    def attribute(self, key):
        """Return the global attribute *key*, or None when absent."""
        try:
            return self._ds.getncattr(key)
        except AttributeError:
            return None

    def _get_units(self, var):
        """Return the units attribute, checking 'units' then 'UNITS';
        None when neither is set."""
        v = self._ds[var]
        for key in ('units', 'UNITS'):
            try:
                return v.getncattr(key)
            except AttributeError:
                pass
        return None

    def _is_cf_time(self, var):
        """Return True if the variable uses CF time conventions
        (a string units attribute containing 'since')."""
        units = self._get_units(var)
        return isinstance(units, str) and 'since' in units

    def _is_cdf_epoch(self, var):
        """Return True if the variable uses the CDF_EPOCH convention
        (float64 values with units='ms')."""
        units = self._get_units(var)
        return (isinstance(units, str)
                and units.strip().lower() == 'ms'
                and self._ds[var].dtype == np.float64)

    def _cf_time_to_datetime64(self, var):
        """Convert a CF time variable (units with 'since') to
        datetime64[ns]."""
        v = self._ds[var]
        units = v.getncattr('units')
        # netCDF4.num2date converts CF floats to datetime/cftime objects
        dates: Any = netCDF4.num2date(
            v[:], units, only_use_cftime_datetimes=False
        )
        # Go through the ISO string representation to reach datetime64[ns]
        return np.array([np.datetime64(str(d), 'ns') for d in dates])

    def _cdf_epoch_to_datetime64(self, var):
        """Convert CDF_EPOCH values (ms since year 0000) to datetime64[ns]."""
        ms = np.array(self._ds[var][:], dtype=np.float64)
        unix_ms = ms - self._CDF_EPOCH_OFFSET_MS
        # The float->datetime64 cast truncates; sub-ms precision is
        # limited by float64 anyway.
        return (unix_ms * 1_000_000).astype('datetime64[ns]')

    def values(self, var, is_metadata_variable=False):  # NOSONAR
        """Return the values of *var* as a numpy array, converting
        recognized time conventions (CF time, CDF_EPOCH) to datetime64[ns].

        NOTE(review): interpreting time conventions here — rather than in
        the consuming tool (e.g. the Speasy codec), where it could be
        adapted per provider — is debatable; behavior kept as-is for now.
        """
        v = self._ds[var]
        if self._is_cf_time(var):
            return self._cf_time_to_datetime64(var)
        if self._is_cdf_epoch(var):
            return self._cdf_epoch_to_datetime64(var)
        if v.dtype == str:
            # Native NetCDF4 string — return as numpy array of strings
            raw = v[()]
            if isinstance(raw, str):
                raw = [raw]
            return np.array(raw)
        return np.array(v[:])

    def cdf_type(self, var):
        """Return the CDF type name best describing *var*'s dtype."""
        v = self._ds[var]
        # CF time variable: float with a "units" attribute containing "since"
        try:
            units = v.getncattr('units')
            if isinstance(units, str) and 'since' in units:
                return 'CDF_TIME_TT2000'
        except AttributeError:
            pass
        if v.dtype == str:
            return 'CDF_CHAR'
        # Normalize the dtype string: '<f4' -> 'f4', '|i1' -> 'i1',
        # '|S5' -> 'S'.  The previous lstrip set omitted '|' (the marker
        # numpy uses for byte-order-free dtypes such as int8/uint8 and
        # fixed-width strings) and kept the string width, so those types
        # wrongly fell through to CDF_UNKNOWN_*.
        dtype_str = v.dtype.str.lstrip('<>=|!')
        if dtype_str[:1] == 'S':
            dtype_str = 'S'
        return self._DTYPE_TO_CDF.get(dtype_str, f'CDF_UNKNOWN_{dtype_str}')
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,9 @@ classifiers = [ | |
| "Programming Language :: Python :: 3.12", | ||
| ] | ||
| dependencies = ['pycdfpp>=0.6.0'] | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why make this dependency optional? |
||
| [project.optional-dependencies] | ||
| netcdf = ['netCDF4'] | ||
| [project.urls] | ||
| homepage = "https://github.com/SciQLop/PyISTP" | ||
| repository = "https://github.com/SciQLop/PyISTP" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,3 +8,5 @@ coverage | |
| Sphinx | ||
| twine | ||
| ddt | ||
| netCDF4 | ||
| pytest-cov | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The distinction between the load (for CDF) and load_netcdf methods based on the file format bothers me.
I would have preferred to keep a single load method.
The selection of the driver to use could be handled in the constructor of ISTPLoaderImpl (as is already partially done for the CDF driver type: pycdfpp or spacepy).
The format detection could be implemented by inspecting the first 4 bytes that define the magic number, for example.
Additionally, in load_netcdf, we lose the ability to provide a master file, which I find problematic (see examples below).
I would also like the reading of the data file and the reading of the master file to be independent, meaning they could potentially use different drivers.
Example 1 – ICON mission from CDAWeb:
The master file is provided in CDF: https://cdaweb.gsfc.nasa.gov/pub/software/cdawlib/0MASTERS/icon_l2-6_euv_00000000_v01.cdf
The data files are in netCDF: https://spdf.gsfc.nasa.gov/pub/data/icon/l2/l2-6_euv/
The data files look like ISTP-compliant files, but they are not actually compliant.
For example, in the netCDF data files, Var_Type is used to specify whether a variable is data or support_data.
However, the specification (https://github.com/IHDE-Alliance/ISTP_metadata/blob/main/ISTP_metadata_guidelines/docs/05_metadata-variable-attributes.md#istp-variable-attributes) clearly states: "Note that attribute names are case sensitive, and the names of the ISTP variable attributes must match the case as shown."
Therefore, VAR_TYPE should have been used for the netCDF files to be directly ISTP-compliant.
The master file, on the other hand, is properly ISTP-compliant and does use VAR_TYPE to define the data type.
Example 2 - AMDA:
For AMDA, we are considering decommissioning our DDSERVER data server and replacing it with Speasy.
The data in this database is in netCDF and is not ISTP-compliant.
Regenerating the entire database is not an option (several million files and multiple terabytes in volume).
What I would like to do instead is generate CDF/ISTP-compliant master files for each dataset.
This would put us in a situation similar to the ICON mission from CDAWeb.
This would avoid the need for AMDA-specific development, which would be ideal.