Skip to content

Commit d9bd3a7

Browse files
SRIKAR-8-77 (Sreekar Reddy), ankatiyar and merelcht
authored
fix: update DeltaTableDataset.py updating the from data catalog method for deltalake v1.0.0 (#1202)

* fix: update DeltaTableDataset.py updating the from data catalog method for deltalake v1.0.0
  Signed-off-by: SRIKAR-8-77 <[email protected]>
* testing
  Signed-off-by: SRIKAR-8-77 <[email protected]>
* Fix-Signing_off_DCO
  Signed-off-by: SRIKAR-8-77 <[email protected]>
* Unpin deltalake
  Signed-off-by: Ankita Katiyar <[email protected]>
* lint
  Signed-off-by: Ankita Katiyar <[email protected]>
* Tests
  Signed-off-by: Ankita Katiyar <[email protected]>
* Update release notes
  Signed-off-by: Ankita Katiyar <[email protected]>
* Fix release notes
  Signed-off-by: Merel Theisen <[email protected]>

---------

Signed-off-by: SRIKAR-8-77 <[email protected]>
Signed-off-by: Ankita Katiyar <[email protected]>
Signed-off-by: Merel Theisen <[email protected]>
Co-authored-by: Sreekar Reddy <[email protected]>
Co-authored-by: Ankita Katiyar <[email protected]>
Co-authored-by: Merel Theisen <[email protected]>
1 parent d0309be commit d9bd3a7

File tree

4 files changed

+36
-27
lines changed

4 files changed

+36
-27
lines changed

kedro-datasets/RELEASE.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@
1515
| `chromadb.ChromaDBDataset` | A dataset for loading and saving data to ChromaDB vector database collections | `kedro_datasets_experimental.chromadb` |
1616

1717
## Bug fixes and other changes
18+
- Updated `pandas.DeltaTableDataset` to be compatible with `deltalake` version 1.x.
19+
1820
## Community contributions
1921

2022
- [Armand Masseau](https://github.com/armandmasseaugit)
23+
- [SRIKAR-8-77](https://github.com/SRIKAR-8-77)
2124

2225
# Release 9.0.0
2326

@@ -64,7 +67,6 @@
6467
Many thanks to the following Kedroids for contributing PRs to this release:
6568
- [Guillaume Tauzin](https://github.com/gtauzin)
6669
- [gitgud5000](https://github.com/gitgud5000)
67-
- [Armand Masseau](https://github.com/armandmasseaugit)
6870

6971
# Release 8.1.0
7072

kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22
S3, GCS), Databricks unity catalog and AWS Glue catalog respectively. It handles
33
load and save using a pandas dataframe.
44
"""
5+
56
from __future__ import annotations
67

78
from copy import deepcopy
89
from typing import Any
910

1011
import pandas as pd
11-
from deltalake import DataCatalog, DeltaTable, Metadata
12+
from deltalake import DeltaTable, Metadata
1213
from deltalake.exceptions import TableNotFoundError
1314
from deltalake.writer import write_deltalake
1415
from kedro.io.core import AbstractDataset, DatasetError
@@ -84,7 +85,7 @@ def __init__( # noqa: PLR0913
8485
self,
8586
*,
8687
filepath: str | None = None,
87-
catalog_type: DataCatalog | None = None,
88+
catalog_type: str | None = None,
8889
catalog_name: str | None = None,
8990
database: str | None = None,
9091
table: str | None = None,
@@ -169,12 +170,20 @@ def __init__( # noqa: PLR0913
169170
)
170171
except TableNotFoundError:
171172
self.is_empty_dir = True
172-
else:
173-
self._delta_table = DeltaTable.from_data_catalog(
174-
data_catalog=DataCatalog[self._catalog_type], # type: ignore[misc]
175-
data_catalog_id=self._catalog_name,
176-
database_name=self._database or "",
177-
table_name=self._table or "",
173+
elif self._catalog_type:
174+
if self._catalog_type.upper() == "AWS":
175+
table_uri = f"glue:///{self._database}/{self._table}"
176+
elif self._catalog_type.upper() == "UNITY":
177+
table_uri = (
178+
f"unity://{self._catalog_name}/{self._database}/{self._table}"
179+
)
180+
else:
181+
raise ValueError(f"Unsupported catalog type: {self._catalog_type}")
182+
183+
self._delta_table = DeltaTable(
184+
table_uri=table_uri,
185+
storage_options=self.fs_args,
186+
version=self._version,
178187
)
179188

180189
@property

kedro-datasets/pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ optuna-studydataset = ["optuna"]
134134
optuna = ["kedro-datasets[optuna-studydataset]"]
135135

136136
pandas-csvdataset = ["kedro-datasets[pandas-base]"]
137-
pandas-deltatabledataset = ["kedro-datasets[pandas-base]", "deltalake>=0.10.0, <1.0.0"]
137+
pandas-deltatabledataset = ["kedro-datasets[pandas-base]", "deltalake>=0.10.0"]
138138
pandas-exceldataset = ["kedro-datasets[pandas-base]", "openpyxl>=3.0.6, <4.0"]
139139
pandas-featherdataset = ["kedro-datasets[pandas-base]"]
140140
pandas-gbqtabledataset = ["kedro-datasets[pandas-base]", "pandas-gbq>=0.12.0"]
@@ -174,8 +174,8 @@ plotly-plotlydataset = ["kedro-datasets[pandas-base,plotly-base]"]
174174
plotly = ["kedro-datasets[plotly-htmldataset,plotly-jsondataset,plotly-plotlydataset]"]
175175

176176
polars-csvdataset = ["kedro-datasets[polars-base]"]
177-
polars-eagerpolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2, <1.0.0"]
178-
polars-lazypolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "deltalake >= 0.6.2, <1.0.0"]
177+
polars-eagerpolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "xlsx2csv>=0.8.0", "deltalake >= 0.6.2"]
178+
polars-lazypolarsdataset = ["kedro-datasets[polars-base]", "pyarrow>=4.0", "deltalake >= 0.6.2"]
179179
polars = [
180180
"""kedro-datasets[polars-csvdataset,\
181181
polars-eagerpolarsdataset,\
@@ -263,9 +263,9 @@ test = [
263263
"compress-pickle[lz4]~=2.1.0",
264264
"coverage>=7.2.0",
265265
"dask[complete]>=2021.10",
266+
"deltalake>=0.10.0",
266267
"delta-spark>=1.0, <3.0; python_version <= '3.11'",
267268
"delta-spark>=4.0; python_version >= '3.12'",
268-
"deltalake>=0.10.0, <1.0.0",
269269
"dill~=0.3.1",
270270
"filelock>=3.4.0, <4.0",
271271
"fiona >=1.8, <2.0",

kedro-datasets/tests/pandas/test_deltatable_dataset.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import pandas as pd
44
import pytest
5-
from deltalake import DataCatalog, Metadata
5+
from deltalake import Metadata
66
from kedro.io.core import DatasetError
77
from pandas.testing import assert_frame_equal
88

@@ -121,12 +121,11 @@ def test_from_aws_glue_catalog(self, mocker):
121121
"kedro_datasets.pandas.deltatable_dataset.DeltaTable"
122122
)
123123
_ = DeltaTableDataset(catalog_type="AWS", database="db", table="tbl")
124-
mock_delta_table.from_data_catalog.assert_called_once()
125-
mock_delta_table.from_data_catalog.assert_called_with(
126-
data_catalog=DataCatalog.AWS,
127-
data_catalog_id=None,
128-
database_name="db",
129-
table_name="tbl",
124+
mock_delta_table.assert_called_once()
125+
mock_delta_table.assert_called_with(
126+
table_uri="glue:///db/tbl",
127+
storage_options={},
128+
version=None,
130129
)
131130

132131
def test_from_databricks_unity_catalog(self, mocker):
@@ -137,17 +136,16 @@ def test_from_databricks_unity_catalog(self, mocker):
137136
_ = DeltaTableDataset(
138137
catalog_type="UNITY", catalog_name="id", database="db", table="tbl"
139138
)
140-
mock_delta_table.from_data_catalog.assert_called_once()
141-
mock_delta_table.from_data_catalog.assert_called_with(
142-
data_catalog=DataCatalog.UNITY,
143-
data_catalog_id="id",
144-
database_name="db",
145-
table_name="tbl",
139+
mock_delta_table.assert_called_once()
140+
mock_delta_table.assert_called_with(
141+
table_uri="unity://id/db/tbl",
142+
storage_options={},
143+
version=None,
146144
)
147145

148146
def test_from_unsupported_catalog(self):
149147
"""Test dataset creation from unsupported catalog."""
150-
with pytest.raises(KeyError):
148+
with pytest.raises(ValueError):
151149
DeltaTableDataset(catalog_type="unsupported", database="db", table="tbl")
152150

153151
def test_unsupported_write_mode(self, filepath):

0 commit comments

Comments (0)