Skip to content

Commit 0dc71a1

Browse files
committed
auto convert pandas to spark dataframe
Signed-off-by: Sajid Alam <[email protected]>
1 parent 4a03e4a commit 0dc71a1

File tree

2 files changed

+16
-0
lines changed

2 files changed

+16
-0
lines changed

kedro-datasets/kedro_datasets/spark/spark_dataset_v2.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,8 @@ def _save(self, data: DataFrame | pd.DataFrame) -> None:
261261
data: PySpark DataFrame or Pandas DataFrame to save.
262262
Pandas DataFrames will be automatically converted to Spark.
263263
"""
264+
import pandas as pd # noqa: PLC0415
265+
264266
spark_session = get_spark_with_remote_support()
265267

266268
# Convert Pandas DataFrame to Spark DataFrame if needed

kedro-datasets/tests/spark/test_spark_dataset_v2.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,20 @@ def test_relative_path(self, sample_spark_df):
208208
finally:
209209
os.chdir(original_cwd)
210210

211+
def test_save_pandas_dataframe(self, tmp_path, sample_pandas_df):
212+
"""Test saving a Pandas DataFrame directly (auto-converts to Spark)."""
213+
filepath = str(tmp_path / "test_pandas.parquet")
214+
dataset = SparkDatasetV2(filepath=filepath)
215+
216+
# Save Pandas DataFrame directly
217+
dataset.save(sample_pandas_df)
218+
assert Path(filepath).exists()
219+
220+
# Load and verify
221+
loaded_df = dataset.load()
222+
assert loaded_df.count() == len(sample_pandas_df)
223+
assert set(loaded_df.columns) == set(sample_pandas_df.columns)
224+
211225

212226
class TestSparkDatasetV2Schema:
213227
"""Test schema handling in SparkDatasetV2."""

0 commit comments

Comments
 (0)