11 changes: 5 additions & 6 deletions src/data_designer/config/datastore.py
@@ -7,11 +7,10 @@
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd
import pyarrow.parquet as pq
from huggingface_hub import HfApi, HfFileSystem
from pydantic import BaseModel, Field

from data_designer import lazy_imports
from data_designer.config.errors import InvalidConfigError, InvalidFileFormatError, InvalidFilePathError
from data_designer.config.utils.io_helpers import VALID_DATASET_FILE_EXTENSIONS, validate_path_contains_files_of_type

@@ -46,7 +45,7 @@ def get_file_column_names(file_reference: str | Path | HfFileSystem, file_type:
"""
if file_type == "parquet":
try:
schema = pq.read_schema(file_reference)
schema = lazy_imports.pq.read_schema(file_reference)
if hasattr(schema, "names"):
return schema.names
else:
@@ -55,12 +54,12 @@ def get_file_column_names(file_reference: str | Path | HfFileSystem, file_type:
logger.warning(f"Failed to process parquet file {file_reference}: {e}")
return []
elif file_type in ["json", "jsonl"]:
return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
return lazy_imports.pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
elif file_type == "csv":
try:
df = pd.read_csv(file_reference, nrows=1)
df = lazy_imports.pd.read_csv(file_reference, nrows=1)
return df.columns.tolist()
except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
except (lazy_imports.pd.errors.EmptyDataError, lazy_imports.pd.errors.ParserError) as e:
logger.warning(f"Failed to process CSV file {file_reference}: {e}")
return []
else:
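The change above (and the matching one in sampler_params.py below) moves the pandas/pyarrow cost from import time to first use: the module no longer imports either library at the top, and only reaches them through the lazy facade inside the functions that need them. A rough way to observe the effect, sketched here as a standalone script rather than anything shipped in this PR:

import sys
import time

start = time.perf_counter()
import data_designer.config.datastore  # noqa: F401

print(f"import took {time.perf_counter() - start:.3f}s")
# Both should stay False, assuming no other eagerly-imported module in the
# package still pulls pandas or pyarrow in at import time.
print("pandas loaded:", "pandas" in sys.modules)
print("pyarrow loaded:", "pyarrow" in sys.modules)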
4 changes: 2 additions & 2 deletions src/data_designer/config/sampler_params.py
@@ -4,10 +4,10 @@
from enum import Enum
from typing import Literal

import pandas as pd
from pydantic import Field, field_validator, model_validator
from typing_extensions import Self, TypeAlias

from data_designer import lazy_imports
from data_designer.config.base import ConfigBase
from data_designer.config.utils.constants import (
AVAILABLE_LOCALES,
@@ -113,7 +113,7 @@ class DatetimeSamplerParams(ConfigBase):
@classmethod
def _validate_param_is_datetime(cls, value: str) -> str:
try:
pd.to_datetime(value)
lazy_imports.pd.to_datetime(value)
except ValueError:
raise ValueError(f"Invalid datetime format: {value}")
return value
106 changes: 106 additions & 0 deletions src/data_designer/engine/__init__.py
@@ -1,2 +1,108 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Engine module with fully automatic lazy loading.

This module automatically discovers ALL engine modules and their public classes/functions,
providing a facade that lazily imports components only when accessed. This significantly
improves import performance while requiring ZERO maintenance - just add a module and it's
automatically exported.

Note: Private modules (starting with _) are excluded from auto-discovery.
"""

import ast
import importlib
from pathlib import Path


def _discover_all_engine_exports() -> dict[str, tuple[str, str]]:
"""Automatically discover all public classes/functions in the engine package.

Scans the engine directory recursively for all Python files, parses them
with AST (without importing), and builds a mapping of all public exports.

Returns:
Dictionary mapping public names to (module_path, attribute_name) tuples.
"""
lazy_imports = {}
engine_dir = Path(__file__).parent

# Find all Python files in engine directory recursively
for py_file in engine_dir.rglob("*.py"):
# Skip __init__.py files and private modules (starting with _)
if py_file.name.startswith("_"):
continue

# Convert file path to module path
# e.g., dataset_builders/column_wise_builder.py -> data_designer.engine.dataset_builders.column_wise_builder
rel_path = py_file.relative_to(engine_dir.parent)
module_parts = list(rel_path.parts[:-1]) + [rel_path.stem]
module_path = ".".join(["data_designer"] + module_parts)

try:
# Parse the Python file with AST (doesn't import it - fast!)
with open(py_file, "r", encoding="utf-8") as f:
tree = ast.parse(f.read(), filename=str(py_file))

# Find all top-level public classes and functions
for node in tree.body:
if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
name = node.name
# Only export public items (no leading underscore)
if not name.startswith("_"):
# Avoid name collisions - first one wins
if name not in lazy_imports:
lazy_imports[name] = (module_path, name)
except Exception:
# If AST parsing fails, skip this module silently
pass

return lazy_imports


# Cache for lazy imports - built on first access
_LAZY_IMPORTS_CACHE: dict[str, tuple[str, str]] | None = None


def __getattr__(name: str) -> object:
"""Lazily import engine components when accessed.

On first access, automatically discovers all public classes/functions in the
engine package. Subsequent accesses use the cached mapping for fast lookups.

Args:
name: The name of the attribute to import.

Returns:
The imported class, function, or object.

Raises:
AttributeError: If the attribute is not found in any engine module.
"""
global _LAZY_IMPORTS_CACHE

# Build cache on first access
if _LAZY_IMPORTS_CACHE is None:
_LAZY_IMPORTS_CACHE = _discover_all_engine_exports()

if name in _LAZY_IMPORTS_CACHE:
module_path, attr_name = _LAZY_IMPORTS_CACHE[name]
# Dynamically import the module
module = importlib.import_module(module_path)
# Get the attribute from the module
return getattr(module, attr_name)

raise AttributeError(f"module 'data_designer.engine' has no attribute {name!r}")


def __dir__() -> list[str]:
"""Return list of all available lazy imports for introspection."""
global _LAZY_IMPORTS_CACHE

# Build cache if not already built
if _LAZY_IMPORTS_CACHE is None:
_LAZY_IMPORTS_CACHE = _discover_all_engine_exports()

return list(_LAZY_IMPORTS_CACHE.keys())
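The facade above relies on the module-level __getattr__ hook (PEP 562): importing data_designer.engine only executes this lightweight __init__.py, and a submodule such as data_designer.engine.dataset_builders.column_wise_builder is imported the first time one of its public names is requested. A minimal consumer-side sketch of that behavior, assuming nothing else in the process has already imported that submodule:

import sys

from data_designer import engine

target = "data_designer.engine.dataset_builders.column_wise_builder"

# Only the facade's __init__.py has run; the heavy submodule is untouched.
print(target in sys.modules)  # expected: False

# First attribute access triggers __getattr__, builds the discovery cache,
# and imports just the module that defines the requested symbol.
builder_cls = engine.ColumnWiseDatasetBuilder
print(target in sys.modules)  # expected: True

# Later lookups reuse the cached (module_path, attribute_name) mapping and
# the already-imported module, so the same object comes back.
print(engine.ColumnWiseDatasetBuilder is builder_cls)  # expected: True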
58 changes: 23 additions & 35 deletions src/data_designer/interface/data_designer.py
@@ -1,11 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import logging
from pathlib import Path

import pandas as pd

# Lazy-loaded third-party and engine components via facades
from data_designer import engine, lazy_imports
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.default_model_settings import (
@@ -30,21 +32,7 @@
)
from data_designer.config.utils.info import InfoType, InterfaceInfo
from data_designer.config.utils.io_helpers import write_seed_dataset
from data_designer.engine.analysis.dataset_profiler import (
DataDesignerDatasetProfiler,
DatasetProfilerConfig,
)
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
from data_designer.engine.model_provider import resolve_model_provider_registry
from data_designer.engine.models.registry import create_model_registry
from data_designer.engine.resources.managed_storage import init_managed_blob_storage
from data_designer.engine.resources.resource_provider import ResourceProvider
from data_designer.engine.resources.seed_dataset_data_store import (
HfHubSeedDatasetDataStore,
LocalSeedDatasetDataStore,
)
from data_designer.engine.secret_resolver import (
CompositeResolver,
EnvironmentResolver,
@@ -119,7 +107,7 @@ def make_seed_reference_from_file(file_path: str | Path) -> LocalSeedDatasetRefe

@classmethod
def make_seed_reference_from_dataframe(
cls, dataframe: pd.DataFrame, file_path: str | Path
cls, dataframe: lazy_imports.pd.DataFrame, file_path: str | Path
) -> LocalSeedDatasetReference:
"""Create a seed dataset reference from a pandas DataFrame.

@@ -239,7 +227,7 @@ def preview(

dropped_columns = raw_dataset.columns.difference(processed_dataset.columns)
if len(dropped_columns) > 0:
dataset_for_profiler = pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
dataset_for_profiler = lazy_imports.pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
else:
dataset_for_profiler = processed_dataset

@@ -251,7 +239,7 @@

if builder.artifact_storage.processors_outputs_path.exists():
processor_artifacts = {
processor_config.name: pd.read_parquet(
processor_config.name: lazy_imports.pd.read_parquet(
builder.artifact_storage.processors_outputs_path / f"{processor_config.name}.parquet",
dtype_backend="pyarrow",
).to_dict(orient="records")
@@ -334,19 +322,19 @@ def _resolve_model_providers(self, model_providers: list[ModelProvider] | None)
return model_providers or []

def _create_dataset_builder(
self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
) -> ColumnWiseDatasetBuilder:
return ColumnWiseDatasetBuilder(
column_configs=compile_dataset_builder_column_configs(config_builder.build(raise_exceptions=True)),
self, config_builder: DataDesignerConfigBuilder, resource_provider: engine.ResourceProvider
) -> engine.ColumnWiseDatasetBuilder:
return engine.ColumnWiseDatasetBuilder(
column_configs=engine.compile_dataset_builder_column_configs(config_builder.build(raise_exceptions=True)),
processor_configs=config_builder.get_processor_configs(),
resource_provider=resource_provider,
)

def _create_dataset_profiler(
self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
) -> DataDesignerDatasetProfiler:
return DataDesignerDatasetProfiler(
config=DatasetProfilerConfig(
self, config_builder: DataDesignerConfigBuilder, resource_provider: engine.ResourceProvider
) -> engine.DataDesignerDatasetProfiler:
return engine.DataDesignerDatasetProfiler(
config=engine.DatasetProfilerConfig(
column_configs=config_builder.get_column_configs(),
column_profiler_configs=config_builder.get_profilers(),
),
@@ -355,21 +343,21 @@ def _create_dataset_profiler(

def _create_resource_provider(
self, dataset_name: str, config_builder: DataDesignerConfigBuilder
) -> ResourceProvider:
) -> engine.ResourceProvider:
model_configs = config_builder.model_configs
ArtifactStorage.mkdir_if_needed(self._artifact_path)
return ResourceProvider(
artifact_storage=ArtifactStorage(artifact_path=self._artifact_path, dataset_name=dataset_name),
model_registry=create_model_registry(
engine.ArtifactStorage.mkdir_if_needed(self._artifact_path)
return engine.ResourceProvider(
artifact_storage=engine.ArtifactStorage(artifact_path=self._artifact_path, dataset_name=dataset_name),
model_registry=engine.create_model_registry(
model_configs=model_configs,
model_provider_registry=self._model_provider_registry,
secret_resolver=self._secret_resolver,
),
blob_storage=init_managed_blob_storage(str(self._managed_assets_path)),
blob_storage=engine.init_managed_blob_storage(str(self._managed_assets_path)),
datastore=(
LocalSeedDatasetDataStore()
engine.LocalSeedDatasetDataStore()
if (settings := config_builder.get_seed_datastore_settings()) is None
else HfHubSeedDatasetDataStore(
else engine.HfHubSeedDatasetDataStore(
endpoint=settings.endpoint,
token=settings.token,
)
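One detail worth noting about the annotations above: because from __future__ import annotations is now at the top of data_designer.py, the lazy_imports.pd.DataFrame annotation on make_seed_reference_from_dataframe is never evaluated at runtime, so pandas still loads only when the method body touches it. A static type checker, however, will typically see lazy_imports.pd as a plain object rather than the pandas module; a common companion is a TYPE_CHECKING guard, which datastore.py already imports. A hedged sketch of that pattern with a hypothetical helper, not part of this diff:

from __future__ import annotations

from typing import TYPE_CHECKING

from data_designer import lazy_imports

if TYPE_CHECKING:
    # Seen only by type checkers; never executed at runtime.
    import pandas as pd


def to_frame(records: list[dict], columns: list[str]) -> pd.DataFrame:
    # pandas is imported here, on the first call, not at module import time.
    return lazy_imports.pd.DataFrame(records, columns=columns)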
43 changes: 43 additions & 0 deletions src/data_designer/lazy_imports.py
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Lazy imports facade for heavy third-party dependencies.

This module provides a centralized facade that lazily imports heavy dependencies
(pandas, pyarrow, etc.) only when accessed, significantly improving import performance.

Usage:
from data_designer import lazy_imports

df = lazy_imports.pd.DataFrame(...)
schema = lazy_imports.pq.read_schema(...)
"""


def __getattr__(name: str) -> object:
"""Lazily import heavy third-party dependencies when accessed.

This allows fast imports of data_designer while deferring loading of heavy
libraries like pandas and pyarrow until they're actually needed.

Supported imports:
- pd: pandas module
- pq: pyarrow.parquet module
"""
if name == "pd":
import pandas as pd

return pd
elif name == "pq":
import pyarrow.parquet as pq

return pq

raise AttributeError(f"module 'data_designer.lazy_imports' has no attribute {name!r}")


# Introspection support so dir() and autocomplete list the lazy names
def __dir__() -> list[str]:
"""Return list of available lazy imports."""
return ["pd", "pq"]
6 changes: 3 additions & 3 deletions tests/config/test_datastore.py
@@ -132,7 +132,7 @@ def test_get_file_column_names_with_filesystem_parquet():
mock_schema = MagicMock()
mock_schema.names = ["col1", "col2", "col3"]

with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
with patch("data_designer.lazy_imports.pq.read_schema") as mock_read_schema:
mock_read_schema.return_value = mock_schema
result = get_file_column_names("datasets/test/file.parquet", "parquet")

@@ -161,11 +161,11 @@ def test_get_file_column_names_error_handling():
with pytest.raises(InvalidFilePathError, match="🛑 Unsupported file type: 'txt'"):
get_file_column_names("test.txt", "txt")

with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
with patch("data_designer.lazy_imports.pq.read_schema") as mock_read_schema:
mock_read_schema.side_effect = Exception("Test error")
assert get_file_column_names("test.txt", "parquet") == []

with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
with patch("data_designer.lazy_imports.pq.read_schema") as mock_read_schema:
mock_col1 = MagicMock()
mock_col1.name = "col1"
mock_col2 = MagicMock()