diff --git a/src/data_designer/config/datastore.py b/src/data_designer/config/datastore.py
index ab78bae4..01606fd4 100644
--- a/src/data_designer/config/datastore.py
+++ b/src/data_designer/config/datastore.py
@@ -7,11 +7,10 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-import pandas as pd
-import pyarrow.parquet as pq
 from huggingface_hub import HfApi, HfFileSystem
 from pydantic import BaseModel, Field
 
+from data_designer import lazy_imports
 from data_designer.config.errors import InvalidConfigError, InvalidFileFormatError, InvalidFilePathError
 from data_designer.config.utils.io_helpers import VALID_DATASET_FILE_EXTENSIONS, validate_path_contains_files_of_type
 
@@ -46,7 +45,7 @@ def get_file_column_names(file_reference: str | Path | HfFileSystem, file_type:
     """
     if file_type == "parquet":
         try:
-            schema = pq.read_schema(file_reference)
+            schema = lazy_imports.pq.read_schema(file_reference)
             if hasattr(schema, "names"):
                 return schema.names
             else:
@@ -55,12 +54,12 @@
             logger.warning(f"Failed to process parquet file {file_reference}: {e}")
             return []
     elif file_type in ["json", "jsonl"]:
-        return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
+        return lazy_imports.pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
     elif file_type == "csv":
         try:
-            df = pd.read_csv(file_reference, nrows=1)
+            df = lazy_imports.pd.read_csv(file_reference, nrows=1)
             return df.columns.tolist()
-        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
+        except (lazy_imports.pd.errors.EmptyDataError, lazy_imports.pd.errors.ParserError) as e:
             logger.warning(f"Failed to process CSV file {file_reference}: {e}")
             return []
     else:
diff --git a/src/data_designer/config/sampler_params.py b/src/data_designer/config/sampler_params.py
index 0bb03451..ce7b3a40 100644
--- a/src/data_designer/config/sampler_params.py
+++ b/src/data_designer/config/sampler_params.py
@@ -4,10 +4,10 @@
 from enum import Enum
 from typing import Literal
 
-import pandas as pd
 from pydantic import Field, field_validator, model_validator
 from typing_extensions import Self, TypeAlias
 
+from data_designer import lazy_imports
 from data_designer.config.base import ConfigBase
 from data_designer.config.utils.constants import (
     AVAILABLE_LOCALES,
@@ -113,7 +113,7 @@ class DatetimeSamplerParams(ConfigBase):
     @classmethod
     def _validate_param_is_datetime(cls, value: str) -> str:
         try:
-            pd.to_datetime(value)
+            lazy_imports.pd.to_datetime(value)
         except ValueError:
             raise ValueError(f"Invalid datetime format: {value}")
         return value
diff --git a/src/data_designer/engine/__init__.py b/src/data_designer/engine/__init__.py
index 1a8431c3..848a7e17 100644
--- a/src/data_designer/engine/__init__.py
+++ b/src/data_designer/engine/__init__.py
@@ -1,2 +1,108 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+
+"""
+Engine module with fully automatic lazy loading.
+
+This module automatically discovers ALL engine modules and their public classes/functions,
+providing a facade that lazily imports components only when accessed. This significantly
+improves import performance while requiring ZERO maintenance - just add a module and it's
+automatically exported.
+
+Note: Private modules (starting with _) are excluded from auto-discovery.
+"""
+
+import ast
+import importlib
+from pathlib import Path
+
+
+def _discover_all_engine_exports() -> dict[str, tuple[str, str]]:
+    """Automatically discover all public classes/functions in the engine package.
+
+    Scans the engine directory recursively for all Python files, parses them
+    with AST (without importing), and builds a mapping of all public exports.
+
+    Returns:
+        Dictionary mapping public names to (module_path, attribute_name) tuples.
+    """
+    lazy_imports = {}
+    engine_dir = Path(__file__).parent
+
+    # Find all Python files in engine directory recursively
+    for py_file in engine_dir.rglob("*.py"):
+        # Skip __init__.py files and private modules (starting with _)
+        if py_file.name.startswith("_"):
+            continue
+
+        # Convert file path to module path
+        # e.g., dataset_builders/column_wise_builder.py -> data_designer.engine.dataset_builders.column_wise_builder
+        rel_path = py_file.relative_to(engine_dir.parent)
+        module_parts = list(rel_path.parts[:-1]) + [rel_path.stem]
+        module_path = ".".join(["data_designer"] + module_parts)
+
+        try:
+            # Parse the Python file with AST (doesn't import it - fast!)
+            with open(py_file, "r", encoding="utf-8") as f:
+                tree = ast.parse(f.read(), filename=str(py_file))
+
+            # Find all top-level public classes and functions
+            for node in tree.body:
+                if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
+                    name = node.name
+                    # Only export public items (no leading underscore)
+                    if not name.startswith("_"):
+                        # Avoid name collisions - first one wins
+                        if name not in lazy_imports:
+                            lazy_imports[name] = (module_path, name)
+        except Exception:
+            # If AST parsing fails, skip this module silently
+            pass
+
+    return lazy_imports
+
+
+# Cache for lazy imports - built on first access
+_LAZY_IMPORTS_CACHE: dict[str, tuple[str, str]] | None = None
+
+
+def __getattr__(name: str) -> object:
+    """Lazily import engine components when accessed.
+
+    On first access, automatically discovers all public classes/functions in the
+    engine package. Subsequent accesses use the cached mapping for fast lookups.
+
+    Args:
+        name: The name of the attribute to import.
+
+    Returns:
+        The imported class, function, or object.
+
+    Raises:
+        AttributeError: If the attribute is not found in any engine module.
+    """
+    global _LAZY_IMPORTS_CACHE
+
+    # Build cache on first access
+    if _LAZY_IMPORTS_CACHE is None:
+        _LAZY_IMPORTS_CACHE = _discover_all_engine_exports()
+
+    if name in _LAZY_IMPORTS_CACHE:
+        module_path, attr_name = _LAZY_IMPORTS_CACHE[name]
+        # Dynamically import the module
+        module = importlib.import_module(module_path)
+        # Get the attribute from the module
+        return getattr(module, attr_name)
+
+    raise AttributeError(f"module 'data_designer.engine' has no attribute {name!r}")
+
+
+def __dir__() -> list[str]:
+    """Return list of all available lazy imports for introspection."""
+    global _LAZY_IMPORTS_CACHE
+
+    # Build cache if not already built
+    if _LAZY_IMPORTS_CACHE is None:
+        _LAZY_IMPORTS_CACHE = _discover_all_engine_exports()
+
+    return list(_LAZY_IMPORTS_CACHE.keys())
diff --git a/src/data_designer/interface/data_designer.py b/src/data_designer/interface/data_designer.py
index 0458dc60..c2c16a24 100644
--- a/src/data_designer/interface/data_designer.py
+++ b/src/data_designer/interface/data_designer.py
@@ -1,11 +1,13 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import logging
 from pathlib import Path
 
-import pandas as pd
-
+# Lazy-loaded third-party and engine components via facades
+from data_designer import engine, lazy_imports
 from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
 from data_designer.config.config_builder import DataDesignerConfigBuilder
 from data_designer.config.default_model_settings import (
@@ -30,21 +32,7 @@
 )
 from data_designer.config.utils.info import InfoType, InterfaceInfo
 from data_designer.config.utils.io_helpers import write_seed_dataset
-from data_designer.engine.analysis.dataset_profiler import (
-    DataDesignerDatasetProfiler,
-    DatasetProfilerConfig,
-)
-from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
-from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
-from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
 from data_designer.engine.model_provider import resolve_model_provider_registry
-from data_designer.engine.models.registry import create_model_registry
-from data_designer.engine.resources.managed_storage import init_managed_blob_storage
-from data_designer.engine.resources.resource_provider import ResourceProvider
-from data_designer.engine.resources.seed_dataset_data_store import (
-    HfHubSeedDatasetDataStore,
-    LocalSeedDatasetDataStore,
-)
 from data_designer.engine.secret_resolver import (
     CompositeResolver,
     EnvironmentResolver,
@@ -119,7 +107,7 @@ def make_seed_reference_from_file(file_path: str | Path) -> LocalSeedDatasetRefe
 
     @classmethod
     def make_seed_reference_from_dataframe(
-        cls, dataframe: pd.DataFrame, file_path: str | Path
+        cls, dataframe: lazy_imports.pd.DataFrame, file_path: str | Path
     ) -> LocalSeedDatasetReference:
         """Create a seed dataset reference from a pandas DataFrame.
 
@@ -239,7 +227,7 @@ def preview(
 
         dropped_columns = raw_dataset.columns.difference(processed_dataset.columns)
         if len(dropped_columns) > 0:
-            dataset_for_profiler = pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
+            dataset_for_profiler = lazy_imports.pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
         else:
             dataset_for_profiler = processed_dataset
 
@@ -251,7 +239,7 @@
 
         if builder.artifact_storage.processors_outputs_path.exists():
             processor_artifacts = {
-                processor_config.name: pd.read_parquet(
+                processor_config.name: lazy_imports.pd.read_parquet(
                     builder.artifact_storage.processors_outputs_path / f"{processor_config.name}.parquet",
                     dtype_backend="pyarrow",
                 ).to_dict(orient="records")
@@ -334,19 +322,19 @@ def _resolve_model_providers(self, model_providers: list[ModelProvider] | None)
         return model_providers or []
 
     def _create_dataset_builder(
-        self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
-    ) -> ColumnWiseDatasetBuilder:
-        return ColumnWiseDatasetBuilder(
-            column_configs=compile_dataset_builder_column_configs(config_builder.build(raise_exceptions=True)),
+        self, config_builder: DataDesignerConfigBuilder, resource_provider: engine.ResourceProvider
+    ) -> engine.ColumnWiseDatasetBuilder:
+        return engine.ColumnWiseDatasetBuilder(
+            column_configs=engine.compile_dataset_builder_column_configs(config_builder.build(raise_exceptions=True)),
             processor_configs=config_builder.get_processor_configs(),
             resource_provider=resource_provider,
         )
 
     def _create_dataset_profiler(
-        self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
-    ) -> DataDesignerDatasetProfiler:
-        return DataDesignerDatasetProfiler(
-            config=DatasetProfilerConfig(
+        self, config_builder: DataDesignerConfigBuilder, resource_provider: engine.ResourceProvider
+    ) -> engine.DataDesignerDatasetProfiler:
+        return engine.DataDesignerDatasetProfiler(
+            config=engine.DatasetProfilerConfig(
                 column_configs=config_builder.get_column_configs(),
                 column_profiler_configs=config_builder.get_profilers(),
             ),
@@ -355,21 +343,21 @@
 
     def _create_resource_provider(
         self, dataset_name: str, config_builder: DataDesignerConfigBuilder
-    ) -> ResourceProvider:
+    ) -> engine.ResourceProvider:
         model_configs = config_builder.model_configs
-        ArtifactStorage.mkdir_if_needed(self._artifact_path)
-        return ResourceProvider(
-            artifact_storage=ArtifactStorage(artifact_path=self._artifact_path, dataset_name=dataset_name),
-            model_registry=create_model_registry(
+        engine.ArtifactStorage.mkdir_if_needed(self._artifact_path)
+        return engine.ResourceProvider(
+            artifact_storage=engine.ArtifactStorage(artifact_path=self._artifact_path, dataset_name=dataset_name),
+            model_registry=engine.create_model_registry(
                 model_configs=model_configs,
                 model_provider_registry=self._model_provider_registry,
                 secret_resolver=self._secret_resolver,
             ),
-            blob_storage=init_managed_blob_storage(str(self._managed_assets_path)),
+            blob_storage=engine.init_managed_blob_storage(str(self._managed_assets_path)),
             datastore=(
-                LocalSeedDatasetDataStore()
+                engine.LocalSeedDatasetDataStore()
                 if (settings := config_builder.get_seed_datastore_settings()) is None
-                else HfHubSeedDatasetDataStore(
+                else engine.HfHubSeedDatasetDataStore(
                     endpoint=settings.endpoint,
                     token=settings.token,
                 )
diff --git a/src/data_designer/lazy_imports.py b/src/data_designer/lazy_imports.py
new file mode 100644
index 00000000..456bd3b9
--- /dev/null
+++ b/src/data_designer/lazy_imports.py
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Lazy imports facade for heavy third-party dependencies.
+
+This module provides a centralized facade that lazily imports heavy dependencies
+(pandas, pyarrow, etc.) only when accessed, significantly improving import performance.
+
+Usage:
+    from data_designer import lazy_imports
+
+    df = lazy_imports.pd.DataFrame(...)
+    schema = lazy_imports.pq.read_schema(...)
+"""
+
+
+def __getattr__(name: str) -> object:
+    """Lazily import heavy third-party dependencies when accessed.
+
+    This allows fast imports of data_designer while deferring loading of heavy
+    libraries like pandas and pyarrow until they're actually needed.
+
+    Supported imports:
+        - pd: pandas module
+        - pq: pyarrow.parquet module
+    """
+    if name == "pd":
+        import pandas as pd
+
+        return pd
+    elif name == "pq":
+        import pyarrow.parquet as pq
+
+        return pq
+
+    raise AttributeError(f"module 'data_designer.lazy_imports' has no attribute {name!r}")
+
+
+# Expose the supported lazy attribute names to dir() and IDE autocomplete
+def __dir__() -> list[str]:
+    """Return list of available lazy imports."""
+    return ["pd", "pq"]
diff --git a/tests/config/test_datastore.py b/tests/config/test_datastore.py
index a2fe4d5f..0a7f8827 100644
--- a/tests/config/test_datastore.py
+++ b/tests/config/test_datastore.py
@@ -132,7 +132,7 @@ def test_get_file_column_names_with_filesystem_parquet():
     mock_schema = MagicMock()
     mock_schema.names = ["col1", "col2", "col3"]
 
-    with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
+    with patch("data_designer.lazy_imports.pq.read_schema") as mock_read_schema:
         mock_read_schema.return_value = mock_schema
 
         result = get_file_column_names("datasets/test/file.parquet", "parquet")
@@ -161,11 +161,11 @@ def test_get_file_column_names_error_handling():
     with pytest.raises(InvalidFilePathError, match="🛑 Unsupported file type: 'txt'"):
        get_file_column_names("test.txt", "txt")
 
-    with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
+    with patch("data_designer.lazy_imports.pq.read_schema") as mock_read_schema:
         mock_read_schema.side_effect = Exception("Test error")
         assert get_file_column_names("test.txt", "parquet") == []
 
-    with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
+    with patch("data_designer.lazy_imports.pq.read_schema") as mock_read_schema:
         mock_col1 = MagicMock()
         mock_col1.name = "col1"
         mock_col2 = MagicMock()
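
Usage sketch (not part of the patch): how the new `data_designer.lazy_imports` facade is meant to be exercised once this diff is applied. The `sys.modules` checks assume nothing else in the process has imported pandas yet, so treat them as illustrative rather than guaranteed.

    import sys

    from data_designer import lazy_imports

    # pandas is not imported just by importing the facade module
    # (assumes nothing else in this process has pulled it in yet).
    print("pandas loaded before access:", "pandas" in sys.modules)

    # First attribute access runs `import pandas` inside module-level __getattr__.
    df = lazy_imports.pd.DataFrame({"a": [1, 2, 3]})
    print("pandas loaded after access:", "pandas" in sys.modules)

    # __dir__ advertises the supported lazy attributes to dir() and IDEs.
    print(dir(lazy_imports))  # ['pd', 'pq']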
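
A similar sketch for the auto-discovering `engine` facade; `ColumnWiseDatasetBuilder` is used here only because the diff itself accesses it as `engine.ColumnWiseDatasetBuilder`, and any other public class under `data_designer/engine/` should resolve the same way.

    from data_designer import engine

    # dir() lists the AST-discovered public exports without importing their modules.
    print("ColumnWiseDatasetBuilder" in dir(engine))

    # Attribute access imports the defining module via importlib and returns the class.
    builder_cls = engine.ColumnWiseDatasetBuilder
    print(builder_cls.__module__)  # e.g. data_designer.engine.dataset_builders.column_wise_builder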