Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions dataframely/_base_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ class Metadata:
rules: dict[str, RuleFactory] = field(default_factory=dict)

def update(self, other: Self) -> None:
if duplicated_column_names := self.columns.keys() & other.columns.keys():
raise ImplementationError(
f"Columns {duplicated_column_names} are duplicated."
Comment on lines +88 to +89
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exception message uses the raw set representation of duplicated_column_names (derived from key-set intersection). For multiple duplicates this will be non-deterministically ordered, which can make debugging and tests brittle. Consider formatting a stable, sorted list of duplicated names (and optionally quoting each name) instead of interpolating the set directly.

Suggested change
raise ImplementationError(
f"Columns {duplicated_column_names} are duplicated."
formatted_names = ", ".join(
sorted(repr(name) for name in duplicated_column_names)
)
raise ImplementationError(
f"Columns {formatted_names} are duplicated."

Copilot uses AI. Check for mistakes.
)
Comment on lines +87 to +90
Copy link

Copilot AI Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metadata.update() now raises on any overlapping column keys between self and other. Because _get_metadata_recursively() traverses __bases__ recursively (not the MRO), diamond / multiple-inheritance patterns can visit the same ancestor more than once (e.g. A->B, A->C, D(B, C)) and would now raise even though the duplicated columns come from the same original definition. Consider de-duplicating bases during metadata collection (e.g. walk a linearized MRO once) or, at minimum, allow overlaps where the column objects are identical and only raise when the overlapping key maps to different Column instances.

Suggested change
if duplicated_column_names := self.columns.keys() & other.columns.keys():
raise ImplementationError(
f"Columns {duplicated_column_names} are duplicated."
)
"""Merge another Metadata instance into this one.
Overlapping keys are allowed if and only if they refer to the *same*
underlying object. This accommodates multiple-inheritance / diamond
patterns where the same base schema is visited more than once.
"""
# Detect conflicting column definitions: same name, different Column instance
duplicated_column_names = self.columns.keys() & other.columns.keys()
conflicting_columns = {
name
for name in duplicated_column_names
if self.columns[name] is not other.columns[name]
}
if conflicting_columns:
raise ImplementationError(
f"Columns {conflicting_columns} are duplicated with conflicting definitions."
)
# Detect conflicting rule factories: same name, different object
duplicated_rule_names = self.rules.keys() & other.rules.keys()
conflicting_rules = {
name
for name in duplicated_rule_names
if self.rules[name] is not other.rules[name]
}
if conflicting_rules:
raise ImplementationError(
f"Rules {conflicting_rules} are duplicated with conflicting definitions."
)

Copilot uses AI. Check for mistakes.
self.columns.update(other.columns)
self.rules.update(other.rules)

Expand Down Expand Up @@ -203,6 +207,8 @@ def _get_metadata(source: dict[str, Any]) -> Metadata:
k: v for k, v in source.items() if not k.startswith("__")
}.items():
if isinstance(value, Column):
if (col_name := value.alias or attr) in result.columns:
raise ImplementationError(f"Column {col_name!r} is duplicated.")
result.columns[value.alias or attr] = value
if isinstance(value, RuleFactory):
# We must ensure that custom rules do not clash with internal rules.
Expand Down
20 changes: 20 additions & 0 deletions tests/columns/test_alias.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
# SPDX-License-Identifier: BSD-3-Clause

import polars as pl
import pytest

import dataframely as dy
from dataframely.exc import ImplementationError


class AliasSchema(dy.Schema):
Expand Down Expand Up @@ -36,3 +38,21 @@ def test_alias_unset() -> None:
no_alias_col = dy.Int32()
assert no_alias_col.alias is None
assert no_alias_col.name == ""


def test_duplicate_alias_same_schema() -> None:
    """Reject a schema whose own columns resolve to the same alias.

    Both attributes below carry ``alias="a"``. The error is expected from the
    class statement itself, so the definition sits inside the
    ``pytest.raises`` context.
    """
    expected_error = pytest.raises(ImplementationError, match="'a' is duplicated")
    with expected_error:

        class MySchema(dy.Schema):
            a = dy.Int64(alias="a")
            b = dy.String(alias="a")


def test_duplicate_alias_inherited_schema() -> None:
    """Reject a subclass column whose alias collides with an inherited one."""

    class MySchema(dy.Schema):
        a = dy.Int64(alias="a")

    # The parent is defined outside the context manager: only the subclass
    # definition, which reuses alias "a", is expected to raise.
    expected_error = pytest.raises(ImplementationError, match="'a'.*duplicated")
    with expected_error:

        class MySchema2(MySchema):
            b = dy.Int64(alias="a")
Loading