Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/python/lance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@
ts_types = Union[datetime, pd.Timestamp, str]


class FieldNotFoundError(ValueError):
    """Raised when a requested field is not found in the schema.

    Derives from ``ValueError`` so that callers which already catch
    ``ValueError`` for bad column names keep working; the native layer falls
    back to ``ValueError`` when this class cannot be imported.
    """


__all__ = [
"Blob",
"BlobArray",
Expand All @@ -59,6 +65,7 @@
"blob_field",
"DatasetBasePath",
"DataStatistics",
"FieldNotFoundError",
"FieldStatistics",
"FragmentMetadata",
"Index",
Expand Down
30 changes: 30 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import pyarrow.parquet as pq
import pytest
from helper import ProgressForTest
from lance import FieldNotFoundError
from lance._dataset.sharded_batch_iterator import ShardedBatchIterator
from lance.commit import CommitConflictError
from lance.dataset import LANCE_COMMIT_MESSAGE_KEY, AutoCleanupConfig
Expand Down Expand Up @@ -5094,3 +5095,32 @@ def test_default_scan_options_nearest(tmp_path: Path) -> None:
assert distances == sorted(distances)

assert "id" in result.column_names


def test_file_not_field_not_found_error():
    """Scanning an unknown column raises FieldNotFoundError with a helpful message."""
    table = pa.table(
        {
            "id": [1, 2, 3],
            "foo": ["a", "b", "c"],
            "bar": [0.1, 0.2, 0.3],
        }
    )
    dataset = lance.write_dataset(table, "memory://")

    # (requested column, exact message expected from the raised error)
    cases = [
        # Too different to suggest anything
        (
            "non_existent_field",
            "FieldNotFoundError: Field 'non_existent_field' not found."
            "\nAvailable fields: ['id', 'foo', 'bar']",
        ),
        # Close enough to suggest 'bar'
        (
            "baz",
            "FieldNotFoundError: Field 'baz' not found. Did you mean 'bar'?"
            "\nAvailable fields: ['id', 'foo', 'bar']",
        ),
    ]

    for column, expected in cases:
        with pytest.raises(FieldNotFoundError) as exc_info:
            dataset.scanner(columns=[column]).to_table()
        assert str(exc_info.value) == expected
30 changes: 29 additions & 1 deletion python/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::LazyLock;

use lance_namespace::error::NamespaceError;
use pyo3::{
exceptions::{PyIOError, PyNotImplementedError, PyRuntimeError, PyValueError},
types::{PyAnyMethods, PyModule},
BoundObject, PyErr, PyResult, Python,
BoundObject, Py, PyAny, PyErr, PyResult, Python,
};

use lance::Error as LanceError;

/// Cached handle to the Python-side `lance.FieldNotFoundError` class.
///
/// The lookup runs once, on first access; the stored `PyResult` keeps either
/// the unbound class object or the error raised while importing it.
// NOTE(review): the initializer acquires the interpreter from inside a
// `LazyLock`; pyo3 documents its own once-cell type for this situation --
// confirm this cannot block against another thread holding the interpreter.
static PY_FIELD_NOT_FOUND_ERROR: LazyLock<PyResult<Py<PyAny>>> = LazyLock::new(|| {
    Python::attach(|py| {
        let module = py.import("lance")?;
        let exception_class = module.getattr("FieldNotFoundError")?;
        Ok(exception_class.unbind())
    })
});

/// Try to convert a NamespaceError to the corresponding Python exception.
/// Returns the appropriate Python exception from lance_namespace.errors module.
fn namespace_error_to_pyerr(py: Python<'_>, ns_err: &NamespaceError) -> PyErr {
Expand Down Expand Up @@ -79,6 +90,23 @@ impl<T> PythonErrorExt<T> for std::result::Result<T, LanceError> {
LanceError::NotFound { .. } => self.value_error(),
LanceError::RefNotFound { .. } => self.value_error(),
LanceError::VersionNotFound { .. } => self.value_error(),
LanceError::FieldNotFound { source } => {
    // Render the message up front so the closure below owns a plain String.
    let msg = source.to_string();
    match &*PY_FIELD_NOT_FOUND_ERROR {
        Ok(exc_type) => Python::attach(|py| {
            // Instantiate the Python exception class. If the constructor
            // itself raises, hand that error back to the caller rather
            // than panicking inside the error-conversion path.
            match exc_type.bind(py).call1((msg,)) {
                Ok(exc) => Err(PyErr::from_value(exc)),
                Err(ctor_err) => Err(ctor_err),
            }
        }),
        Err(_) => {
            // The Python class could not be imported; degrade to the
            // exception type used for the other "not found" variants.
            log::warn!(
                "Failed to import FieldNotFoundError from lance module, falling back to ValueError"
            );
            self.value_error()
        }
    }
}
LanceError::Namespace { source, .. } => {
// Try to downcast to NamespaceError and convert to proper Python exception
if let Some(ns_err) = source.downcast_ref::<NamespaceError>() {
Expand Down
50 changes: 37 additions & 13 deletions rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,9 @@ impl FieldRef<'_> {
Ok(id)
}
FieldRef::ByPath(path) => {
let field = schema.field(path).ok_or_else(|| Error::InvalidInput {
source: format!("Field '{}' not found in schema", path).into(),
location: location!(),
})?;
let field = schema
.field(path)
.ok_or_else(|| Error::field_not_found(path, schema.field_paths()))?;
Ok(field.id)
}
}
Expand Down Expand Up @@ -235,10 +234,7 @@ impl Schema {
candidates.push(projected_field)
}
} else if err_on_missing && first != ROW_ID && first != ROW_ADDR {
return Err(Error::Schema {
message: format!("Column {} does not exist", col.as_ref()),
location: location!(),
});
return Err(Error::field_not_found(col.as_ref(), self.field_paths()));
}
}

Expand Down Expand Up @@ -347,6 +343,27 @@ impl Schema {
SchemaFieldIterPreOrder::new(self)
}

/// Collect the dotted path of every field in the schema.
///
/// Fields are visited in pre-order, so nested fields are included: a struct
/// field "user" containing a field "name" contributes "user.name". A field
/// whose ancestry cannot be resolved is left out of the result.
pub fn field_paths(&self) -> Vec<String> {
    self.fields_pre_order()
        .filter_map(|field| self.field_ancestry_by_id(field.id))
        .map(|ancestry| {
            ancestry
                .iter()
                .map(|f| f.name.as_str())
                .collect::<Vec<_>>()
                .join(".")
        })
        .collect()
}

/// Returns a new schema that only contains the fields in `column_ids`.
///
/// This projection can filter out both top-level and nested fields
Expand Down Expand Up @@ -507,12 +524,19 @@ impl Schema {

// TODO: This is not a public API, change to pub(crate) after refactor is done.
pub fn field_id(&self, column: &str) -> Result<i32> {
self.field(column)
.map(|f| f.id)
.ok_or_else(|| Error::Schema {
message: "Vector column not in schema".to_string(),
self.field(column).map(|f| f.id).ok_or_else(|| {
let paths = self.field_paths();
let field_paths: Vec<&str> = paths.iter().map(|s| s.as_str()).collect();
let suggestion = crate::levenshtein::find_best_suggestion(column, &field_paths);
let mut error_msg = format!("Vector column '{}' not in schema", column);
if let Some(suggestion) = suggestion {
error_msg = format!("{}. Did you mean '{}'?", error_msg, suggestion);
}
Error::Schema {
message: error_msg.to_string(),
location: location!(),
})
}
})
}

pub fn top_level_field_ids(&self) -> Vec<i32> {
Expand Down
55 changes: 54 additions & 1 deletion rust/lance-core/src/error.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::fmt;

use arrow_schema::ArrowError;
use snafu::{Location, Snafu};

type BoxedError = Box<dyn std::error::Error + Send + Sync + 'static>;

/// Error for when a requested field is not found in a schema.
///
/// This error computes suggestions lazily (only when displayed) to avoid
/// computing Levenshtein distance when the error is created but never shown.
/// It stores only the raw inputs; the `Display` implementation derives the
/// "Did you mean" hint and the list of available fields from them.
#[derive(Debug)]
pub struct FieldNotFoundError {
/// Name (or path) of the field that was requested but not found.
pub field_name: String,
/// Field names offered as alternatives; searched for a close match and
/// listed under "Available fields" when the error is displayed.
pub candidates: Vec<String>,
}

impl fmt::Display for FieldNotFoundError {
    /// Render the error as a "not found" sentence, an optional
    /// "Did you mean" hint, and a bracketed list of available fields.
    ///
    /// At most ten candidates are listed; any beyond that are summarised as
    /// a count. The closest-match search runs here, not at construction.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Upper bound on how many candidates are spelled out.
        const MAX_LISTED: usize = 10;

        write!(f, "Field '{}' not found.", self.field_name)?;
        if let Some(hint) =
            crate::levenshtein::find_best_suggestion(&self.field_name, &self.candidates)
        {
            write!(f, " Did you mean '{}'?", hint)?;
        }

        let listed = self
            .candidates
            .iter()
            .take(MAX_LISTED)
            .map(|name| format!("'{}'", name))
            .collect::<Vec<_>>()
            .join(", ");
        write!(f, "\nAvailable fields: [{}", listed)?;

        let hidden = self.candidates.len().saturating_sub(MAX_LISTED);
        if hidden > 0 {
            write!(f, ", ... and {} more]", hidden)
        } else {
            write!(f, "]")
        }
    }
}

// Empty impl: relies on the trait's default methods, so this error reports no
// underlying `source()`; it exists so the type can be used as an error source.
impl std::error::Error for FieldNotFoundError {}

/// Allocates error on the heap and then places `e` into it.
#[inline]
pub fn box_error(e: impl std::error::Error + Send + Sync + 'static) -> BoxedError {
Expand Down Expand Up @@ -125,6 +164,10 @@ pub enum Error {
/// or inspected using [`Error::external_source`].
#[snafu(transparent)]
External { source: BoxedError },

/// A requested field was not found in a schema.
#[snafu(transparent)]
FieldNotFound { source: FieldNotFoundError },
}

impl Error {
Expand Down Expand Up @@ -197,6 +240,16 @@ impl Error {
Self::External { source }
}

/// Build a [`Error::FieldNotFound`] for `field_name`.
///
/// `candidates` are the field names the caller considers available; they
/// are stored as-is on the wrapped [`FieldNotFoundError`].
pub fn field_not_found(field_name: impl Into<String>, candidates: Vec<String>) -> Self {
    let source = FieldNotFoundError {
        field_name: field_name.into(),
        candidates,
    };
    Self::FieldNotFound { source }
}

/// Returns a reference to the external error source if this is an `External` variant.
///
/// This allows downcasting to recover the original error type.
Expand Down Expand Up @@ -246,7 +299,7 @@ impl ToSnafuLocation for std::panic::Location<'static> {
}
}

pub type Result<T> = std::result::Result<T, Error>;
pub type Result<T, E = Error> = std::result::Result<T, E>;
pub type ArrowResult<T> = std::result::Result<T, ArrowError>;
#[cfg(feature = "datafusion")]
pub type DataFusionResult<T> = std::result::Result<T, datafusion_common::DataFusionError>;
Expand Down
Loading