From 51eb0fff380466f99320e477965400b8afb896c9 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Wed, 13 May 2026 22:15:25 -0400 Subject: [PATCH 01/27] feat(contrib): add ContribOp proto envelope + Rust planner registry SPI First two pieces of the contrib extension SPI (PR1.1 + PR1.2 from docs/contrib-delta-migration-plan.md). No consumers yet -- the dispatcher arm, Scala SPI, and integration hooks land in subsequent commits on this branch. Proto envelope (operator.proto): - New ContribOp { kind: string, payload: bytes } message added as variant 117 on the OpStruct oneof. Contrib operators travel through this envelope so core's proto stays stable when contribs ship and evolve independently. Rust SPI (planner/contrib.rs): - register_contrib_planner(kind, planner: Arc<dyn OperatorBuilder>) -- intended to be called from a contrib crate's #[ctor] at lib init time. Last-write-wins on duplicate kinds (test re-registration convenience; production contribs only register once). - lookup_contrib_planner_by_kind(kind) -> Option<Arc<dyn OperatorBuilder>> -- read path the dispatcher (PR1.3) will use. - registered_contrib_kinds() -> Vec<String> -- diagnostics. - 2 unit tests covering registration round-trip + duplicate-kind replacement; both pass. Exhaustive-match accommodations (operator_registry.rs, jni_api.rs): - operator_registry::get_operator_type returns None for ContribOp; PR1.3 will add the dispatcher arm in planner.rs that bypasses this lookup and goes through the contrib registry instead. - jni_api::op_name returns "ContribOp" for the new variant (informational tracing label). The reused trait is core's existing OperatorBuilder rather than a new ContribOperatorPlanner trait -- their signatures are identical and duplicating would force contribs to maintain a parallel definition. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- native/core/src/execution/jni_api.rs | 5 + native/core/src/execution/planner.rs | 1 + native/core/src/execution/planner/contrib.rs | 143 ++++++++++++++++++ .../execution/planner/operator_registry.rs | 6 + native/proto/src/proto/operator.proto | 22 +++ 5 files changed, 177 insertions(+) create mode 100644 native/core/src/execution/planner/contrib.rs diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index f5b04cc51d..c35a9a6e25 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -232,6 +232,11 @@ fn op_name(op: &OpStruct) -> &'static str { OpStruct::Explode(_) => "Explode", OpStruct::CsvScan(_) => "CsvScan", OpStruct::ShuffleScan(_) => "ShuffleScan", + // Contrib operators carry their concrete identity in `ContribOp.kind`, but + // `op_name` returns `&'static str` for tracing/error messages. Keep the label + // generic here; downstream code that needs the specific contrib reads `kind` + // off the proto directly. + OpStruct::ContribOp(_) => "ContribOp", } } diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index b00f140026..cc4c145345 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -17,6 +17,7 @@ //! Converts Spark physical plan to DataFusion physical plan +pub mod contrib; pub mod expression_registry; pub mod macros; pub mod operator_registry; diff --git a/native/core/src/execution/planner/contrib.rs b/native/core/src/execution/planner/contrib.rs new file mode 100644 index 0000000000..f185446edb --- /dev/null +++ b/native/core/src/execution/planner/contrib.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Registry for contrib operator planners. +//! +//! Contribs are extension crates that ship Spark plan operators living outside core (Delta, +//! example, future Hudi/DeltaSharing, etc.). They link into core's cdylib as Cargo `rlib` +//! dependencies enabled via core's Cargo feature flags (e.g. `contrib-delta`, +//! `contrib-example`). At library-init time (typically via `#[ctor]` in the contrib crate), +//! each contrib calls [`register_contrib_planner`] with a stable `kind` string and an +//! [`OperatorBuilder`] implementation. Core's `OpStruct::ContribOp` dispatcher arm then +//! looks up the planner by `kind` and delegates plan construction to it. +//! +//! See `docs/contrib-delta-migration-plan.md` for the broader architecture. + +use std::{ + collections::HashMap, + sync::{Arc, OnceLock, RwLock}, +}; + +use super::operator_registry::OperatorBuilder; + +/// Process-wide registry of contrib operator planners, keyed by `ContribOp.kind`. +/// +/// Implemented as an `OnceLock<RwLock<HashMap>>` so: +/// * The OnceLock makes lazy first-touch initialisation thread-safe. +/// * The inner RwLock serialises registrations from multiple contribs at lib-init time +/// (e.g. independent `#[ctor]` invocations) while letting later lookups run in parallel. +/// +/// Registration is cheap and happens once per contrib per process; lookups are read-mostly. 
+fn registry() -> &'static RwLock>> { + static REGISTRY: OnceLock>>> = OnceLock::new(); + REGISTRY.get_or_init(|| RwLock::new(HashMap::new())) +} + +/// Register a contrib operator planner under the given `kind` identifier. +/// +/// `kind` must match the value the contrib's JVM-side serde writes into the +/// `ContribOp.kind` proto field. Convention: lowercase-hyphenated, prefixed by the +/// contrib's short name (e.g. `delta-scan`, `example-constant-scan`). +/// +/// If a planner is already registered for `kind`, this REPLACES it and logs a warning. +/// Last-write-wins lets test harnesses re-register without restarting the JVM, and +/// production contribs only ever register once per process. +/// +/// Thread-safe; intended to be called from a contrib's `#[ctor]` at library init. +pub fn register_contrib_planner(kind: impl Into, planner: Arc) { + let kind = kind.into(); + let mut guard = registry() + .write() + .expect("contrib planner registry poisoned"); + if guard.contains_key(&kind) { + log::warn!( + "register_contrib_planner: replacing existing planner for kind={kind:?}; \ + second registration usually indicates a misconfigured test harness" + ); + } + guard.insert(kind, planner); +} + +/// Look up the contrib planner registered for `kind`, or `None` if no contrib is loaded +/// for that operator. The native dispatcher arm in `planner.rs` uses this to route +/// `OpStruct::ContribOp` payloads. +pub fn lookup_contrib_planner_by_kind(kind: &str) -> Option> { + let guard = registry() + .read() + .expect("contrib planner registry poisoned"); + guard.get(kind).cloned() +} + +/// Return a snapshot of all registered contrib kinds. Useful for diagnostics and tests. 
+pub fn registered_contrib_kinds() -> Vec { + let guard = registry() + .read() + .expect("contrib planner registry poisoned"); + let mut kinds: Vec = guard.keys().cloned().collect(); + kinds.sort(); + kinds +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::execution::operators::{ExecutionError, ScanExec, ShuffleScanExec}; + use crate::execution::planner::PhysicalPlanner; + use crate::execution::spark_plan::SparkPlan; + use datafusion_comet_proto::spark_operator::Operator; + use jni::objects::{Global, JObject}; + + /// Trivial test planner that returns a not-implemented error. We don't need a real + /// ExecutionPlan to validate the registry; only identity-by-kind matters. + struct NoopBuilder(&'static str); + impl OperatorBuilder for NoopBuilder { + fn build( + &self, + _spark_plan: &Operator, + _inputs: &mut Vec>>>, + _partition_count: usize, + _planner: &PhysicalPlanner, + ) -> Result<(Vec, Vec, Arc), ExecutionError> { + Err(ExecutionError::GeneralError(format!( + "NoopBuilder({}) -- registry round-trip ok", + self.0 + ))) + } + } + + #[test] + fn register_and_lookup_round_trips_by_kind() { + register_contrib_planner("test-kind-a", Arc::new(NoopBuilder("a"))); + register_contrib_planner("test-kind-b", Arc::new(NoopBuilder("b"))); + + assert!(lookup_contrib_planner_by_kind("test-kind-a").is_some()); + assert!(lookup_contrib_planner_by_kind("test-kind-b").is_some()); + assert!(lookup_contrib_planner_by_kind("test-kind-c").is_none()); + + let kinds = registered_contrib_kinds(); + assert!(kinds.contains(&"test-kind-a".to_string())); + assert!(kinds.contains(&"test-kind-b".to_string())); + } + + #[test] + fn registering_existing_kind_replaces() { + register_contrib_planner("test-replace-kind", Arc::new(NoopBuilder("first"))); + // Second registration should not panic; replaces silently (with a warn-level log). 
+ register_contrib_planner("test-replace-kind", Arc::new(NoopBuilder("second"))); + assert!(lookup_contrib_planner_by_kind("test-replace-kind").is_some()); + } +} diff --git a/native/core/src/execution/planner/operator_registry.rs b/native/core/src/execution/planner/operator_registry.rs index eb31184461..81d5151717 100644 --- a/native/core/src/execution/planner/operator_registry.rs +++ b/native/core/src/execution/planner/operator_registry.rs @@ -151,5 +151,11 @@ fn get_operator_type(spark_operator: &Operator) -> Option { OpStruct::Explode(_) => None, // Not yet in OperatorType enum OpStruct::CsvScan(_) => Some(OperatorType::CsvScan), OpStruct::ShuffleScan(_) => None, // Not yet in OperatorType enum + // Contrib operators go through the contrib registry instead, keyed by + // ContribOp.kind. Returning None here keeps `OperatorRegistry::can_handle` false + // for contribs so they don't get caught by the in-tree registry; the dispatch + // arm in `planner.rs` for OpStruct::ContribOp handles them explicitly via + // `lookup_contrib_planner_by_kind`. + OpStruct::ContribOp(_) => None, } } diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index 7cefe06da7..da6dad9f74 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -53,9 +53,31 @@ message Operator { Explode explode = 114; CsvScan csv_scan = 115; ShuffleScan shuffle_scan = 116; + // Generic envelope for operators contributed by an extension. The contrib's JVM-side + // serde fills the payload with a contrib-private proto message; core's native planner + // dispatches to an `OperatorBuilder` registered (at lib-init time) by the + // contrib's Rust crate, keyed by `kind`. Lets core stay format-agnostic while contrib + // authors evolve their own wire format on a separate cadence. + ContribOp contrib_op = 117; } } +// Envelope for a contrib-contributed operator. 
Core's native planner dispatches by `kind` +// to an `OperatorBuilder` registered at library-init time (see +// `native/core/src/execution/planner/contrib.rs`); the `payload` is opaque to core -- +// the contrib's planner decodes it into its own proto type. Each contrib ships: +// * a JVM JAR (Scala extension code + ServiceLoader entry), discovered via classpath +// * a Rust crate (rlib) compiled INTO core's cdylib via a Cargo feature flag on core, +// not shipped as a separate cdylib +// Reusing the same envelope for every contrib keeps core's proto stable when contribs +// ship/evolve independently. +message ContribOp { + // Stable identifier the contrib registered under (e.g. "delta-scan", "example-constant"). + string kind = 1; + // Contrib-private payload bytes. Format defined by the contrib's own proto schema. + bytes payload = 2; +} + message SparkPartitionedFile { string file_path = 1; int64 start = 2; From f448693b35e8416ba9c33b30e14ccd6f7f57320c Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Wed, 13 May 2026 22:16:40 -0400 Subject: [PATCH 02/27] feat(contrib): wire OpStruct::ContribOp dispatcher in native planner PR1.3 from docs/contrib-delta-migration-plan.md. The Rust planner now dispatches OpStruct::ContribOp through the registry added in PR1.2 by calling lookup_contrib_planner_by_kind(kind) and delegating to the returned OperatorBuilder. When no planner is registered for the kind, surfaces a clear ExecutionError that names the missing Cargo feature -- this is the typical case when the contrib's JVM JAR is on the classpath but core was built without the matching `contrib-<name>` feature. No behaviour change for any existing operator. Contribs activate once their rlib is linked into core's cdylib via the Cargo feature and their #[ctor] runs at lib-init time. PR1.7 (contrib/example/) will exercise this end-to-end with the first concrete contrib. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- native/core/src/execution/planner.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index cc4c145345..33c262a25b 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1960,6 +1960,25 @@ impl PhysicalPlanner { )), )) } + OpStruct::ContribOp(contrib_op) => { + // Dispatch the ContribOp envelope to a contrib-registered planner keyed + // by `kind`. The contrib's #[ctor] in its rlib (linked into core's cdylib + // via a Cargo feature flag) populates the registry at lib-init time, so + // by the time we reach this arm the registry is already warm. If no + // planner is registered for this kind, surface a clear error -- typically + // means the contrib's JVM JAR is on the classpath but core was built + // without the corresponding `contrib-` Cargo feature. + use crate::execution::planner::contrib::lookup_contrib_planner_by_kind; + let kind = contrib_op.kind.as_str(); + let planner = lookup_contrib_planner_by_kind(kind).ok_or_else(|| { + GeneralError(format!( + "No contrib planner registered for ContribOp.kind={kind:?}; \ + did you build core with the corresponding `contrib-{kind}` \ + Cargo feature (or its workspace equivalent)?" + )) + })?; + planner.build(spark_plan, inputs, partition_count, self) + } _ => Err(GeneralError(format!( "Unsupported or unregistered operator type: {:?}", spark_plan.op_struct From f23500df4e1dcf55a9c5ba909a6509d2b3e4a790 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Wed, 13 May 2026 22:21:09 -0400 Subject: [PATCH 03/27] feat(contrib): add Scala extension SPI under org.apache.comet.spi PR1.4 from docs/contrib-delta-migration-plan.md. Three new files under spark/src/main/scala/org/apache/comet/spi/: CometScanRuleExtension.scala Trait that contrib JARs implement to intercept scan transformation in CometScanRule. 
Exposes both matchesV1/transformV1 (FileSourceScanExec) and matchesV2/transformV2 (BatchScanExec) overrides; both default to "doesn't match", letting contribs claim only the scan flavour they care about. CometOperatorSerdeExtension.scala Trait that contribs implement to contribute additional SparkPlan-class to CometOperatorSerde mappings. CometExecRule (PR1.5) merges these with its built-in allExecs. CometExtensionRegistry.scala ServiceLoader-backed process-wide singleton. Idempotent `load()` that discovers contrib JARs on the classpath via standard META-INF/services entries. Failures to instantiate individual extensions are logged but never fatal -- one broken contrib JAR doesn't take down the Spark session. Test-only resetForTesting() hook. No callers yet; PR1.5 wires CometScanRule and CometExecRule to consult the registry, and PR1.6 wires CometSparkSessionExtensions to call load() during installation. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../comet/spi/CometExtensionRegistry.scala | 107 ++++++++++++++++++ .../spi/CometOperatorSerdeExtension.scala | 53 +++++++++ .../comet/spi/CometScanRuleExtension.scala | 86 ++++++++++++++ 3 files changed, 246 insertions(+) create mode 100644 spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala create mode 100644 spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala create mode 100644 spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala new file mode 100644 index 0000000000..be74571e64 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.spi + +import java.util.ServiceLoader +import java.util.concurrent.atomic.AtomicBoolean + +import scala.jdk.CollectionConverters._ + +import org.apache.spark.internal.Logging + +/** + * Process-wide singleton that discovers and exposes contrib extensions found on the + * classpath via `java.util.ServiceLoader`. + * + * Discovery happens once per JVM, idempotent: the first `load()` call enumerates every + * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` and + * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource on the + * Comet classloader. Subsequent calls are no-ops. + * + * `CometSparkSessionExtensions.apply` calls `load()` during Comet extension installation + * (PR1.6) so contrib JARs are picked up automatically when present. + * + * Failures to instantiate individual extensions are logged but do NOT fail Comet + * startup -- a misconfigured contrib JAR shouldn't take down the whole Spark session. 
+ */ +object CometExtensionRegistry extends Logging { + + private val loaded = new AtomicBoolean(false) + @volatile private var scanExts: Seq[CometScanRuleExtension] = Seq.empty + @volatile private var serdeExts: Seq[CometOperatorSerdeExtension] = Seq.empty + + /** + * Discover contrib extensions on the classpath. Idempotent. Safe to call from multiple + * threads (only the first call performs discovery). + */ + def load(): Unit = { + if (loaded.compareAndSet(false, true)) { + scanExts = loadOne[CometScanRuleExtension]("CometScanRuleExtension") + serdeExts = loadOne[CometOperatorSerdeExtension]("CometOperatorSerdeExtension") + if (scanExts.nonEmpty || serdeExts.nonEmpty) { + logInfo( + s"Comet contrib extensions loaded: " + + s"scan=[${scanExts.map(_.name).mkString(", ")}], " + + s"serde=[${serdeExts.map(_.name).mkString(", ")}]") + } + } + } + + /** Registered scan-rule extensions, in classpath discovery order. */ + def scanExtensions: Seq[CometScanRuleExtension] = scanExts + + /** Registered operator-serde extensions, in classpath discovery order. */ + def serdeExtensions: Seq[CometOperatorSerdeExtension] = serdeExts + + /** + * Test-only: reset the registry to the empty state. Lets unit tests re-run discovery + * with a different classpath / overridden services. Not for production use. + */ + private[comet] def resetForTesting(): Unit = { + loaded.set(false) + scanExts = Seq.empty + serdeExts = Seq.empty + } + + private def loadOne[T](label: String)(implicit ct: scala.reflect.ClassTag[T]): Seq[T] = { + val cls = ct.runtimeClass.asInstanceOf[Class[T]] + val loader = Option(Thread.currentThread().getContextClassLoader) + .getOrElse(getClass.getClassLoader) + try { + val it = ServiceLoader.load(cls, loader).iterator().asScala + val out = scala.collection.mutable.ArrayBuffer.empty[T] + while (it.hasNext) { + // Pull each extension under its own try so one broken contrib doesn't sink the + // rest of the registry. 
ServiceLoader.next() can throw if the extension fails to + // instantiate (missing class, ctor exception, etc.). + try out += it.next() + catch { + case scala.util.control.NonFatal(e) => + logWarning(s"Failed to load a $label entry; skipping: ${e.getMessage}", e) + } + } + out.toSeq + } catch { + case scala.util.control.NonFatal(e) => + logWarning(s"$label discovery failed; no contrib extensions of this kind loaded", e) + Seq.empty + } + } +} diff --git a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala new file mode 100644 index 0000000000..dc56ccbdce --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.spi + +import org.apache.spark.sql.execution.SparkPlan + +import org.apache.comet.serde.CometOperatorSerde + +/** + * SPI hook that lets a contrib extension contribute additional operator-to-native serdes to + * `CometExecRule`. Used when a contrib needs to translate a contrib-specific physical + * operator (e.g. 
`CometDeltaNativeScanExec` for Delta) into a native plan -- the contrib + * provides the serde, and `CometExecRule` calls it during plan transformation. + * + * `CometExecRule` discovers implementations via `CometExtensionRegistry.serdeExtensions` + * (ServiceLoader-backed). Each contrib JAR ships a + * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource listing + * its extension class. + * + * Implementations MUST be stateless / safe to share across query executions. + */ +trait CometOperatorSerdeExtension { + + /** Human-readable name shown in logs and error messages. */ + def name: String + + /** + * Mapping of SparkPlan class -> serde. The contrib lists every operator class it knows + * how to translate to native. `CometExecRule` merges these mappings with its built-in + * `allExecs` to dispatch by class identity at conversion time. + * + * Convention: each contrib's mapping should reference only classes the contrib itself + * defines, so two contribs never claim ownership of the same operator class. + */ + def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] +} diff --git a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala new file mode 100644 index 0000000000..9789378878 --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.spi + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan} +import org.apache.spark.sql.execution.datasources.HadoopFsRelation +import org.apache.spark.sql.execution.datasources.v2.BatchScanExec + +/** + * SPI hook that lets a contrib extension intercept scan transformation in + * `CometScanRule`. Contribs typically use this to recognise a specific table format (Delta, + * Hudi, etc.) and route it through a contrib-specific native execution path. + * + * `CometScanRule` discovers implementations via `CometExtensionRegistry.scanExtensions` + * (ServiceLoader-backed) and offers each candidate scan to the registered extensions in + * registration order. The first extension whose [[matchesV1]] / [[matchesV2]] returns `true` wins -- its + * [[transformV1]] / [[transformV2]] is called and the returned plan replaces the scan + * branch. If no extension matches, or the winning extension returns `None`, the core's existing file-format dispatch handles the + * scan as before. + * + * Contribs are discovered via the standard Java ServiceLoader. Each contrib JAR ships a + * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` resource listing its + * extension class. + * + * Implementations MUST be safe to invoke from `CometScanRule`'s `apply` method -- + * specifically: pure, stateless, side-effect-free with respect to the plan tree (any state + * needed should be derived from `scanExec` / `relation` / the surrounding plan). 
The + * registry caches instances across plans, so per-plan state on the implementation will + * leak between queries. + */ +trait CometScanRuleExtension { + + /** Human-readable name shown in logs and error messages. Should be unique per extension. */ + def name: String + + /** + * Whether this extension wants to handle the given V1 scan. Implementations should make a + * cheap decision here (typically file-format class-name probe) so non-matching paths add + * no per-scan overhead. + * + * Default returns false; override `matchesV1` and `transformV1` for V1 scan support. + */ + def matchesV1(relation: HadoopFsRelation): Boolean = false + + /** + * Transform the matched V1 scan. Called only when `matchesV1` returned true. + * + * Returning `None` means "I matched but ultimately can't accelerate this one" -- the + * core falls back to its existing file-format dispatch. Returning `Some(plan)` replaces + * the scan subtree. + */ + def transformV1( + plan: SparkPlan, + scanExec: FileSourceScanExec, + session: SparkSession): Option[SparkPlan] = None + + /** + * Whether this extension wants to handle the given V2 batch scan. See `matchesV1`. + * + * Default returns false; override `matchesV2` and `transformV2` for V2 scan support. + */ + def matchesV2(scanExec: BatchScanExec): Boolean = false + + /** + * Transform the matched V2 scan. Called only when `matchesV2` returned true. + */ + def transformV2(scanExec: BatchScanExec, session: SparkSession): Option[SparkPlan] = None +} From 42234b9648c377a49d0f22225a5e30f0f73fa521 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Wed, 13 May 2026 22:27:17 -0400 Subject: [PATCH 04/27] feat(contrib): wire CometScanRule + CometExecRule to SPI registry PR1.5 from docs/contrib-delta-migration-plan.md. Three integration hooks added; all are no-ops until contribs are present on the classpath and PR1.6 calls CometExtensionRegistry.load() during extension install. 
CometScanRule.transformV1Scan After the Spark 3.4 AQE-DPP gate, iterate CometExtensionRegistry.scanExtensions. First extension whose `matchesV1` returns true gets `transformV1` called. Some result replaces the scan; None falls through to core's existing file-format dispatch. CometScanRule.transformV2Scan Same shape for BatchScanExec via matchesV2/transformV2. CometExecRule.transform.convertNode (operator dispatch) When a non-shuffle, non-broadcast operator has all-native children, the lookup now consults `(allExecs ++ contribSerdes)` where `contribSerdes` is the union of every registered CometOperatorSerdeExtension's `serdes` map. Contrib operator classes (e.g. a future Delta-contrib's CometDeltaNativeScanExec) get matched here without core having to know about them. Iteration order is registration order (i.e. ServiceLoader discovery order, which is classpath-stable per JVM run). Contribs that need priority should be the first META-INF/services entry on the classpath; in practice contribs claim disjoint scan types so the ordering rarely matters. No regression risk: with no extensions loaded (the state on this branch and on main today), every hook short-circuits in O(1) and falls through to the existing code path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/comet/rules/CometExecRule.scala | 11 ++++++- .../apache/comet/rules/CometScanRule.scala | 33 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index 72c2bea9e4..a1d324065f 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -54,6 +54,7 @@ import org.apache.comet.rules.CometExecRule.allExecs import org.apache.comet.serde._ import org.apache.comet.serde.operator._ import org.apache.comet.shims.{ShimCometStreaming, ShimSubqueryBroadcast} +import org.apache.comet.spi.CometExtensionRegistry object CometExecRule { @@ -349,7 +350,15 @@ case class CometExecRule(session: SparkSession) // if all children are native (or if this is a leaf node) then see if there is a // registered handler for creating a fully native plan if (op.children.forall(_.isInstanceOf[CometNativeExec])) { - val handler = allExecs + // Contrib SPI: each registered CometOperatorSerdeExtension contributes a + // SparkPlan-class -> CometOperatorSerde map. We merge those over `allExecs` + // here so contrib operators (e.g. a future CometDeltaNativeScanExec from a + // delta contrib) get dispatched the same way built-in operators do. Contribs + // own classes that aren't in `allExecs`, so this merge never overrides a core + // mapping in practice. 
+ val contribSerdes = + CometExtensionRegistry.serdeExtensions.flatMap(_.serdes).toMap + val handler = (allExecs ++ contribSerdes) .get(op.getClass) .map(_.asInstanceOf[CometOperatorSerde[SparkPlan]]) handler match { diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 64b69be1e9..b5c70b7451 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -50,6 +50,7 @@ import org.apache.comet.parquet.CometParquetUtils.{encryptionEnabled, isEncrypti import org.apache.comet.parquet.Native import org.apache.comet.serde.operator.{CometIcebergNativeScan, CometNativeScan} import org.apache.comet.shims.{CometTypeShim, ShimCometStreaming, ShimFileFormat, ShimSubqueryBroadcast} +import org.apache.comet.spi.CometExtensionRegistry /** * Spark physical optimizer rule for replacing Spark scans with Comet scans. @@ -161,6 +162,26 @@ case class CometScanRule(session: SparkSession) return withInfo(scanExec, "AQE Dynamic Partition Pruning requires Spark 3.5+") } + // Contrib SPI dispatch: offer the scan to every registered CometScanRuleExtension + // before core's built-in file-format logic. The first extension whose `matchesV1` + // returns true gets `transformV1` called -- if that returns Some, the result replaces + // the scan branch entirely. Returning None means "I matched but ultimately can't + // accelerate this one", and core's existing logic handles it. Iterating in + // registration order makes contrib selection deterministic. 
+ scanExec.relation match { + case r: HadoopFsRelation => + val matched = CometExtensionRegistry.scanExtensions.find(_.matchesV1(r)) + matched match { + case Some(ext) => + ext.transformV1(plan, scanExec, session) match { + case Some(replacement) => return replacement + case None => // extension matched but declined; fall through + } + case None => // no extension matched; fall through + } + case _ => // SPI only operates on HadoopFsRelation V1 scans + } + scanExec.relation match { case r: HadoopFsRelation => if (!CometScanExec.isFileFormatSupported(r.fileFormat)) { @@ -259,6 +280,18 @@ case class CometScanRule(session: SparkSession) private def transformV2Scan(scanExec: BatchScanExec): SparkPlan = { + // Contrib SPI dispatch (V2): same shape as transformV1Scan above. First matching + // extension wins; None return falls through to core's logic. + val matched = CometExtensionRegistry.scanExtensions.find(_.matchesV2(scanExec)) + matched match { + case Some(ext) => + ext.transformV2(scanExec, session) match { + case Some(replacement) => return replacement + case None => // extension matched but declined; fall through + } + case None => // no extension matched; fall through + } + scanExec.scan match { case scan: CSVScan if COMET_CSV_V2_NATIVE_ENABLED.get() => val fallbackReasons = new ListBuffer[String]() From 8b69471520372c8fd8f71f9dc223f3bb9213df24 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Wed, 13 May 2026 22:28:57 -0400 Subject: [PATCH 05/27] feat(contrib): call CometExtensionRegistry.load() at extension install PR1.6 from docs/contrib-delta-migration-plan.md. Adds a single call to CometExtensionRegistry.load() at the top of CometSparkSessionExtensions.apply, before any of Comet's rules are registered. Discovery happens once per JVM (idempotent), so subsequent SparkSession installs are no-ops. 
With no contrib JARs on the classpath the call discovers nothing and returns; with contribs present, their META-INF/services entries are enumerated and the registered extensions become visible to CometScanRule (PR1.5) and CometExecRule (PR1.5). Closes the JVM half of the contrib SPI: every PR1 piece for the JVM side is now in place. Remaining PR1 deliverables are the contrib/example/ minimum example (PR1.7) and the contributor guide (PR1.8). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scala/org/apache/comet/CometSparkSessionExtensions.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala index 679005d9b1..469fc0b409 100644 --- a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala +++ b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala @@ -87,6 +87,11 @@ class CometSparkSessionExtensions with Logging with ShimCometSparkSessionExtensions { override def apply(extensions: SparkSessionExtensions): Unit = { + // Discover contrib extensions on the classpath BEFORE registering our rules so that + // CometScanRule / CometExecRule see the contribs the first time they run. Idempotent + // and safe to call multiple times across SparkSession instances within the same JVM. + org.apache.comet.spi.CometExtensionRegistry.load() + extensions.injectColumnar { session => CometScanColumnar(session) } extensions.injectColumnar { session => CometExecColumnar(session) } // Pre-3.5 only: tag AQE DPP regions so the conversion rules below leave them Spark-native. From d1553b558395dfef6874a839e7be70132034cfaa Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Wed, 13 May 2026 23:39:44 -0400 Subject: [PATCH 06/27] feat(contrib): SPI crate split + worked example contrib (PR1.7 part 1) Substantive piece of PR1.7. 
Two structural changes break what was otherwise going to be a cyclic dependency between core and contribs: 1. New crate native/contrib-spi/ (`comet-contrib-spi`): - Defines the `ContribOperatorPlanner` trait that contribs implement. - Owns the process-wide registry (`register_contrib_planner`, `lookup_contrib_planner_by_kind`, `registered_contrib_kinds`). - Light-weight `ContribError` enum for SPI errors. Core converts to its own `ExecutionError` at the dispatch site. - 1 unit test covering registration round-trip. - Only deps: `datafusion` + `log`. No deps on core, no deps from core back. The SPI is the leaf both core and contribs depend on. 2. New crate contrib/example/native/ (`comet-contrib-example`): - rlib (NOT cdylib) -- linked INTO core's libcomet via the `contrib-example` Cargo feature flag on core. - Registers a `NoOpPlanner` against kind `"example-no-op"` via `#[ctor::ctor]`. The planner returns a sentinel error so tests can verify the full JVM->JNI->native->contrib dispatch chain. - Depends on `comet-contrib-spi` (NOT on core). - Real contribs follow the same shape: rlib + #[ctor] + thin dependency on the SPI crate. Core rewiring: - native/core/Cargo.toml: `comet-contrib-spi` mandatory dep; `comet-contrib-example` optional dep gated by feature `contrib-example` (default-on so released builds ship the example registered). - native/core/src/lib.rs: `extern crate comet_contrib_example` gated by the feature so #[ctor] runs at libcomet load. - native/core/src/execution/planner/contrib.rs: now just re-exports the SPI surface for backwards-compatible imports within core. - native/core/src/execution/planner.rs: ContribOp dispatcher now recursively builds native children, calls the SPI's `plan(payload, children) -> Arc<dyn ExecutionPlan>`, wraps in `SparkPlan`. Maps `ContribError` to `ExecutionError::GeneralError` with a clear contrib-identified prefix.
Workspace wiring: - native/Cargo.toml: adds `contrib-spi` to default-members so the SPI crate is built/checked with the rest of the workspace. - Adds `../contrib/example/native` to workspace members (NOT default-members) so it shares the workspace lockfile and dependency overrides but isn't compiled standalone. Build state: `cargo check` on all three crates (core, contrib-spi, contrib-example) is clean. SPI unit test passes. The Maven side of the example contrib (pom.xml, Scala extension, ServiceLoader entry, integration test) is NOT in this commit -- it lands in a follow-up on the same branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/native/Cargo.toml | 46 +++++ contrib/example/native/src/lib.rs | 72 ++++++++ native/Cargo.lock | 59 ++++++- native/Cargo.toml | 10 +- native/contrib-spi/Cargo.toml | 30 ++++ native/contrib-spi/src/lib.rs | 169 +++++++++++++++++++ native/core/Cargo.toml | 17 +- native/core/src/execution/planner.rs | 33 +++- native/core/src/execution/planner/contrib.rs | 139 ++------------- native/core/src/lib.rs | 7 + 10 files changed, 446 insertions(+), 136 deletions(-) create mode 100644 contrib/example/native/Cargo.toml create mode 100644 contrib/example/native/src/lib.rs create mode 100644 native/contrib-spi/Cargo.toml create mode 100644 native/contrib-spi/src/lib.rs diff --git a/contrib/example/native/Cargo.toml b/contrib/example/native/Cargo.toml new file mode 100644 index 0000000000..b86728deb2 --- /dev/null +++ b/contrib/example/native/Cargo.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "comet-contrib-example" +description = "Worked reference implementation of a Comet contrib extension. Registers a no-op ContribOperatorPlanner under kind=\"example-no-op\" so the SPI dispatch path can be exercised end-to-end in tests." +# Contrib crates live OUTSIDE the workspace root directory (`native/`) but are listed as +# workspace members in `native/Cargo.toml`. Cargo's auto-discovery walks up the directory +# tree, so without the explicit pointer it can't find `native/Cargo.toml` from +# `contrib/example/native/`. +workspace = "../../../native" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +edition = { workspace = true } + +[lib] +# rlib (not cdylib): linked INTO core's cdylib via the `contrib-example` Cargo feature +# flag on the core crate. There is exactly one libcomet.{so,dylib,dll} at runtime; the +# contrib's #[ctor] runs during that single library's init. +crate-type = ["rlib"] + +[dependencies] +# Depend on the thin SPI crate, NOT on core. This is what breaks the cycle: core +# depends on contribs (Cargo feature → rlib link); both depend on contrib-spi; nothing +# depends back on core from a contrib. 
+comet-contrib-spi = { path = "../../../native/contrib-spi" } +datafusion = { workspace = true } +ctor = "0.4" +log = "0.4" diff --git a/contrib/example/native/src/lib.rs b/contrib/example/native/src/lib.rs new file mode 100644 index 0000000000..8857deff59 --- /dev/null +++ b/contrib/example/native/src/lib.rs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Worked reference implementation of a Comet contrib extension. +//! +//! Registers a single `ContribOperatorPlanner` under `kind = "example-no-op"`. The +//! planner is intentionally trivial: it returns a clear `ContribError::Plan` so tests can +//! verify the full dispatch chain (JVM serde → ContribOp envelope → JNI → native planner +//! → contrib registry → this planner) without needing to actually execute anything. +//! +//! Real contribs (Delta, Hudi, etc.) replace `NoOpPlanner::plan` with a function that +//! decodes the contrib's own proto message from `payload` and constructs an +//! `ExecutionPlan` for the contrib's native operator. +//! +//! The whole crate is gated by `native/core/Cargo.toml`'s `contrib-example` feature flag. +//! Build core without that feature (`cargo build --no-default-features`) and zero bytes +//! 
of this crate end up in `libcomet`. + +use std::sync::Arc; + +use comet_contrib_spi::{ + register_contrib_planner, ContribError, ContribOperatorPlanner, +}; +use datafusion::physical_plan::ExecutionPlan; + +/// Stable identifier the example registers under. The Scala side writes this same string +/// into `ContribOp.kind` when building a payload for the example operator. Convention: +/// `<contrib>-<operator>`. +pub const EXAMPLE_NO_OP_KIND: &str = "example-no-op"; + +/// A planner that intentionally does no plan-building work. It exists only to prove the +/// dispatch chain is wired up correctly: tests construct an Operator with this kind, ship +/// it through JNI, and assert that the returned error mentions this string. +struct NoOpPlanner; + +impl ContribOperatorPlanner for NoOpPlanner { + fn plan( + &self, + _payload: &[u8], + _children: Vec>, + ) -> Result, ContribError> { + Err(ContribError::Plan(format!( + "comet-contrib-example: NoOpPlanner reached for kind={EXAMPLE_NO_OP_KIND:?}; \ + this is the expected sentinel for SPI dispatch tests" + ))) + } +} + +/// Registers `NoOpPlanner` against `EXAMPLE_NO_OP_KIND` at library-init time. Called by +/// the linker before `main`/`JNI_OnLoad` because of `#[ctor::ctor]`. Comet's main +/// `libcomet` is what gets loaded by the JVM; this constructor runs during its init.
+#[ctor::ctor] +fn register() { + log::info!( + "comet-contrib-example: registering ContribOperatorPlanner kind={EXAMPLE_NO_OP_KIND:?}" + ); + register_contrib_planner(EXAMPLE_NO_OP_KIND, Arc::new(NoOpPlanner)); +} diff --git a/native/Cargo.lock b/native/Cargo.lock index df3c3b03c0..f13c22f1a9 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1485,6 +1485,24 @@ dependencies = [ "memchr", ] +[[package]] +name = "comet-contrib-example" +version = "0.17.0" +dependencies = [ + "comet-contrib-spi", + "ctor 0.4.3", + "datafusion", + "log", +] + +[[package]] +name = "comet-contrib-spi" +version = "0.17.0" +dependencies = [ + "datafusion", + "log", +] + [[package]] name = "comfy-table" version = "7.2.2" @@ -1740,16 +1758,32 @@ dependencies = [ "memchr", ] +[[package]] +name = "ctor" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec09e802f5081de6157da9a75701d6c713d8dc3ba52571fd4bd25f412644e8a6" +dependencies = [ + "ctor-proc-macro 0.0.6", + "dtor 0.0.6", +] + [[package]] name = "ctor" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e" dependencies = [ - "ctor-proc-macro", - "dtor", + "ctor-proc-macro 0.0.7", + "dtor 0.1.1", ] +[[package]] +name = "ctor-proc-macro" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2931af7e13dc045d8e9d26afccc6fa115d64e115c9c84b1166288b46f6782c2" + [[package]] name = "ctor-proc-macro" version = "0.0.7" @@ -1966,6 +2000,8 @@ dependencies = [ "aws-config", "aws-credential-types", "bytes", + "comet-contrib-example", + "comet-contrib-spi", "criterion", "datafusion", "datafusion-comet-common", @@ -2852,15 +2888,30 @@ dependencies = [ "const-random", ] +[[package]] +name = "dtor" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"97cbdf2ad6846025e8e25df05171abfb30e3ababa12ee0a0e44b9bbe570633a8" +dependencies = [ + "dtor-proc-macro 0.0.5", +] + [[package]] name = "dtor" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301" dependencies = [ - "dtor-proc-macro", + "dtor-proc-macro 0.0.6", ] +[[package]] +name = "dtor-proc-macro" +version = "0.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7454e41ff9012c00d53cf7f475c5e3afa3b91b7c90568495495e8d9bf47a1055" + [[package]] name = "dtor-proc-macro" version = "0.0.6" @@ -4633,7 +4684,7 @@ version = "0.56.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b31d3d8e99a85d83b73ec26647f5607b80578ed9375810b6e44ffa3590a236" dependencies = [ - "ctor", + "ctor 0.6.3", "opendal-core", "opendal-layer-concurrent-limit", "opendal-layer-logging", diff --git a/native/Cargo.toml b/native/Cargo.toml index d1b5c74af9..9c5f5816bd 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -16,8 +16,14 @@ # under the License. [workspace] -default-members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle"] -members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "hdfs", "fs-hdfs"] +default-members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "contrib-spi"] +# `contrib-spi` is the thin SPI surface that BOTH core and contribs depend on -- breaking +# what would otherwise be a cyclic dep between core (links contribs via Cargo features) +# and contribs (need core types). Contrib crates themselves live under +# `../contrib//native` and are workspace members so workspace lockfile + workspace +# dependencies apply; they're NOT default-members because they're consumed via core's +# optional Cargo feature flags rather than built standalone. 
+members = ["core", "spark-expr", "common", "proto", "jni-bridge", "shuffle", "hdfs", "fs-hdfs", "contrib-spi", "../contrib/example/native"] resolver = "2" [workspace.package] diff --git a/native/contrib-spi/Cargo.toml b/native/contrib-spi/Cargo.toml new file mode 100644 index 0000000000..eea4855cd8 --- /dev/null +++ b/native/contrib-spi/Cargo.toml @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "comet-contrib-spi" +description = "Stable SPI surface that contrib crates and Comet's core both depend on. Defines the ContribOperatorPlanner trait, the process-wide registry, and the lightweight error type. Separating this from the core crate breaks what would otherwise be a cyclic dependency (core links contribs via Cargo feature flags; contribs need core types)." 
+version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +edition = { workspace = true } + +[dependencies] +datafusion = { workspace = true } +log = "0.4" diff --git a/native/contrib-spi/src/lib.rs b/native/contrib-spi/src/lib.rs new file mode 100644 index 0000000000..89b6471054 --- /dev/null +++ b/native/contrib-spi/src/lib.rs @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Thin SPI crate shared between Comet's core and every contrib crate. +//! +//! Both core (`datafusion-comet`) and individual contribs (`comet-contrib-example`, +//! eventually `comet-contrib-delta`) depend on THIS crate, NOT on each other. This avoids +//! a cyclic dependency: core wires contribs in via Cargo feature flags, and contribs need +//! the SPI types to implement the trait. With the SPI in a third crate, the dependency +//! graph is a DAG. +//! +//! Surface: +//! * [`ContribOperatorPlanner`] — the trait contribs implement. +//! * [`register_contrib_planner`] / [`lookup_contrib_planner_by_kind`] — +//! process-wide registry, expected to be populated from a contrib's `#[ctor]`. +//! 
* [`registered_contrib_kinds`] — diagnostics. + +use std::{ + collections::HashMap, + sync::{Arc, OnceLock, RwLock}, +}; + +use datafusion::physical_plan::ExecutionPlan; + +/// Implemented by each contrib. Called from core's planner when an `OpStruct::ContribOp` +/// with the contrib's `kind` is encountered. +/// +/// The contract is intentionally minimal: +/// * `payload` is the raw bytes from `ContribOp.payload`. The contrib decodes it into +/// whatever proto / serde format it uses internally; core never inspects. +/// * `children` is the list of already-built native children (in spark-plan child +/// order). The contrib uses these to build its `ExecutionPlan` if it needs child +/// inputs. +/// * The returned `Arc<dyn ExecutionPlan>` is the contrib's operator. Core wraps it +/// into a `SparkPlan` and threads it through the rest of the plan tree. +/// +/// Implementations MUST be `Send + Sync` and idempotent — the same `(payload, children)` +/// must always produce a functionally equivalent plan, so core can cache or re-plan. +pub trait ContribOperatorPlanner: Send + Sync { + fn plan( + &self, + payload: &[u8], + children: Vec>, + ) -> Result, ContribError>; +} + +/// Error type returned by [`ContribOperatorPlanner::plan`]. Kept distinct from core's +/// `ExecutionError` so this crate stays free of core's dependency tree. Core converts +/// `ContribError` into its own `ExecutionError` at the dispatch site. +#[derive(Debug)] +pub enum ContribError { + /// Generic failure. Use this for cases that don't fit the more specific variants. + Plan(String), + /// The contrib received a payload it couldn't decode (wrong proto schema, missing + /// required field, etc.). + BadPayload(String), + /// The contrib received a child count it can't handle (e.g. a binary operator wired + /// to one child).
+ WrongChildCount { + expected: &'static str, + actual: usize, + }, +} + +impl std::fmt::Display for ContribError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ContribError::Plan(msg) => write!(f, "{msg}"), + ContribError::BadPayload(msg) => write!(f, "bad payload: {msg}"), + ContribError::WrongChildCount { expected, actual } => { + write!(f, "wrong child count: expected {expected}, got {actual}") + } + } + } +} + +impl std::error::Error for ContribError {} + +/// Process-wide registry of contrib operator planners, keyed by `ContribOp.kind`. +fn registry() -> &'static RwLock>> { + static REGISTRY: OnceLock>>> = + OnceLock::new(); + REGISTRY.get_or_init(|| RwLock::new(HashMap::new())) +} + +/// Register a contrib operator planner under the given `kind` identifier. Last-write-wins +/// on duplicates (logged as a warning). Thread-safe; intended to be called from a +/// contrib's `#[ctor]` constructor at library-init time. +pub fn register_contrib_planner( + kind: impl Into, + planner: Arc, +) { + let kind = kind.into(); + let mut guard = registry() + .write() + .expect("contrib planner registry poisoned"); + if guard.contains_key(&kind) { + log::warn!( + "register_contrib_planner: replacing existing planner for kind={kind:?}; \ + second registration usually indicates a misconfigured test harness" + ); + } + guard.insert(kind, planner); +} + +/// Look up the contrib planner registered for `kind`, or `None` if no contrib is loaded +/// for that operator. Core's dispatcher uses this to route `OpStruct::ContribOp` payloads. +pub fn lookup_contrib_planner_by_kind(kind: &str) -> Option> { + let guard = registry() + .read() + .expect("contrib planner registry poisoned"); + guard.get(kind).cloned() +} + +/// Return a snapshot of all registered contrib kinds, for diagnostics and tests. 
+pub fn registered_contrib_kinds() -> Vec { + let guard = registry() + .read() + .expect("contrib planner registry poisoned"); + let mut kinds: Vec = guard.keys().cloned().collect(); + kinds.sort(); + kinds +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::physical_plan::empty::EmptyExec; + use std::sync::Arc; + + struct AlwaysEmpty; + impl ContribOperatorPlanner for AlwaysEmpty { + fn plan( + &self, + _payload: &[u8], + _children: Vec>, + ) -> Result, ContribError> { + Ok(Arc::new(EmptyExec::new(Arc::new( + datafusion::arrow::datatypes::Schema::empty(), + )))) + } + } + + #[test] + fn register_and_lookup() { + register_contrib_planner("test-spi-kind-a", Arc::new(AlwaysEmpty)); + register_contrib_planner("test-spi-kind-b", Arc::new(AlwaysEmpty)); + assert!(lookup_contrib_planner_by_kind("test-spi-kind-a").is_some()); + assert!(lookup_contrib_planner_by_kind("test-spi-kind-b").is_some()); + assert!(lookup_contrib_planner_by_kind("test-spi-kind-c").is_none()); + let kinds = registered_contrib_kinds(); + assert!(kinds.contains(&"test-spi-kind-a".to_string())); + assert!(kinds.contains(&"test-spi-kind-b".to_string())); + } +} diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 4fb3ed4c5d..b1bb2d30b7 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -73,6 +73,11 @@ reqwest = { version = "0.12", default-features = false, features = ["rustls-tls- object_store_opendal = { version = "0.56.0", optional = true } hdfs-sys = {version = "0.3", optional = true, features = ["hdfs_3_3"]} opendal = { version = "0.56.0", optional = true, features = ["services-hdfs"] } +# Contrib rlibs. Each is gated by a matching `contrib-<name>` Cargo feature defined +# below in [features]. When the feature is on, the contrib's rlib is linked into core's +# cdylib and its #[ctor] runs at library load.
+comet-contrib-spi = { path = "../contrib-spi" } +comet-contrib-example = { path = "../../contrib/example/native", optional = true } iceberg = { workspace = true } iceberg-storage-opendal = { workspace = true } serde_json = "1.0" @@ -95,11 +100,21 @@ datafusion-functions-nested = { version = "53.1.0" } [features] backtrace = ["datafusion/backtrace"] -default = ["hdfs-opendal"] +# `contrib-example` is on by default so released builds ship the example contrib's +# planner registered, and the worked-reference test in contrib/example exercises it. +# `cargo build --no-default-features` produces a cdylib with zero contrib code. +default = ["hdfs-opendal", "contrib-example"] hdfs = ["datafusion-comet-objectstore-hdfs"] hdfs-opendal = ["opendal", "object_store_opendal", "hdfs-sys"] jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"] +# Contrib feature flags. Each flag pulls a contrib rlib into core's cdylib so contrib +# Rust code is linked into the single libcomet at build time; the contrib's #[ctor] +# registers its operator planners during library init. See +# docs/contrib-delta-migration-plan.md for the architectural rationale (single cdylib +# instead of separate cdylib per contrib). +contrib-example = ["dep:comet-contrib-example"] + # exclude optional packages from cargo machete verifications [package.metadata.cargo-machete] ignored = ["hdfs-sys", "paste"] diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 33c262a25b..445749ae4e 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1964,10 +1964,9 @@ impl PhysicalPlanner { // Dispatch the ContribOp envelope to a contrib-registered planner keyed // by `kind`. The contrib's #[ctor] in its rlib (linked into core's cdylib // via a Cargo feature flag) populates the registry at lib-init time, so - // by the time we reach this arm the registry is already warm. 
If no - // planner is registered for this kind, surface a clear error -- typically - // means the contrib's JVM JAR is on the classpath but core was built - // without the corresponding `contrib-` Cargo feature. + // by the time we reach this arm the registry is already warm. Missing + // registrations typically mean the JVM JAR is on the classpath but core + // was built without the corresponding `contrib-` Cargo feature. use crate::execution::planner::contrib::lookup_contrib_planner_by_kind; let kind = contrib_op.kind.as_str(); let planner = lookup_contrib_planner_by_kind(kind).ok_or_else(|| { @@ -1977,7 +1976,31 @@ impl PhysicalPlanner { Cargo feature (or its workspace equivalent)?" )) })?; - planner.build(spark_plan, inputs, partition_count, self) + + // Recursively build native children. The contrib gets them as + // `Arc` rather than the richer `SparkPlan` because the + // SPI is intentionally minimal — contribs only need the DataFusion-level + // plan surface. + let mut child_scans: Vec = Vec::new(); + let mut child_shuffle_scans: Vec = Vec::new(); + let mut native_children: Vec> = Vec::new(); + for child in &spark_plan.children { + let (mut s, mut ss, child_plan) = + self.create_plan(child, inputs, partition_count)?; + child_scans.append(&mut s); + child_shuffle_scans.append(&mut ss); + native_children.push(child_plan.native_plan.clone()); + } + + let exec = planner + .plan(&contrib_op.payload, native_children) + .map_err(|e| GeneralError(format!("contrib planner {kind:?}: {e}")))?; + + Ok(( + child_scans, + child_shuffle_scans, + Arc::new(SparkPlan::new(spark_plan.plan_id, exec, vec![])), + )) } _ => Err(GeneralError(format!( "Unsupported or unregistered operator type: {:?}", diff --git a/native/core/src/execution/planner/contrib.rs b/native/core/src/execution/planner/contrib.rs index f185446edb..b78d8b1d7a 100644 --- a/native/core/src/execution/planner/contrib.rs +++ b/native/core/src/execution/planner/contrib.rs @@ -15,129 +15,20 @@ // specific language 
governing permissions and limitations // under the License. -//! Registry for contrib operator planners. +//! Convenience re-exports of the contrib SPI surface. //! -//! Contribs are extension crates that ship Spark plan operators living outside core (Delta, -//! example, future Hudi/DeltaSharing, etc.). They link into core's cdylib as Cargo `rlib` -//! dependencies enabled via core's Cargo feature flags (e.g. `contrib-delta`, -//! `contrib-example`). At library-init time (typically via `#[ctor]` in the contrib crate), -//! each contrib calls [`register_contrib_planner`] with a stable `kind` string and an -//! [`OperatorBuilder`] implementation. Core's `OpStruct::ContribOp` dispatcher arm then -//! looks up the planner by `kind` and delegates plan construction to it. -//! -//! See `docs/contrib-delta-migration-plan.md` for the broader architecture. - -use std::{ - collections::HashMap, - sync::{Arc, OnceLock, RwLock}, +//! The actual trait + registry live in the standalone `comet-contrib-spi` crate so both +//! core and contribs can depend on them without forming a dependency cycle (core links +//! contribs via Cargo feature flags, contribs need the SPI types). This module just +//! re-exports the surface so existing `crate::execution::planner::contrib::...` +//! imports inside core continue to resolve. + +// Re-export the parts of the SPI core itself uses (the dispatcher only needs +// `lookup_contrib_planner_by_kind`). The other helpers — `register_contrib_planner`, +// `registered_contrib_kinds`, `ContribError`, `ContribOperatorPlanner` — are exposed +// directly from the `comet_contrib_spi` crate so contribs import them from there. 
+pub use comet_contrib_spi::lookup_contrib_planner_by_kind; +#[allow(unused_imports)] // surfaced for tests + diagnostics; consumed in PR1.7 onwards +pub use comet_contrib_spi::{ + register_contrib_planner, registered_contrib_kinds, ContribError, ContribOperatorPlanner, }; - -use super::operator_registry::OperatorBuilder; - -/// Process-wide registry of contrib operator planners, keyed by `ContribOp.kind`. -/// -/// Implemented as an `OnceLock>` so: -/// * The OnceLock makes lazy first-touch initialisation thread-safe. -/// * The inner RwLock allows multiple contribs to register concurrently at lib-init time -/// (e.g. independent `#[ctor]` invocations) without blocking subsequent reads. -/// -/// Registration is cheap and happens once per contrib per process; lookups are read-mostly. -fn registry() -> &'static RwLock>> { - static REGISTRY: OnceLock>>> = OnceLock::new(); - REGISTRY.get_or_init(|| RwLock::new(HashMap::new())) -} - -/// Register a contrib operator planner under the given `kind` identifier. -/// -/// `kind` must match the value the contrib's JVM-side serde writes into the -/// `ContribOp.kind` proto field. Convention: lowercase-hyphenated, prefixed by the -/// contrib's short name (e.g. `delta-scan`, `example-constant-scan`). -/// -/// If a planner is already registered for `kind`, this REPLACES it and logs a warning. -/// Last-write-wins lets test harnesses re-register without restarting the JVM, and -/// production contribs only ever register once per process. -/// -/// Thread-safe; intended to be called from a contrib's `#[ctor]` at library init. 
-pub fn register_contrib_planner(kind: impl Into, planner: Arc) { - let kind = kind.into(); - let mut guard = registry() - .write() - .expect("contrib planner registry poisoned"); - if guard.contains_key(&kind) { - log::warn!( - "register_contrib_planner: replacing existing planner for kind={kind:?}; \ - second registration usually indicates a misconfigured test harness" - ); - } - guard.insert(kind, planner); -} - -/// Look up the contrib planner registered for `kind`, or `None` if no contrib is loaded -/// for that operator. The native dispatcher arm in `planner.rs` uses this to route -/// `OpStruct::ContribOp` payloads. -pub fn lookup_contrib_planner_by_kind(kind: &str) -> Option> { - let guard = registry() - .read() - .expect("contrib planner registry poisoned"); - guard.get(kind).cloned() -} - -/// Return a snapshot of all registered contrib kinds. Useful for diagnostics and tests. -pub fn registered_contrib_kinds() -> Vec { - let guard = registry() - .read() - .expect("contrib planner registry poisoned"); - let mut kinds: Vec = guard.keys().cloned().collect(); - kinds.sort(); - kinds -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::execution::operators::{ExecutionError, ScanExec, ShuffleScanExec}; - use crate::execution::planner::PhysicalPlanner; - use crate::execution::spark_plan::SparkPlan; - use datafusion_comet_proto::spark_operator::Operator; - use jni::objects::{Global, JObject}; - - /// Trivial test planner that returns a not-implemented error. We don't need a real - /// ExecutionPlan to validate the registry; only identity-by-kind matters. 
- struct NoopBuilder(&'static str); - impl OperatorBuilder for NoopBuilder { - fn build( - &self, - _spark_plan: &Operator, - _inputs: &mut Vec>>>, - _partition_count: usize, - _planner: &PhysicalPlanner, - ) -> Result<(Vec, Vec, Arc), ExecutionError> { - Err(ExecutionError::GeneralError(format!( - "NoopBuilder({}) -- registry round-trip ok", - self.0 - ))) - } - } - - #[test] - fn register_and_lookup_round_trips_by_kind() { - register_contrib_planner("test-kind-a", Arc::new(NoopBuilder("a"))); - register_contrib_planner("test-kind-b", Arc::new(NoopBuilder("b"))); - - assert!(lookup_contrib_planner_by_kind("test-kind-a").is_some()); - assert!(lookup_contrib_planner_by_kind("test-kind-b").is_some()); - assert!(lookup_contrib_planner_by_kind("test-kind-c").is_none()); - - let kinds = registered_contrib_kinds(); - assert!(kinds.contains(&"test-kind-a".to_string())); - assert!(kinds.contains(&"test-kind-b".to_string())); - } - - #[test] - fn registering_existing_kind_replaces() { - register_contrib_planner("test-replace-kind", Arc::new(NoopBuilder("first"))); - // Second registration should not panic; replaces silently (with a warn-level log). - register_contrib_planner("test-replace-kind", Arc::new(NoopBuilder("second"))); - assert!(lookup_contrib_planner_by_kind("test-replace-kind").is_some()); - } -} diff --git a/native/core/src/lib.rs b/native/core/src/lib.rs index 7d0b6a5454..4d74a7f52f 100644 --- a/native/core/src/lib.rs +++ b/native/core/src/lib.rs @@ -29,6 +29,13 @@ extern crate core; #[macro_use] extern crate datafusion_comet_jni_bridge; +// Pull in contrib crates so their #[ctor] registration runs when libcomet is loaded. +// Each is gated by a Cargo feature flag (see `[features]` in core's Cargo.toml). With the +// feature off the `extern crate` line is removed by cfg and zero bytes of the contrib end +// up in the built cdylib. 
+#[cfg(feature = "contrib-example")] +extern crate comet_contrib_example; + use jni::{ objects::{JClass, JString}, EnvUnowned, From 5cb7099a868936619cb2b4217d9b6876020242d7 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 07:22:50 -0400 Subject: [PATCH 07/27] feat(contrib): JVM half of contrib/example reference module PR1.7 part 2 (completes PR1.7). The Rust half landed in d1553b55; this commit lands the Maven module + Scala extension + ServiceLoader entry so `mvn install` from the repo root produces a published comet-contrib-example-* JAR alongside core's comet-spark-*. New files: contrib/example/pom.xml Maven module. Inherits the parent pom; depends on comet-spark (provided scope, transitive Spark/Scala). Disables the parent pom's BanDuplicateClasses enforcer for this contrib because the parent rule was tuned for core (comet-spark shades scala-collection-compat and Spark drags in the same classes unshaded -- a per-module override is cleaner than reshaping the parent rule for every future contrib). contrib/example/src/main/scala/.../ExampleScanRuleExtension.scala Trivial CometScanRuleExtension impl. matchesV1 keys on a test-only marker option so the SPI can be exercised deterministically; matchesV2 / transformV1 / transformV2 inherit trait defaults. Real contribs replace these with their own file-format probes + native dispatch. contrib/example/src/main/resources/META-INF/services/ org.apache.comet.spi.CometScanRuleExtension ServiceLoader manifest entry. This is the single line that makes the contrib JVM-discoverable. contrib/example/src/test/scala/.../ExampleScanRuleExtensionSuite.scala Two tests: 1. ServiceLoader discovers ExampleScanRuleExtension via CometExtensionRegistry.load() with no other configuration. 2. matchesV1 honours the test marker option. Root pom.xml: Adds `contrib/example` to the modules list so `mvn install` from the repo root builds and installs the contrib alongside core. 
Build state: `mvn install -DskipTests -Pspark-3.5` builds the new module successfully. Native-side Rust artifact (rlib linked into libcomet via Cargo feature `contrib-example`) was already committed in d1553b55. PR1.7 closed. Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/pom.xml | 126 ++++++++++++++++++ ...rg.apache.comet.spi.CometScanRuleExtension | 1 + .../example/ExampleScanRuleExtension.scala | 75 +++++++++++ .../ExampleScanRuleExtensionSuite.scala | 84 ++++++++++++ pom.xml | 8 ++ .../comet/spi/CometExtensionRegistry.scala | 24 ++-- .../spi/CometOperatorSerdeExtension.scala | 20 +-- .../comet/spi/CometScanRuleExtension.scala | 35 +++-- 8 files changed, 332 insertions(+), 41 deletions(-) create mode 100644 contrib/example/pom.xml create mode 100644 contrib/example/src/main/resources/META-INF/services/org.apache.comet.spi.CometScanRuleExtension create mode 100644 contrib/example/src/main/scala/org/apache/comet/contrib/example/ExampleScanRuleExtension.scala create mode 100644 contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala diff --git a/contrib/example/pom.xml b/contrib/example/pom.xml new file mode 100644 index 0000000000..99b8f3f12a --- /dev/null +++ b/contrib/example/pom.xml @@ -0,0 +1,126 @@ + + + + + + + 4.0.0 + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.17.0-SNAPSHOT + ../../pom.xml + + + + comet-contrib-example-spark${spark.version.short}_${scala.binary.version} + comet-contrib-example + + + + false + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + + + + no-duplicate-declared-dependencies + + enforce + + + true + + + + + + + + + + + org.apache.datafusion + comet-spark-spark${spark.version.short}_${scala.binary.version} + ${project.version} + provided + + + + + + org.apache.spark + spark-sql_${scala.binary.version} + test + + + org.scala-lang + scala-library + test + + + + org.scalatest + scalatest_${scala.binary.version} + 
test + + + org.scalatestplus + junit-4-13_${scala.binary.version} + test + + + diff --git a/contrib/example/src/main/resources/META-INF/services/org.apache.comet.spi.CometScanRuleExtension b/contrib/example/src/main/resources/META-INF/services/org.apache.comet.spi.CometScanRuleExtension new file mode 100644 index 0000000000..13c4689816 --- /dev/null +++ b/contrib/example/src/main/resources/META-INF/services/org.apache.comet.spi.CometScanRuleExtension @@ -0,0 +1 @@ +org.apache.comet.contrib.example.ExampleScanRuleExtension diff --git a/contrib/example/src/main/scala/org/apache/comet/contrib/example/ExampleScanRuleExtension.scala b/contrib/example/src/main/scala/org/apache/comet/contrib/example/ExampleScanRuleExtension.scala new file mode 100644 index 0000000000..6ef10587e0 --- /dev/null +++ b/contrib/example/src/main/scala/org/apache/comet/contrib/example/ExampleScanRuleExtension.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.contrib.example + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.execution.datasources.HadoopFsRelation + +import org.apache.comet.spi.CometScanRuleExtension + +/** + * Worked-reference `CometScanRuleExtension` for new contrib authors. This implementation is + * intentionally trivial: it matches a V1 scan only when the test-only marker option is set, and it + * never transforms anything. What it proves end-to-end at runtime is: + * + * 1. `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` discovery works: + * `CometExtensionRegistry.load()` finds this class via ServiceLoader as soon as the contrib + * JAR is on the classpath. + * + * 2. The wiring in `CometScanRule.transformV1Scan` / `transformV2Scan` actually iterates + * extensions: even though this one returns `false` from `matchesV2` and, unless the marker option + * is set, from `matchesV1`, the registry call happens for every scan. + * + * Real contribs replace the marker check in `matchesV1` with real probes against the scan's + * `relation.fileFormat` (e.g. Delta would detect `DeltaParquetFileFormat`) and override `transformV1` with + * the contrib's native dispatch. + * + * The matching native-side counterpart lives in `contrib/example/native/src/lib.rs` -- it + * registers a `ContribOperatorPlanner` under the same kind string used by any future Scala-side + * serde this example might add. + */ +class ExampleScanRuleExtension extends CometScanRuleExtension with Logging { + override val name: String = "example" + + override def matchesV1(relation: HadoopFsRelation): Boolean = { + // Sentinel: only match if a synthetic option declares this contrib should claim the + // scan. Production contribs replace this with a real file-format probe; here we want + // the test to be able to opt in deterministically. + relation.options + .get(ExampleScanRuleExtension.MarkerOptionKey) + .contains(ExampleScanRuleExtension.MarkerOptionValue) + } + + // matchesV2 / transformV1 / transformV2 inherit the trait defaults (`false` / `None`).
+ // This example only demonstrates V1 discovery. A real contrib would override the + // transform methods to build its native plan. +} + +object ExampleScanRuleExtension { + + /** + * Test-only option key. A Spark read can set this on `HadoopFsRelation.options` to trigger + * `ExampleScanRuleExtension.matchesV1` and verify the SPI is being consulted. + */ + val MarkerOptionKey: String = "comet.contrib.example.marker" + + /** Sentinel value the marker option must equal. */ + val MarkerOptionValue: String = "match" +} diff --git a/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala b/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala new file mode 100644 index 0000000000..314acd9107 --- /dev/null +++ b/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.contrib.example + +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.comet.spi.CometExtensionRegistry + +/** + * Verifies the JVM half of the contrib SPI by going through public API only: + * + * * `CometExtensionRegistry.load()` discovers this contrib via its + * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` entry. * The discovered + * extension is the `ExampleScanRuleExtension` defined in this module. * `matchesV1` honours the + * test-only marker option so a real `CometScanRule.transformV1Scan` integration test could + * deterministically opt in. + * + * Native-side dispatch (the `OpStruct::ContribOp` arm in core's planner that delegates to the + * example's Rust `NoOpPlanner`) is exercised by core's own integration tests when built with the + * `contrib-example` Cargo feature on -- not duplicated here. + */ +class ExampleScanRuleExtensionSuite extends AnyFunSuite { + + test("CometExtensionRegistry discovers ExampleScanRuleExtension via ServiceLoader") { + // The registry caches discovery results across calls; reset so this test sees a + // deterministic load against the current test classpath. + CometExtensionRegistry.resetForTesting() + CometExtensionRegistry.load() + + val found = CometExtensionRegistry.scanExtensions.find(_.name == "example") + assert(found.isDefined, "ServiceLoader should have discovered the example contrib") + assert(found.get.isInstanceOf[ExampleScanRuleExtension]) + } + + test("matchesV1 returns true only when the marker option is set") { + val ext = new ExampleScanRuleExtension + + // We construct a minimal HadoopFsRelation just enough to call matchesV1. The trait + // method only reads `relation.options` so we don't need a real file format/schema. 
+ val sparkSession = org.apache.spark.sql.SparkSession + .builder() + .master("local[1]") + .appName("ExampleScanRuleExtensionSuite") + .getOrCreate() + try { + val relationWithoutMarker = new org.apache.spark.sql.execution.datasources.HadoopFsRelation( + location = new org.apache.spark.sql.execution.datasources.InMemoryFileIndex( + sparkSession, + Seq.empty, + Map.empty, + None), + partitionSchema = new org.apache.spark.sql.types.StructType(), + dataSchema = new org.apache.spark.sql.types.StructType(), + bucketSpec = None, + fileFormat = new org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat(), + options = Map.empty)(sparkSession) + assert(!ext.matchesV1(relationWithoutMarker), "no marker -> no match") + + val relationWithMarker = relationWithoutMarker.copy(options = Map( + ExampleScanRuleExtension.MarkerOptionKey -> + ExampleScanRuleExtension.MarkerOptionValue))(sparkSession) + assert(ext.matchesV1(relationWithMarker), "marker present -> match") + } finally { + sparkSession.stop() + } + } +} diff --git a/pom.xml b/pom.xml index 7419fecc92..7660b1976c 100644 --- a/pom.xml +++ b/pom.xml @@ -38,6 +38,14 @@ under the License. common spark spark-integration + + contrib/example diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index be74571e64..5d17e0468e 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -27,19 +27,19 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.internal.Logging /** - * Process-wide singleton that discovers and exposes contrib extensions found on the - * classpath via `java.util.ServiceLoader`. + * Process-wide singleton that discovers and exposes contrib extensions found on the classpath via + * `java.util.ServiceLoader`. 
* * Discovery happens once per JVM, idempotent: the first `load()` call enumerates every * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` and - * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource on the - * Comet classloader. Subsequent calls are no-ops. + * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource on the Comet + * classloader. Subsequent calls are no-ops. * - * `CometSparkSessionExtensions.apply` calls `load()` during Comet extension installation - * (PR1.6) so contrib JARs are picked up automatically when present. + * `CometSparkSessionExtensions.apply` calls `load()` during Comet extension installation (PR1.6) + * so contrib JARs are picked up automatically when present. * - * Failures to instantiate individual extensions are logged but do NOT fail Comet - * startup -- a misconfigured contrib JAR shouldn't take down the whole Spark session. + * Failures to instantiate individual extensions are logged but do NOT fail Comet startup -- a + * misconfigured contrib JAR shouldn't take down the whole Spark session. */ object CometExtensionRegistry extends Logging { @@ -48,8 +48,8 @@ object CometExtensionRegistry extends Logging { @volatile private var serdeExts: Seq[CometOperatorSerdeExtension] = Seq.empty /** - * Discover contrib extensions on the classpath. Idempotent. Safe to call from multiple - * threads (only the first call performs discovery). + * Discover contrib extensions on the classpath. Idempotent. Safe to call from multiple threads + * (only the first call performs discovery). */ def load(): Unit = { if (loaded.compareAndSet(false, true)) { @@ -71,8 +71,8 @@ object CometExtensionRegistry extends Logging { def serdeExtensions: Seq[CometOperatorSerdeExtension] = serdeExts /** - * Test-only: reset the registry to the empty state. Lets unit tests re-run discovery - * with a different classpath / overridden services. Not for production use. 
+ * Test-only: reset the registry to the empty state. Lets unit tests re-run discovery with a + * different classpath / overridden services. Not for production use. */ private[comet] def resetForTesting(): Unit = { loaded.set(false) diff --git a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala index dc56ccbdce..9b180523ef 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala @@ -25,14 +25,14 @@ import org.apache.comet.serde.CometOperatorSerde /** * SPI hook that lets a contrib extension contribute additional operator-to-native serdes to - * `CometExecRule`. Used when a contrib needs to translate a contrib-specific physical - * operator (e.g. `CometDeltaNativeScanExec` for Delta) into a native plan -- the contrib - * provides the serde, and `CometExecRule` calls it during plan transformation. + * `CometExecRule`. Used when a contrib needs to translate a contrib-specific physical operator + * (e.g. `CometDeltaNativeScanExec` for Delta) into a native plan -- the contrib provides the + * serde, and `CometExecRule` calls it during plan transformation. * * `CometExecRule` discovers implementations via `CometExtensionRegistry.serdeExtensions` * (ServiceLoader-backed). Each contrib JAR ships a - * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource listing - * its extension class. + * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource listing its + * extension class. * * Implementations MUST be stateless / safe to share across query executions. */ @@ -42,12 +42,12 @@ trait CometOperatorSerdeExtension { def name: String /** - * Mapping of SparkPlan class -> serde. The contrib lists every operator class it knows - * how to translate to native. 
`CometExecRule` merges these mappings with its built-in - * `allExecs` to dispatch by class identity at conversion time. + * Mapping of SparkPlan class -> serde. The contrib lists every operator class it knows how to + * translate to native. `CometExecRule` merges these mappings with its built-in `allExecs` to + * dispatch by class identity at conversion time. * - * Convention: each contrib's mapping should reference only classes the contrib itself - * defines, so two contribs never claim ownership of the same operator class. + * Convention: each contrib's mapping should reference only classes the contrib itself defines, + * so two contribs never claim ownership of the same operator class. */ def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] } diff --git a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala index 9789378878..0b2170ad26 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala @@ -25,26 +25,24 @@ import org.apache.spark.sql.execution.datasources.HadoopFsRelation import org.apache.spark.sql.execution.datasources.v2.BatchScanExec /** - * SPI hook that lets a contrib extension intercept scan transformation in - * `CometScanRule`. Contribs typically use this to recognise a specific table format (Delta, - * Hudi, etc.) and route it through a contrib-specific native execution path. + * SPI hook that lets a contrib extension intercept scan transformation in `CometScanRule`. + * Contribs typically use this to recognise a specific table format (Delta, Hudi, etc.) and route + * it through a contrib-specific native execution path. * * `CometScanRule` discovers implementations via `CometExtensionRegistry.scanExtensions` * (ServiceLoader-backed) and offers each candidate scan to every registered extension in * registration order. 
The first extension whose [[matches]] returns `true` wins -- its - * [[transformV1]] / [[transformV2]] is called and the returned plan replaces the scan - * branch. If no extension matches, the core's existing file-format dispatch handles the - * scan as before. + * [[transformV1]] / [[transformV2]] is called and the returned plan replaces the scan branch. If + * no extension matches, the core's existing file-format dispatch handles the scan as before. * * Contribs are discovered via the standard Java ServiceLoader. Each contrib JAR ships a - * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` resource listing its - * extension class. + * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` resource listing its extension + * class. * - * Implementations MUST be safe to invoke from `CometScanRule`'s `apply` method -- - * specifically: pure, stateless, side-effect-free with respect to the plan tree (any state - * needed should be derived from `scanExec` / `relation` / the surrounding plan). The - * registry caches instances across plans, so per-plan state on the implementation will - * leak between queries. + * Implementations MUST be safe to invoke from `CometScanRule`'s `apply` method -- specifically: + * pure, stateless, side-effect-free with respect to the plan tree (any state needed should be + * derived from `scanExec` / `relation` / the surrounding plan). The registry caches instances + * across plans, so per-plan state on the implementation will leak between queries. */ trait CometScanRuleExtension { @@ -52,9 +50,9 @@ trait CometScanRuleExtension { def name: String /** - * Whether this extension wants to handle the given V1 scan. Implementations should make a - * cheap decision here (typically file-format class-name probe) so non-matching paths add - * no per-scan overhead. + * Whether this extension wants to handle the given V1 scan. 
Implementations should make a cheap + * decision here (typically file-format class-name probe) so non-matching paths add no per-scan + * overhead. * * Default returns false; override `matchesV1` and `transformV1` for V1 scan support. */ @@ -63,9 +61,8 @@ trait CometScanRuleExtension { /** * Transform the matched V1 scan. Called only when `matchesV1` returned true. * - * Returning `None` means "I matched but ultimately can't accelerate this one" -- the - * core falls back to its existing file-format dispatch. Returning `Some(plan)` replaces - * the scan subtree. + * Returning `None` means "I matched but ultimately can't accelerate this one" -- the core falls + * back to its existing file-format dispatch. Returning `Some(plan)` replaces the scan subtree. */ def transformV1( plan: SparkPlan, From 8508ec506717e3a661a89c499376cdf40418e96d Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 08:21:30 -0400 Subject: [PATCH 08/27] docs(contrib): add contributor guide for authoring contribs PR1.8 from docs/contrib-delta-migration-plan.md. New doc at docs/source/contributor-guide/contrib-extensions.md walks future contrib authors through: * Architecture overview (JVM JAR + native rlib linked into libcomet via Cargo feature flag, single cdylib at runtime, ContribOp proto envelope as the dispatch hop). * The SPI surface on both sides: traits, registry, error type. * Required files for a new contrib, mirroring contrib/example/. * The three existing-file edits needed (root pom, native workspace, core Cargo features). * End-to-end wire-format flow. * Cargo feature gating semantics (--no-default-features for slim builds; the JVM side is always classpath-driven). * Testing recommendations modeled on contrib/example/'s suite. * Cross-references to the migration plan and the SPI crate. Closes PR1.8. With this commit, all eight PR1 deliverables from docs/contrib-delta-migration-plan.md are in place on the comet-contrib-spi branch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../contributor-guide/contrib-extensions.md | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 docs/source/contributor-guide/contrib-extensions.md diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md new file mode 100644 index 0000000000..a35601961e --- /dev/null +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -0,0 +1,160 @@ + +# Authoring a Comet contrib extension + +A Comet *contrib* is a self-contained extension that lives alongside core but ships +independently. Contribs add support for a specific table format or operator class without +core having to know about them at build time. The first contrib in the tree is +[`contrib/example/`](https://github.com/apache/datafusion-comet/tree/main/contrib/example); +read it top-to-bottom as the worked reference, then come back here for the architectural +context. + +This document covers how the SPI is shaped, which integration points are available, and +the concrete files a new contrib has to ship. + +## Architecture at a glance + +Each contrib has two halves that ship as separate artifacts but are wired together at +build time: + +- **JVM half** — a separate Maven JAR (`comet-contrib-<name>-spark${spark.version.short}_${scala.binary.version}`) + containing Scala / Java extension classes. Discovered at runtime via + `java.util.ServiceLoader` from the contrib JAR's `META-INF/services/` entries. + +- **Native half** — a Rust `rlib` crate (NOT `cdylib`) that is **linked INTO core's + `libcomet`** at build time when the matching Cargo feature on core is enabled. There is + exactly one Comet native library at runtime; the contrib's `#[ctor]` registers its + operator planners during library load. + +The wire format between JVM and native uses a single generic envelope on the operator +proto, `ContribOp { kind, payload }`.
Core's planner dispatches by `kind`; the contrib's +native crate registers planners against the same `kind` string the contrib's JVM code +writes into the proto. + +## SPI surface + +### JVM side: `org.apache.comet.spi` + +| Trait / Object | Purpose | +|---|---| +| `CometScanRuleExtension` | Intercept scan-tree transformation. Override `matchesV1` / `transformV1` for V1 `FileSourceScanExec`; `matchesV2` / `transformV2` for V2 `BatchScanExec`. The first matching extension wins; returning `None` falls back to core's existing file-format dispatch. | +| `CometOperatorSerdeExtension` | Contribute additional `SparkPlan` class → `CometOperatorSerde` mappings to `CometExecRule`. Used when the contrib has its own physical operator (e.g. a contrib-specific scan exec) that needs native serialization. | +| `CometExtensionRegistry` | Process-wide singleton. `load()` is called once during `CometSparkSessionExtensions.apply`; subsequent calls are no-ops. Test-only `resetForTesting()` for unit tests that need a clean registry. | + +### Native side: `comet-contrib-spi` crate + +| Item | Purpose | +|---|---| +| `trait ContribOperatorPlanner` | Implemented by the contrib's native crate. The `plan(payload, children) -> Arc<dyn ExecutionPlan>` method receives the contrib-private payload bytes from the ContribOp envelope and the already-built native children. | +| `register_contrib_planner(kind, planner)` | Process-wide registry. Called from the contrib's `#[ctor::ctor]` at library load. | +| `lookup_contrib_planner_by_kind(kind)` | Used by core's planner; contribs rarely call it directly. | +| `ContribError` | Minimal error type. Core converts to its own `ExecutionError` at the dispatch site. | + +The SPI crate is intentionally a thin leaf: it has no dependencies on core. This is what +breaks the would-be cyclic dependency (core links contribs via Cargo feature flags; +contribs need the SPI types — both depend on a third leaf crate instead of each other).
+ +## Required files (mirror `contrib/example/` exactly) + +``` +contrib/<name>/ + pom.xml ← Maven module + src/main/scala/org/apache/comet/contrib/<name>/ + <Name>Extension.scala ← CometScanRuleExtension / CometOperatorSerdeExtension impl + src/main/resources/META-INF/services/ + org.apache.comet.spi.CometScanRuleExtension ← one line per extension class + org.apache.comet.spi.CometOperatorSerdeExtension ← (only if you implement serdes) + src/test/scala/org/apache/comet/contrib/<name>/ + <Name>ExtensionSuite.scala ← integration test + native/ + Cargo.toml ← rlib crate, workspace = "../../../native" + src/lib.rs ← ContribOperatorPlanner impl + #[ctor] registration +``` + +Plus three edits to existing files: + +- **Root `pom.xml`** — add `<module>contrib/<name></module>` so `mvn install` builds the + contrib. +- **`native/Cargo.toml`** — add `../contrib/<name>/native` to the workspace `members` + list (NOT `default-members` — contribs are consumed via core's feature flags). +- **`native/core/Cargo.toml`** — add a `contrib-<name>` feature gate and a matching + optional `dep:<crate>` entry. Add the feature to `default = [...]` if you want it on by + default in release builds. + +## Wire-format flow + +1. The contrib's Scala code intercepts a `FileSourceScanExec` (or `BatchScanExec`) + matching its file format. +2. It builds a contrib-private proto message (the payload format is the contrib's + choice). +3. It wraps the payload bytes in `ContribOp(kind = "<name>-<op>", payload = + <payload bytes>)` and sets that on the operator proto's `op_struct` field. +4. The proto is shipped through JNI to native. +5. Core's native planner sees `OpStruct::ContribOp`, looks up the planner by `kind`, + and calls `planner.plan(payload, children)`. +6. The contrib's native crate decodes `payload` into its own proto type and returns an + `Arc<dyn ExecutionPlan>`. +7. Core wraps the result in a `SparkPlan` and continues planning. + +## Cargo feature gate + +Each contrib's native rlib is wired into core via a feature flag.
Build core with: + +```bash +# Default release build: all in-tree contribs enabled (contrib-example, future ones too) +cargo build + +# Slim build: zero contrib code in libcomet +cargo build --no-default-features +``` + +The JVM side is **always** conditional: the contrib JAR is its own artifact, and Spark +only picks it up when it's on the classpath. So even with the Cargo feature on, a user +who doesn't add the contrib JAR sees no behaviour change — the contrib's native planner +sits dormant in the registry, waiting for a JVM serde that never calls it. + +## Testing + +`contrib/example/`'s test suite demonstrates the recommended pattern: + +- A unit test that calls `CometExtensionRegistry.load()` and asserts the contrib's + extension is discovered. This catches packaging mistakes (missing `META-INF/services`, + wrong class name, etc.). +- Per-method unit tests for the extension's `matches*` and `transform*` logic. + +For a contrib with a real native operator, additionally write an integration test that: + +- Builds a `ContribOp` payload Scala-side. +- Submits the plan through a real `SparkSession` configured with the contrib JAR on the + classpath. +- Asserts the contrib's native planner was reached (typically by checking against a + result the no-op planner would not produce). + +Core's own regression suite for the SPI dispatch path uses the example contrib as its +test fixture, so PR1's CI doubles as smoke coverage for any future contribs. + +## See also + +- [`docs/contrib-delta-migration-plan.md`](../../contrib-delta-migration-plan.md) — + the architectural rationale + the two-PR plan that introduced the SPI. +- [`contrib/example/`](https://github.com/apache/datafusion-comet/tree/main/contrib/example) — + the worked reference. +- [`native/contrib-spi/`](https://github.com/apache/datafusion-comet/tree/main/native/contrib-spi) — + the leaf SPI crate. 
From e018076d4b2a570a5f74506cfe903f7c2612be73 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 08:40:13 -0400 Subject: [PATCH 09/27] feat(contrib): SPI refinements from Delta-port confidence check Three additions surfaced by porting Delta onto the SPI as a local confidence check (port not committed; see PR1-delta-port-findings.md): 1. CometScanRuleExtension.preTransform tree-level hook Default-identity method that runs once per plan in CometScanRule._apply before per-scan dispatch. Lets contribs undo wrapper rewrites their own Catalyst strategies applied (Delta's PreprocessTableWithDVs is the motivating case; its strategy wraps DV-bearing scans in Project(Filter(...)) referencing a synthetic column Comet's reader can't produce). Without this hook, Delta couldn't move into a contrib at all without losing the unwrap step. Shared state between preTransform and transformV1 is the contrib's problem -- the recommended pattern (documented) is Spark's TreeNodeTag mechanism, which the existing CometSpark34AqeDppFallbackRule already uses. 2. Proto layer in contrib/example/ Each contrib now ships its own .proto schema, build.rs running prost-build, and gitignored src/generated/. contrib/example/ carries a trivial ExampleConstantScan { row_count } message; a new ConstantScanPlanner registered under kind="example-constant-scan" decodes the payload via prost::Message::decode and returns an EmptyExec sized by the field. Three new tests: * ctor registers both planners * payload decode-and-build round-trip * bad payload surfaces ContribError::BadPayload This makes the worked reference complete -- future contrib authors have a runnable proto setup to copy. 3. Class-subclass convention documented CometExecRule dispatches by op.getClass. 
Documented the convention that contribs needing a custom executor should define their own CometScanExec subclass (or similar) and register the serde keyed on that class, rather than reusing a generic class with a stringly-typed scanImpl tag (the legacy Delta pattern that has no analogue in the class-based SPI dispatch). Files touched: spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala contrib/example/native/{build.rs, Cargo.toml, src/lib.rs} contrib/example/native/src/proto/example_op.proto docs/source/contributor-guide/contrib-extensions.md .gitignore + native/Cargo.lock Build state: cargo check across core + contrib-spi + contrib-example clean. cargo test -p comet-contrib-example: 3/3 pass. cargo test -p comet-contrib-spi: 1/1 pass. mvn install all modules: clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + contrib/example/native/Cargo.toml | 7 +- contrib/example/native/build.rs | 39 ++++++ contrib/example/native/src/lib.rs | 125 +++++++++++++++--- .../example/native/src/proto/example_op.proto | 35 +++++ .../contributor-guide/contrib-extensions.md | 39 +++++- native/Cargo.lock | 2 + .../apache/comet/rules/CometScanRule.scala | 13 +- .../comet/spi/CometScanRuleExtension.scala | 22 +++ 9 files changed, 259 insertions(+), 24 deletions(-) create mode 100644 contrib/example/native/build.rs create mode 100644 contrib/example/native/src/proto/example_op.proto diff --git a/.gitignore b/.gitignore index a3c97ff992..9af7d91cc6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ metastore_db/ spark-warehouse/ dependency-reduced-pom.xml native/proto/src/generated +contrib/example/native/src/generated prebuild .flattened-pom.xml rat.txt diff --git a/contrib/example/native/Cargo.toml b/contrib/example/native/Cargo.toml index b86728deb2..08acde3480 100644 --- a/contrib/example/native/Cargo.toml +++ b/contrib/example/native/Cargo.toml @@ -38,9 +38,14 @@ crate-type = 
["rlib"] [dependencies] # Depend on the thin SPI crate, NOT on core. This is what breaks the cycle: core -# depends on contribs (Cargo feature → rlib link); both depend on contrib-spi; nothing +# depends on contribs (Cargo feature -> rlib link); both depend on contrib-spi; nothing # depends back on core from a contrib. comet-contrib-spi = { path = "../../../native/contrib-spi" } datafusion = { workspace = true } +prost = "0.14.3" ctor = "0.4" log = "0.4" + +# Each contrib runs its own prost-build over its own .proto files (see build.rs). +[build-dependencies] +prost-build = "0.14.3" diff --git a/contrib/example/native/build.rs b/contrib/example/native/build.rs new file mode 100644 index 0000000000..236360962e --- /dev/null +++ b/contrib/example/native/build.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Build script for the example contrib's proto. Mirrors `native/proto/build.rs`. +//! +//! Each contrib runs its own `prost-build` invocation against its own `.proto` files. +//! This keeps core's proto crate format-agnostic and lets contribs evolve their wire +//! format independently. The generated Rust types live under `src/generated/` and are +//! gitignored. 
+ +use std::{fs, io::Result, path::Path}; + +fn main() -> Result<()> { + println!("cargo:rerun-if-changed=src/proto/"); + + let out_dir = "src/generated"; + if !Path::new(out_dir).is_dir() { + fs::create_dir(out_dir)?; + } + + prost_build::Config::new() + .out_dir(out_dir) + .compile_protos(&["src/proto/example_op.proto"], &["src/proto"])?; + Ok(()) +} diff --git a/contrib/example/native/src/lib.rs b/contrib/example/native/src/lib.rs index 8857deff59..7d076a8e0d 100644 --- a/contrib/example/native/src/lib.rs +++ b/contrib/example/native/src/lib.rs @@ -17,14 +17,26 @@ //! Worked reference implementation of a Comet contrib extension. //! -//! Registers a single `ContribOperatorPlanner` under `kind = "example-no-op"`. The -//! planner is intentionally trivial: it returns a clear `ContribError::Plan` so tests can -//! verify the full dispatch chain (JVM serde → ContribOp envelope → JNI → native planner -//! → contrib registry → this planner) without needing to actually execute anything. +//! Demonstrates two patterns future contribs will follow: //! -//! Real contribs (Delta, Hudi, etc.) replace `NoOpPlanner::plan` with a function that -//! decodes the contrib's own proto message from `payload` and constructs an -//! `ExecutionPlan` for the contrib's native operator. +//! 1. **Dispatch wiring** -- registers a `ContribOperatorPlanner` against a stable +//! `kind` string at lib-init time via `#[ctor::ctor]`. The planner is called from +//! core's `OpStruct::ContribOp` dispatcher with the contrib's payload bytes. +//! +//! 2. **Proto layer** -- the contrib has its own `proto/` directory with its own +//! `.proto` schema (`example_op.proto`). `build.rs` runs `prost-build` over it; +//! generated Rust types live under `src/generated/` (gitignored). The planner +//! decodes the payload via `prost::Message::decode` -- the same way real contribs +//! (Delta etc.) will. +//! +//! Two planner kinds are registered: +//! +//! * `example-no-op` -- returns a sentinel error. 
Tests use this to verify +//! the dispatch chain end-to-end. +//! * `example-constant-scan` -- decodes an `ExampleConstantScan` payload, returns +//! an empty-schema `EmptyExec`; `row_count` is only logged. +//! Real contribs (Delta) follow the same pattern, +//! just with their own message and operator. //! //! The whole crate is gated by `native/core/Cargo.toml`'s `contrib-example` feature flag. //! Build core without that feature (`cargo build --no-default-features`) and zero bytes @@ -32,19 +44,28 @@ use std::sync::Arc; -use comet_contrib_spi::{ - register_contrib_planner, ContribError, ContribOperatorPlanner, -}; +use comet_contrib_spi::{register_contrib_planner, ContribError, ContribOperatorPlanner}; +use datafusion::arrow::datatypes::Schema; +use datafusion::physical_plan::empty::EmptyExec; use datafusion::physical_plan::ExecutionPlan; +use prost::Message; -/// Stable identifier the example registers under. The Scala side writes this same string -/// into `ContribOp.kind` when building a payload for the example operator. Convention: -/// `-`. +/// Generated Rust types for the contrib's proto schema. `build.rs` writes the module +/// here at compile time; `src/generated/` is gitignored. +pub mod proto { + include!(concat!("generated/", "comet.contrib.example.rs")); +} + +/// Sentinel kind used by tests to verify dispatch reaches this contrib at all. pub const EXAMPLE_NO_OP_KIND: &str = "example-no-op"; -/// A planner that intentionally does no plan-building work. It exists only to prove the -/// dispatch chain is wired up correctly: tests construct an Operator with this kind, ship -/// it through JNI, and assert that the returned error mentions this string. +/// Kind for the proto-decoding constant-scan planner. Demonstrates the +/// proto-decode-and-build path real contribs will use. +pub const EXAMPLE_CONSTANT_SCAN_KIND: &str = "example-constant-scan"; + +/// A planner that intentionally does no plan-building work. 
Returns a sentinel error so +/// dispatch tests can assert the message reaches this code path. The payload is ignored; +/// children are ignored. struct NoOpPlanner; impl ContribOperatorPlanner for NoOpPlanner { @@ -60,13 +81,77 @@ impl ContribOperatorPlanner for NoOpPlanner { } } -/// Registers `NoOpPlanner` against `EXAMPLE_NO_OP_KIND` at library-init time. Called by -/// the linker before `main`/`JNI_OnLoad` because of `#[ctor::ctor]`. Comet's main -/// `libcomet` is what gets loaded by the JVM; this constructor runs during its init. +/// Decodes the payload as an `ExampleConstantScan` proto and returns an `EmptyExec` +/// with a schema-less output. Real contribs use the same decode-then-build pattern -- +/// they just decode richer messages and return richer execs. +struct ConstantScanPlanner; + +impl ContribOperatorPlanner for ConstantScanPlanner { + fn plan( + &self, + payload: &[u8], + _children: Vec>, + ) -> Result, ContribError> { + let msg = proto::ExampleConstantScan::decode(payload).map_err(|e| { + ContribError::BadPayload(format!( + "ExampleConstantScan: decode failed: {e}" + )) + })?; + log::info!( + "comet-contrib-example: ConstantScanPlanner produces {} synthetic rows", + msg.row_count + ); + // For the worked example we don't actually populate rows -- EmptyExec is fine to + // demonstrate the build path. Real contribs return their domain-specific exec + // (Delta returns the file scan + DV filter wrap). + Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty())))) + } +} + +/// Registers all of the example contrib's planners against the contrib registry at +/// library-init time. `#[ctor::ctor]` runs this constructor before +/// `main`/`JNI_OnLoad`. Comet's `libcomet` cdylib is the single library the JVM loads; +/// this constructor runs during that one library's init. 
#[ctor::ctor] fn register() { log::info!( - "comet-contrib-example: registering ContribOperatorPlanner kind={EXAMPLE_NO_OP_KIND:?}" + "comet-contrib-example: registering ContribOperatorPlanners \ + (no-op={EXAMPLE_NO_OP_KIND:?}, constant-scan={EXAMPLE_CONSTANT_SCAN_KIND:?})" ); register_contrib_planner(EXAMPLE_NO_OP_KIND, Arc::new(NoOpPlanner)); + register_contrib_planner(EXAMPLE_CONSTANT_SCAN_KIND, Arc::new(ConstantScanPlanner)); +} + +#[cfg(test)] +mod tests { + use super::*; + use comet_contrib_spi::lookup_contrib_planner_by_kind; + + #[test] + fn ctor_registers_both_planners() { + // The #[ctor] above runs at process-init time for test binaries too. + assert!(lookup_contrib_planner_by_kind(EXAMPLE_NO_OP_KIND).is_some()); + assert!(lookup_contrib_planner_by_kind(EXAMPLE_CONSTANT_SCAN_KIND).is_some()); + } + + #[test] + fn constant_scan_decodes_payload_and_builds() { + let payload = proto::ExampleConstantScan { row_count: 42 }.encode_to_vec(); + let planner = ConstantScanPlanner; + let plan = planner.plan(&payload, vec![]).expect("decode + build"); + // We don't care about the concrete exec type beyond "it built something"; + // confirms the decode path works end-to-end. + assert!(plan.schema().fields().is_empty()); + } + + #[test] + fn constant_scan_surfaces_bad_payload() { + let planner = ConstantScanPlanner; + let bad = b"not a valid proto"; + let err = planner.plan(bad, vec![]).expect_err("garbage should fail decode"); + match err { + ContribError::BadPayload(_) => {} // expected + other => panic!("expected BadPayload, got {other:?}"), + } + } } diff --git a/contrib/example/native/src/proto/example_op.proto b/contrib/example/native/src/proto/example_op.proto new file mode 100644 index 0000000000..59ae4ca761 --- /dev/null +++ b/contrib/example/native/src/proto/example_op.proto @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +syntax = "proto3"; + +// Contrib-private proto package. Each contrib's proto messages live under their own +// package so symbols never collide with core or with other contribs. +package comet.contrib.example; + +// Trivial reference message used by the worked-example contrib. A real contrib's proto +// carries whatever fields its native operator needs (file paths, predicates, schemas, +// deletion vectors, etc.). +// +// The contrib's Scala side fills this message and serializes it into the +// `ContribOp.payload` bytes; the contrib's Rust side decodes the bytes back into this +// struct in its `ContribOperatorPlanner::plan`. +message ExampleConstantScan { + // Number of rows the synthetic constant scan should emit. Bounded by the contrib's + // planner -- this is a test reference, not a useful operator. + uint32 row_count = 1; +} diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index a35601961e..cc3741ab4c 100644 --- a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -54,10 +54,33 @@ writes into the proto. | Trait / Object | Purpose | |---|---| -| `CometScanRuleExtension` | Intercept scan-tree transformation. 
Override `matchesV1` / `transformV1` for V1 `FileSourceScanExec`; `matchesV2` / `transformV2` for V2 `BatchScanExec`. The first matching extension wins, returning `None` falls back to core's existing file-format dispatch. | +| `CometScanRuleExtension` | Intercept scan-tree transformation. Override `preTransform` for tree-level rewrites (e.g., undoing your format's own Catalyst strategy); `matchesV1` / `transformV1` for V1 `FileSourceScanExec`; `matchesV2` / `transformV2` for V2 `BatchScanExec`. For per-scan dispatch the first matching extension wins; returning `None` falls back to core's existing file-format dispatch. | | `CometOperatorSerdeExtension` | Contribute additional `SparkPlan` class → `CometOperatorSerde` mappings to `CometExecRule`. Used when the contrib has its own physical operator (e.g. a contrib-specific scan exec) that needs native serialization. | | `CometExtensionRegistry` | Process-wide singleton. `load()` is called once during `CometSparkSessionExtensions.apply`; subsequent calls are no-ops. Test-only `resetForTesting()` for unit tests that need a clean registry. | +### Convention: define your own SparkPlan subclass for serde dispatch + +`CometExecRule` dispatches by **class identity** (`op.getClass`) when matching an +operator to its serde. Contribs that need a custom executor (e.g., a contrib-specific +scan exec carrying contrib-private state) should define a dedicated subclass: + +```scala +case class CometMyFormatScanExec(...) extends CometScanExec(..., SCAN_NATIVE_DELTA_COMPAT) +``` + +and register the serde keyed on the new class: + +```scala +class MyFormatSerdeExtension extends CometOperatorSerdeExtension { + override def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] = + Map(classOf[CometMyFormatScanExec] -> CometMyFormatScanSerde) +} +``` + +Avoid relying on the legacy `scanImpl: String` tag pattern on a generic `CometScanExec`; +that approach has no analogue in the SPI's class-based dispatch and would require core +changes to support. 
+ ### Native side: `comet-contrib-spi` crate | Item | Purpose | @@ -85,9 +108,23 @@ contrib// Suite.scala ← integration test native/ Cargo.toml ← rlib crate, workspace = "../../../native" + build.rs ← runs prost-build over your proto schema src/lib.rs ← ContribOperatorPlanner impl + #[ctor] registration + src/proto/.proto ← contrib-private proto schema, your own package + src/generated/ ← (gitignored) prost-build output ``` +### Proto layer + +Each contrib carries its own `.proto` schema defining the message its `ContribOp.payload` +carries. The Scala side serializes that message and sets it on the operator proto's +`contrib_op` envelope; the Rust side `prost::Message::decode`s the same bytes back. +`contrib/example/`'s `ExampleConstantScan { row_count }` is the trivial reference. + +Use your own proto **package name** (e.g., `comet.contrib.`) so symbols never +collide with core or with other contribs. Add `contrib//native/src/generated/` to +the repository `.gitignore` (the build script writes generated `.rs` there each compile). + Plus three edits to existing files: - **Root `pom.xml`** — add `contrib/` so `mvn install` builds the diff --git a/native/Cargo.lock b/native/Cargo.lock index f13c22f1a9..ddd39c7ab0 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1493,6 +1493,8 @@ dependencies = [ "ctor 0.4.3", "datafusion", "log", + "prost", + "prost-build", ] [[package]] diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index b5c70b7451..20410faa0e 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -115,7 +115,16 @@ case class CometScanRule(session: SparkSession) metadataTableSuffix.exists(suffix => scanExec.table.name().endsWith(suffix)) } - val fullPlan = plan + // Contrib SPI tree-level pre-pass. 
Each registered extension gets a chance to rewrite + // the whole plan tree before per-scan dispatch begins. Used by contribs that need to + // undo wrapper rewrites from their own Catalyst strategies (Delta's + // `PreprocessTableWithDVs` is the canonical case). Fold in registration order so + // contribs see each other's outputs deterministically. Extensions that don't override + // `preTransform` inherit the trait's identity default -- zero overhead. + val prepped = CometExtensionRegistry.scanExtensions + .foldLeft(plan)((p, ext) => ext.preTransform(p, session)) + + val fullPlan = prepped def transformScan(scanNode: SparkPlan): SparkPlan = scanNode match { // Tagged by CometSpark34AqeDppFallbackRule on Spark < 3.5 to keep a peer scan @@ -142,7 +151,7 @@ case class CometScanRule(session: SparkSession) } } - plan.transform { + prepped.transform { case scan if isSupportedScanNode(scan) => transformScan(scan) } } diff --git a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala index 0b2170ad26..376607d518 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala @@ -49,6 +49,28 @@ trait CometScanRuleExtension { /** Human-readable name shown in logs and error messages. Should be unique per extension. */ def name: String + /** + * Tree-level pre-pass run once per plan before per-scan dispatch begins. Default: identity. + * + * Use this to undo wrapper rewrites that a format's own Catalyst strategy applied. The + * canonical example is Delta's `PreprocessTableWithDVs` strategy, which wraps every + * DV-bearing Delta scan in a `Project(Filter(...))` subtree referencing a synthetic + * `__delta_internal_is_row_deleted` column produced by Delta's own reader. 
Comet reads via + * its own parquet path; without unwrapping that subtree, the synthetic column never gets + * produced and the downstream `Filter` silently drops every row. The Delta contrib's + * `preTransform` strips the wrapper so the clean scan reaches per-scan dispatch. + * + * Implementations MUST NOT modify scans they don't recognise. Multiple registered + * extensions are folded over the plan in registration order; an extension that rewrites + * scans outside its format's domain will silently corrupt other formats' plans. + * + * Shared state between this pre-pass and later `transformV1` / `transformV2` calls is the + * contrib's problem. The recommended pattern is to attach a Spark `TreeNodeTag` to nodes + * during `preTransform` and read it during `transformV1`. Spark's tag mechanism is + * tree-immutable-safe and survives plan transformations. + */ + def preTransform(plan: SparkPlan, session: SparkSession): SparkPlan = plan + /** * Whether this extension wants to handle the given V1 scan. Implementations should make a cheap * decision here (typically file-format class-name probe) so non-matching paths add no per-scan From 14e494483df9aa59dbba83903d3340f027d3ae6c Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 10:10:19 -0400 Subject: [PATCH 10/27] feat(contrib): ContribPlannerContext + ParquetDatasourceParams (SPI gap #4) Extends the contrib SPI so file-scan contribs can build a parquet scan through core without depending on core. Adds to comet-contrib-spi: * ContribPlannerContext trait -- contribs receive a &dyn impl in their plan() call. Methods: session_ctx, build_physical_expr (Catalyst Expr proto -> PhysicalExpr), convert_spark_schema, prepare_object_store, build_parquet_datasource_exec. * ParquetDatasourceParams struct -- 15-field argument bundle mirroring core's init_datasource_exec one-to-one. * ContribOperatorPlanner::plan now takes &dyn ContribPlannerContext as its first argument. 
Core implements the trait via CorePlannerContext, a thin adapter that borrows &PhysicalPlanner. Dispatcher constructs one per ContribOp arm. Updates the example contrib to take and ignore the new ctx param; tests now use a TestCtx with unimplemented panics for unused trait methods. Surfaced and validated by attempting to host the full Delta dispatcher (~150 lines from delta-kernel-phase-1's OpStruct::DeltaScan arm) on the SPI -- branch contrib-delta-port carries that work. The validation port compiled clean, linked into core's cdylib, and exercised every trait method end-to-end (column-mapping rewrites, DV filter wrapping, schema conversion, expression-planner round-trip, parquet exec construction). Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/native/Cargo.toml | 4 + contrib/example/native/src/lib.rs | 70 +- native/Cargo.lock | 746 +++++++++++++++---- native/contrib-spi/Cargo.toml | 4 +- native/contrib-spi/src/lib.rs | 122 ++- native/core/src/execution/planner.rs | 7 +- native/core/src/execution/planner/contrib.rs | 98 ++- 7 files changed, 872 insertions(+), 179 deletions(-) diff --git a/contrib/example/native/Cargo.toml b/contrib/example/native/Cargo.toml index 08acde3480..e9b23e2ff0 100644 --- a/contrib/example/native/Cargo.toml +++ b/contrib/example/native/Cargo.toml @@ -42,6 +42,10 @@ crate-type = ["rlib"] # depends back on core from a contrib. comet-contrib-spi = { path = "../../../native/contrib-spi" } datafusion = { workspace = true } +# Used only in unit tests to construct a TestCtx that implements ContribPlannerContext; +# kept in [dependencies] (not [dev-dependencies]) because the trait's typed methods take +# spark_expression / spark_operator proto refs and the impl module is not test-gated. 
+datafusion-comet-proto = { workspace = true } prost = "0.14.3" ctor = "0.4" log = "0.4" diff --git a/contrib/example/native/src/lib.rs b/contrib/example/native/src/lib.rs index 7d076a8e0d..46a0fd4246 100644 --- a/contrib/example/native/src/lib.rs +++ b/contrib/example/native/src/lib.rs @@ -44,7 +44,9 @@ use std::sync::Arc; -use comet_contrib_spi::{register_contrib_planner, ContribError, ContribOperatorPlanner}; +use comet_contrib_spi::{ + register_contrib_planner, ContribError, ContribOperatorPlanner, ContribPlannerContext, +}; use datafusion::arrow::datatypes::Schema; use datafusion::physical_plan::empty::EmptyExec; use datafusion::physical_plan::ExecutionPlan; @@ -71,6 +73,7 @@ struct NoOpPlanner; impl ContribOperatorPlanner for NoOpPlanner { fn plan( &self, + _ctx: &dyn ContribPlannerContext, _payload: &[u8], _children: Vec>, ) -> Result, ContribError> { @@ -89,6 +92,7 @@ struct ConstantScanPlanner; impl ContribOperatorPlanner for ConstantScanPlanner { fn plan( &self, + _ctx: &dyn ContribPlannerContext, payload: &[u8], _children: Vec>, ) -> Result, ContribError> { @@ -125,7 +129,57 @@ fn register() { #[cfg(test)] mod tests { use super::*; - use comet_contrib_spi::lookup_contrib_planner_by_kind; + use comet_contrib_spi::{lookup_contrib_planner_by_kind, ParquetDatasourceParams}; + use datafusion::arrow::datatypes::SchemaRef; + use datafusion::execution::context::SessionContext; + use datafusion::execution::object_store::ObjectStoreUrl; + use datafusion::physical_expr::PhysicalExpr; + use datafusion_comet_proto::{spark_expression, spark_operator}; + use std::collections::HashMap; + + /// Minimal `ContribPlannerContext` for unit-testing contrib planners that don't + /// actually need to build a parquet exec. All methods that the tests don't exercise + /// panic if invoked. 
+ struct TestCtx { + ctx: Arc, + } + impl ContribPlannerContext for TestCtx { + fn session_ctx(&self) -> &Arc { + &self.ctx + } + fn build_physical_expr( + &self, + _expr: &spark_expression::Expr, + _input_schema: SchemaRef, + ) -> Result, ContribError> { + unimplemented!("TestCtx: build_physical_expr not used by this test") + } + fn convert_spark_schema( + &self, + _fields: &[spark_operator::SparkStructField], + ) -> SchemaRef { + unimplemented!("TestCtx: convert_spark_schema not used by this test") + } + fn prepare_object_store( + &self, + _url: String, + _configs: &HashMap, + ) -> Result { + unimplemented!("TestCtx: prepare_object_store not used by this test") + } + fn build_parquet_datasource_exec( + &self, + _params: ParquetDatasourceParams<'_>, + ) -> Result, ContribError> { + unimplemented!("TestCtx: build_parquet_datasource_exec not used by this test") + } + } + + fn test_ctx() -> TestCtx { + TestCtx { + ctx: Arc::new(SessionContext::new()), + } + } #[test] fn ctor_registers_both_planners() { @@ -138,19 +192,21 @@ mod tests { fn constant_scan_decodes_payload_and_builds() { let payload = proto::ExampleConstantScan { row_count: 42 }.encode_to_vec(); let planner = ConstantScanPlanner; - let plan = planner.plan(&payload, vec![]).expect("decode + build"); - // We don't care about the concrete exec type beyond "it built something"; - // confirms the decode path works end-to-end. 
+ let ctx = test_ctx(); + let plan = planner.plan(&ctx, &payload, vec![]).expect("decode + build"); assert!(plan.schema().fields().is_empty()); } #[test] fn constant_scan_surfaces_bad_payload() { let planner = ConstantScanPlanner; + let ctx = test_ctx(); let bad = b"not a valid proto"; - let err = planner.plan(bad, vec![]).expect_err("garbage should fail decode"); + let err = planner + .plan(&ctx, bad, vec![]) + .expect_err("garbage should fail decode"); match err { - ContribError::BadPayload(_) => {} // expected + ContribError::BadPayload(_) => {} other => panic!("expected BadPayload, got {other:?}"), } } diff --git a/native/Cargo.lock b/native/Cargo.lock index ddd39c7ab0..6e4ec5e6f7 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -228,25 +228,60 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd47f2a6ddc39244bd722a27ee5da66c03369d087b9e024eafdb03e98b98ea7" +dependencies = [ + "arrow-arith 57.3.1", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 57.3.1", + "arrow-csv 57.3.1", + "arrow-data 57.3.1", + "arrow-ipc 57.3.1", + "arrow-json 57.3.1", + "arrow-ord 57.3.1", + "arrow-row 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "arrow-string 57.3.1", +] + [[package]] name = "arrow" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "607e64bb911ee4f90483e044fe78f175989148c2892e659a2cd25429e782ec54" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.2.0", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-cast 58.2.0", + "arrow-csv 58.2.0", + 
"arrow-data 58.2.0", + "arrow-ipc 58.2.0", + "arrow-json 58.2.0", + "arrow-ord 58.2.0", + "arrow-row 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", + "arrow-string 58.2.0", +] + +[[package]] +name = "arrow-arith" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c7bbd679c5418b8639b92be01f361d60013c4906574b578b77b63c78356594c" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "chrono", + "num-traits", ] [[package]] @@ -255,14 +290,33 @@ version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e754319ed8a85d817fe7adf183227e0b5308b82790a737b426c1124626b48118" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", "chrono", "num-traits", ] +[[package]] +name = "arrow-array" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8a4ab47b3f3eac60f7fd31b81e9028fda018607bcc63451aca4f2b755269862" +dependencies = [ + "ahash", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-array" version = "58.2.0" @@ -270,9 +324,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841321891f247aa86c6112c80d83d89cb36e0addd020fa2425085b8eb6c3f579" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", "chrono", "chrono-tz", "half", @@ -282,6 +336,18 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-buffer" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0d18b89b4c4f4811d0858175e79541fe98e33e18db3b011708bc287b1240593f" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + [[package]] name = "arrow-buffer" version = "58.2.0" @@ -294,18 +360,40 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "722b5c41dd1d14d0a879a1bce92c6fe33f546101bb2acce57a209825edd075b3" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-ord 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + [[package]] name = "arrow-cast" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca5e686972523798f76bef355145bc1ae25a84c731e650268d31ab763c701663" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-ord 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", "atoi", "base64", "chrono", @@ -316,47 +404,113 @@ dependencies = [ "ryu", ] +[[package]] +name = "arrow-csv" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27ddb80a4848e03b1655af496d5ac2563a779e5742fcb48f2ca2e089c9cd2197" +dependencies = [ + "arrow-array 57.3.1", + "arrow-cast 57.3.1", + "arrow-schema 57.3.1", + "chrono", + "csv", + "csv-core", + "regex", +] + [[package]] name = "arrow-csv" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86c276756867fc8186ec380c72c290e6e3b23a1d4fb05df6b1d62d2e62666d48" dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", + "arrow-array 58.2.0", + "arrow-cast 58.2.0", + "arrow-schema 58.2.0", "chrono", "csv", "csv-core", "regex", ] +[[package]] +name = 
"arrow-data" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1683705c63dcf0d18972759eda48489028cbbff67af7d6bef2c6b7b74ab778a" +dependencies = [ + "arrow-buffer 57.3.1", + "arrow-schema 57.3.1", + "half", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-data" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db3b5846209775b6dc8056d77ff9a032b27043383dd5488abd0b663e265b9373" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.2.0", + "arrow-schema 58.2.0", "half", "num-integer", "num-traits", ] +[[package]] +name = "arrow-ipc" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cf72d04c07229fbf4dbebe7145cac37d7cf7ec582fe705c6b92cb314af096ab" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "flatbuffers", +] + [[package]] name = "arrow-ipc" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd8907ddd8f9fbabf91ec2c85c1d81fe2874e336d2443eb36373595e28b98dd5" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", "flatbuffers", - "lz4_flex", + "lz4_flex 0.13.0", +] + +[[package]] +name = "arrow-json" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a84a905f41fedfcd7679813c89a61dc369c0f932b27aa8dcc6aa051cc781a97d" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "chrono", + "half", + "indexmap 2.14.0", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", ] [[package]] @@ -365,12 +519,12 @@ version 
= "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4518c59acc501f10d7dcae397fe12b8db3d81bc7de94456f8a58f9165d6f502" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-cast 58.2.0", + "arrow-ord 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", "chrono", "half", "indexmap 2.14.0", @@ -384,17 +538,43 @@ dependencies = [ "simdutf8", ] +[[package]] +name = "arrow-ord" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082342947d4e5a2bcccf029a0a0397e21cb3bb8421edd9571d34fb5dd2670256" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", +] + [[package]] name = "arrow-ord" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efa70d9d6b1356f1fb9f1f651b84a725b7e0abb93f188cf7d31f14abfa2f2e6f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", +] + +[[package]] +name = "arrow-row" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a931b520a2a5e22033e01a6f2486b4cdc26f9106b759abeebc320f125e94d7" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "half", ] [[package]] @@ -403,13 +583,22 @@ version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faec88a945338192beffbbd4be0def70135422930caa244ac3cec0cd213b26b4" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", "half", ] 
+[[package]] +name = "arrow-schema" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4cf0d4a6609679e03002167a61074a21d7b1ad9ea65e462b2c0a97f8a3b2bc6" +dependencies = [ + "bitflags 2.11.1", +] + [[package]] name = "arrow-schema" version = "58.2.0" @@ -421,6 +610,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "arrow-select" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b320d86a9806923663bb0fd9baa65ecaba81cb0cd77ff8c1768b9716b4ef891" +dependencies = [ + "ahash", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "num-traits", +] + [[package]] name = "arrow-select" version = "58.2.0" @@ -428,24 +631,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a657ab5132e9c8ca3b24eb15a823d0ced38017fe3930ff50167466b02e2d592c" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", "num-traits", ] +[[package]] +name = "arrow-string" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b493e99162e5764077e7823e50ba284858d365922631c7aaefe9487b1abd02c2" +dependencies = [ + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-data 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + [[package]] name = "arrow-string" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6de2efbbd1a9f9780ceb8d1ff5d20421b35863b361e3386b4f571f1fc69fcb8" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", "memchr", "num-traits", "regex", @@ -1485,6 
+1705,34 @@ dependencies = [ "memchr", ] +[[package]] +name = "comet-contrib-delta" +version = "0.17.0" +dependencies = [ + "arrow 58.2.0", + "chrono", + "chrono-tz", + "comet-contrib-spi", + "ctor 0.4.3", + "datafusion", + "datafusion-comet-jni-bridge", + "datafusion-comet-proto", + "delta_kernel", + "futures", + "jni 0.22.4", + "log", + "object_store 0.12.5", + "object_store 0.13.2", + "parquet 58.1.0", + "prost", + "prost-build", + "roaring 0.10.12", + "tempfile", + "thiserror 2.0.18", + "tokio", + "url", +] + [[package]] name = "comet-contrib-example" version = "0.17.0" @@ -1492,6 +1740,7 @@ dependencies = [ "comet-contrib-spi", "ctor 0.4.3", "datafusion", + "datafusion-comet-proto", "log", "prost", "prost-build", @@ -1502,6 +1751,7 @@ name = "comet-contrib-spi" version = "0.17.0" dependencies = [ "datafusion", + "datafusion-comet-proto", "log", ] @@ -1511,6 +1761,7 @@ version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ + "crossterm", "unicode-segmentation", "unicode-width", ] @@ -1626,6 +1877,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" + [[package]] name = "crc32c" version = "0.6.8" @@ -1713,6 +1979,29 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +dependencies = [ + "bitflags 2.11.1", + "crossterm_winapi", + "document-features", + "parking_lot", + "rustix 1.1.4", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.4" @@ -1899,8 +2188,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.2.0", + "arrow-schema 58.2.0", "async-trait", "bytes", "chrono", @@ -1932,9 +2221,9 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.1.0", "rand 0.9.4", "regex", "sqlparser", @@ -1950,7 +2239,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "dashmap", "datafusion-common", @@ -1964,7 +2253,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.13.2", "parking_lot", "tokio", ] @@ -1975,7 +2264,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "datafusion-catalog", "datafusion-common", @@ -1989,19 +2278,20 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.13.2", ] [[package]] name = "datafusion-comet" version = "0.17.0" dependencies = [ - "arrow", + "arrow 58.2.0", "assertables", "async-trait", "aws-config", 
"aws-credential-types", "bytes", + "comet-contrib-delta", "comet-contrib-example", "comet-contrib-spi", "criterion", @@ -2029,12 +2319,12 @@ dependencies = [ "log4rs", "mimalloc", "num", - "object_store", + "object_store 0.13.2", "object_store_opendal", "once_cell", "opendal 0.56.0", "parking_lot", - "parquet", + "parquet 58.1.0", "paste", "pprof", "procfs", @@ -2054,7 +2344,7 @@ dependencies = [ name = "datafusion-comet-common" version = "0.17.0" dependencies = [ - "arrow", + "arrow 58.2.0", "datafusion", "serde", "serde_json", @@ -2080,14 +2370,14 @@ dependencies = [ name = "datafusion-comet-jni-bridge" version = "0.17.0" dependencies = [ - "arrow", + "arrow 58.2.0", "assertables", "datafusion", "datafusion-comet-common", "jni 0.22.4", "lazy_static", "once_cell", - "parquet", + "parquet 58.1.0", "paste", "prost", "regex", @@ -2104,7 +2394,7 @@ dependencies = [ "datafusion-comet-fs-hdfs3", "fs-hdfs3", "futures", - "object_store", + "object_store 0.13.2", "tokio", ] @@ -2120,7 +2410,7 @@ dependencies = [ name = "datafusion-comet-shuffle" version = "0.17.0" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "bytes", "clap", @@ -2135,8 +2425,8 @@ dependencies = [ "itertools 0.14.0", "jni 0.21.1", "log", - "lz4_flex", - "parquet", + "lz4_flex 0.13.0", + "parquet 58.1.0", "simd-adler32", "snap", "tempfile", @@ -2148,7 +2438,7 @@ dependencies = [ name = "datafusion-comet-spark-expr" version = "0.17.0" dependencies = [ - "arrow", + "arrow 58.2.0", "base64", "chrono", "chrono-tz", @@ -2174,8 +2464,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", - "arrow", - "arrow-ipc", + "arrow 58.2.0", + "arrow-ipc 58.2.0", "chrono", "half", "hashbrown 0.16.1", @@ -2184,8 +2474,8 @@ dependencies = [ "itertools 0.14.0", "libc", "log", - "object_store", - "parquet", + "object_store 0.13.2", + "parquet 58.1.0", "paste", "sqlparser", "tokio", @@ -2209,7 
+2499,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow", + "arrow 58.2.0", "async-compression", "async-trait", "bytes", @@ -2230,7 +2520,7 @@ dependencies = [ "itertools 0.14.0", "liblzma", "log", - "object_store", + "object_store 0.13.2", "rand 0.9.4", "tokio", "tokio-util", @@ -2244,8 +2534,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow", - "arrow-ipc", + "arrow 58.2.0", + "arrow-ipc 58.2.0", "async-trait", "bytes", "datafusion-common", @@ -2258,7 +2548,7 @@ dependencies = [ "datafusion-session", "futures", "itertools 0.14.0", - "object_store", + "object_store 0.13.2", "tokio", ] @@ -2268,7 +2558,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "bytes", "datafusion-common", @@ -2280,7 +2570,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store", + "object_store 0.13.2", "regex", "tokio", ] @@ -2291,7 +2581,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "bytes", "datafusion-common", @@ -2303,7 +2593,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store", + "object_store 0.13.2", "serde_json", "tokio", "tokio-stream", @@ -2315,7 +2605,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ - "arrow", + "arrow 
58.2.0", "async-trait", "bytes", "datafusion-common", @@ -2333,9 +2623,9 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store", + "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.1.0", "tokio", ] @@ -2351,8 +2641,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.2.0", + "arrow-buffer 58.2.0", "async-trait", "chrono", "dashmap", @@ -2361,9 +2651,9 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "log", - "object_store", + "object_store 0.13.2", "parking_lot", - "parquet", + "parquet 58.1.0", "rand 0.9.4", "tempfile", "url", @@ -2375,7 +2665,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "chrono", "datafusion-common", @@ -2397,7 +2687,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow", + "arrow 58.2.0", "datafusion-common", "indexmap 2.14.0", "itertools 0.14.0", @@ -2410,8 +2700,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow", - "arrow-buffer", + "arrow 58.2.0", + "arrow-buffer 58.2.0", "base64", "blake2", "blake3", @@ -2443,7 +2733,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2465,7 +2755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", @@ -2477,8 +2767,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow", - "arrow-ord", + "arrow 58.2.0", + "arrow-ord 58.2.0", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2502,7 +2792,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow", + "arrow 58.2.0", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2518,7 +2808,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2557,7 +2847,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow", + "arrow 58.2.0", "chrono", "datafusion-common", "datafusion-expr", @@ -2577,7 +2867,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -2599,7 +2889,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-expr", "datafusion-functions", @@ -2615,7 +2905,7 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", - "arrow", + "arrow 58.2.0", "chrono", "datafusion-common", "datafusion-expr-common", @@ -2631,7 +2921,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -2650,9 +2940,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", - "arrow", - "arrow-ord", - "arrow-schema", + "arrow 58.2.0", + "arrow-ord 58.2.0", + "arrow-schema 58.2.0", "async-trait", "datafusion-common", "datafusion-common-runtime", @@ -2681,7 +2971,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow", + "arrow 58.2.0", "datafusion-common", "datafusion-datasource", "datafusion-expr-common", @@ -2712,7 +3002,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" dependencies = [ - "arrow", + "arrow 58.2.0", "bigdecimal", "chrono", "crc32fast", @@ -2739,7 +3029,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow", + "arrow 58.2.0", "bigdecimal", "chrono", "datafusion-common", @@ -2760,6 +3050,48 @@ dependencies = [ "uuid", ] +[[package]] +name = "delta_kernel" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"06f7fc164b1557731fcc68a198e813811a000efade0f112d4f0a002e65042b83" +dependencies = [ + "arrow 57.3.1", + "bytes", + "chrono", + "comfy-table", + "crc", + "delta_kernel_derive", + "futures", + "indexmap 2.14.0", + "itertools 0.14.0", + "object_store 0.12.5", + "parquet 57.3.1", + "reqwest 0.12.28", + "roaring 0.11.3", + "rustc_version", + "serde", + "serde_json", + "strum", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "uuid", + "z85", +] + +[[package]] +name = "delta_kernel_derive" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86815a2c475835751ffa9b8d9ac8ed86cf86294304c42bedd1103d54f25ecbfe" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "der" version = "0.7.10" @@ -2890,6 +3222,15 @@ dependencies = [ "const-random", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "dtor" version = "0.0.6" @@ -3626,14 +3967,14 @@ dependencies = [ "anyhow", "apache-avro", "array-init", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 58.2.0", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-cast 58.2.0", + "arrow-ord 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", + "arrow-string 58.2.0", "as-any", "async-trait", "backon", @@ -3652,10 +3993,10 @@ dependencies = [ "murmur3", "once_cell", "ordered-float 4.6.0", - "parquet", + "parquet 58.1.0", "rand 0.9.4", "reqwest 0.12.28", - "roaring", + "roaring 0.11.3", "serde", "serde_bytes", "serde_derive", @@ -4256,6 +4597,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" +[[package]] +name = 
"litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.14" @@ -4316,6 +4663,15 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lz4_flex" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90071f8077f8e40adfc4b7fe9cd495ce316263f19e75c2211eeff3fdf475a3d9" +dependencies = [ + "twox-hash", +] + [[package]] name = "lz4_flex" version = "0.13.0" @@ -4570,6 +4926,44 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "form_urlencoded", + "futures", + "http 1.4.0", + "http-body-util", + "httparse", + "humantime", + "hyper", + "itertools 0.14.0", + "md-5", + "parking_lot", + "percent-encoding", + "quick-xml 0.38.4", + "rand 0.9.4", + "reqwest 0.12.28", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "object_store" version = "0.13.2" @@ -4621,7 +5015,7 @@ dependencies = [ "chrono", "futures", "mea", - "object_store", + "object_store 0.13.2", "opendal 0.56.0", "pin-project", "tokio", @@ -4850,6 +5244,43 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "57.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e832c6aa20310fc6de7ea5a3f4e20d34fd83e3b43229d32b81ffe5c14d74692" +dependencies = [ + "ahash", + "arrow-array 57.3.1", + "arrow-buffer 57.3.1", + "arrow-cast 
57.3.1", + "arrow-data 57.3.1", + "arrow-ipc 57.3.1", + "arrow-schema 57.3.1", + "arrow-select 57.3.1", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.12.2", + "num-bigint", + "num-integer", + "num-traits", + "object_store 0.12.5", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + [[package]] name = "parquet" version = "58.1.0" @@ -4857,12 +5288,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 58.2.0", + "arrow-buffer 58.2.0", + "arrow-data 58.2.0", + "arrow-ipc 58.2.0", + "arrow-schema 58.2.0", + "arrow-select 58.2.0", "base64", "brotli", "bytes", @@ -4871,11 +5302,11 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex", + "lz4_flex 0.13.0", "num-bigint", "num-integer", "num-traits", - "object_store", + "object_store 0.13.2", "parquet-variant", "parquet-variant-compute", "parquet-variant-json", @@ -4896,7 +5327,7 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf493f3c9ddd984d0efb019f67343e4aa4bab893931f6a14b82083065dc3d28" dependencies = [ - "arrow-schema", + "arrow-schema 58.2.0", "chrono", "half", "indexmap 2.14.0", @@ -4910,8 +5341,8 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac038d46a503a7d563b4f5df5802c4315d5343d009feab195d15ac512b4cb27" dependencies = [ - "arrow", - "arrow-schema", + "arrow 58.2.0", + "arrow-schema 58.2.0", "chrono", "half", "indexmap 2.14.0", @@ -4927,7 +5358,7 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "015a09c2ffe5108766c7c1235c307b8a3c2ea64eca38455ba1a7f3a7f32f16e2" dependencies = [ - 
"arrow-schema", + "arrow-schema 58.2.0", "base64", "chrono", "parquet-variant", @@ -5748,6 +6179,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "roaring" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" +dependencies = [ + "bytemuck", + "byteorder", +] + [[package]] name = "roaring" version = "0.11.3" @@ -5869,6 +6310,15 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "rustls-pki-types" version = "1.14.0" @@ -6756,6 +7206,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -6954,6 +7405,7 @@ checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", + "rand 0.10.1", "serde_core", "wasm-bindgen", ] @@ -7628,6 +8080,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "z85" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6e61e59a957b7ccee15d2049f86e8bfd6f66968fcd88f018950662d9b86e675" + [[package]] name = "zerocopy" version = "0.8.48" diff --git a/native/contrib-spi/Cargo.toml b/native/contrib-spi/Cargo.toml index eea4855cd8..08e1b2662a 100644 --- a/native/contrib-spi/Cargo.toml +++ b/native/contrib-spi/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "comet-contrib-spi" -description = "Stable SPI surface that contrib crates and Comet's core both depend on. Defines the ContribOperatorPlanner trait, the process-wide registry, and the lightweight error type. 
Separating this from the core crate breaks what would otherwise be a cyclic dependency (core links contribs via Cargo feature flags; contribs need core types)." +description = "Stable SPI surface that contrib crates and Comet's core both depend on. Defines the ContribOperatorPlanner trait, ContribPlannerContext, the process-wide registry, and the lightweight error type. Separating this from the core crate breaks what would otherwise be a cyclic dependency (core links contribs via Cargo feature flags; contribs need core types)." version = { workspace = true } homepage = { workspace = true } repository = { workspace = true } @@ -26,5 +26,7 @@ license = { workspace = true } edition = { workspace = true } [dependencies] +# Public types in the SPI reference these crates. Pinning matches core via workspace. datafusion = { workspace = true } +datafusion-comet-proto = { workspace = true } log = "0.4" diff --git a/native/contrib-spi/src/lib.rs b/native/contrib-spi/src/lib.rs index 89b6471054..f40f7bbe2c 100644 --- a/native/contrib-spi/src/lib.rs +++ b/native/contrib-spi/src/lib.rs @@ -18,28 +18,45 @@ //! Thin SPI crate shared between Comet's core and every contrib crate. //! //! Both core (`datafusion-comet`) and individual contribs (`comet-contrib-example`, -//! eventually `comet-contrib-delta`) depend on THIS crate, NOT on each other. This avoids -//! a cyclic dependency: core wires contribs in via Cargo feature flags, and contribs need +//! `comet-contrib-delta`, ...) depend on THIS crate, NOT on each other. This avoids a +//! cyclic dependency: core wires contribs in via Cargo feature flags, and contribs need //! the SPI types to implement the trait. With the SPI in a third crate, the dependency //! graph is a DAG. //! //! Surface: -//! * [`ContribOperatorPlanner`] — the trait contribs implement. -//! * [`register_contrib_planner`] / [`lookup_contrib_planner_by_kind`] — -//! process-wide registry, expected to be populated from a contrib's `#[ctor]`. -//! 
* [`registered_contrib_kinds`] — diagnostics. +//! * [`ContribOperatorPlanner`] -- the trait contribs implement. +//! * [`ContribPlannerContext`] -- the trait core implements; gives contribs access +//! to the parquet exec builder, expression planner, +//! object-store registration, and session context. +//! * [`ParquetDatasourceParams`] -- argument bundle for the parquet exec builder. +//! * [`register_contrib_planner`] / [`lookup_contrib_planner_by_kind`] -- +//! process-wide registry, expected to be populated +//! from a contrib's `#[ctor]`. +//! * [`registered_contrib_kinds`] -- diagnostics. use std::{ collections::HashMap, sync::{Arc, OnceLock, RwLock}, }; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::{ + arrow::datatypes::SchemaRef, + common::ScalarValue, + datasource::listing::PartitionedFile, + execution::{context::SessionContext, object_store::ObjectStoreUrl}, + physical_expr::PhysicalExpr, + physical_plan::{expressions::Column, ExecutionPlan}, +}; +use datafusion_comet_proto::{spark_expression, spark_operator}; /// Implemented by each contrib. Called from core's planner when an `OpStruct::ContribOp` /// with the contrib's `kind` is encountered. /// /// The contract is intentionally minimal: +/// * `ctx` is a handle to core-side planner services (parquet exec builder, +/// expression planner, object-store registration, session context). Contribs reach +/// into core through this trait rather than depending on core directly, which keeps +/// the dependency graph acyclic. /// * `payload` is the raw bytes from `ContribOp.payload`. The contrib decodes it into /// whatever proto / serde format it uses internally; core never inspects. /// * `children` is the list of already-built native children (in spark-plan child @@ -48,19 +65,96 @@ use datafusion::physical_plan::ExecutionPlan; /// * The returned `Arc<dyn ExecutionPlan>` is the contrib's operator. Core wraps it into a `SparkPlan` and threads it through the rest of the plan tree.
/// -/// Implementations MUST be `Send + Sync` and idempotent — the same `(payload, children)` +/// Implementations MUST be `Send + Sync` and idempotent -- the same `(payload, children)` /// must always produce a functionally equivalent plan, so core can cache or re-plan. pub trait ContribOperatorPlanner: Send + Sync { fn plan( &self, + ctx: &dyn ContribPlannerContext, payload: &[u8], children: Vec>, ) -> Result, ContribError>; } -/// Error type returned by [`ContribOperatorPlanner::plan`]. Kept distinct from core's -/// `ExecutionError` so this crate stays free of core's dependency tree. Core converts -/// `ContribError` into its own `ExecutionError` at the dispatch site. +/// Argument bundle for [`ContribPlannerContext::build_parquet_datasource_exec`]. Mirrors +/// core's internal `init_datasource_exec` signature one-to-one, so the trait method is a +/// thin forward. +/// +/// Held by value rather than `&self`/builder pattern because contribs build it once per +/// plan call -- the verbose layout is easier to read at the call site than a builder +/// would be. +pub struct ParquetDatasourceParams<'a> { + pub required_schema: SchemaRef, + pub data_schema: Option, + pub partition_schema: Option, + pub object_store_url: ObjectStoreUrl, + pub file_groups: Vec>, + pub projection_vector: Option>, + pub data_filters: Option>>, + pub default_values: Option>, + pub session_timezone: &'a str, + pub case_sensitive: bool, + pub return_null_struct_if_all_fields_missing: bool, + pub encryption_enabled: bool, + pub use_field_id: bool, + pub ignore_missing_field_id: bool, +} + +/// Planner services exposed by core to contribs. Core implements this trait against its +/// `PhysicalPlanner` + `SessionContext`; contribs receive a `&dyn ContribPlannerContext` +/// in their [`ContribOperatorPlanner::plan`] call and reach into core through it. 
+/// +/// Trait methods that can fail at runtime return `ContribError`, so contribs can +/// propagate failures without converting between error types; `session_ctx` and +/// `convert_spark_schema` cannot fail. +// Note: no `Send + Sync` bound -- `&dyn ContribPlannerContext` is only held for the +// duration of a synchronous `plan()` call, so it doesn't need to cross threads. The +// natural core-side impl borrows the `PhysicalPlanner` (which carries JNI handles that +// aren't `Send`), and adding the bound here would force an awkward `Arc<Mutex<_>>` +// dance for no gain. +pub trait ContribPlannerContext { + /// The session context the plan is being built under. Contribs need this to register + /// object stores on `runtime_env()` and to read session-level configs (timezone, + /// case sensitivity, etc) that aren't already on `ParquetDatasourceParams`. + fn session_ctx(&self) -> &Arc; + + /// Convert a Catalyst-side Spark expression proto into a DataFusion `PhysicalExpr` + /// against the given input schema. Used by file-scan contribs to convert data + /// filters from their proto-side `Expr` form into the typed `PhysicalExpr`s that + /// `ParquetSource` consumes. + fn build_physical_expr( + &self, + expr: &spark_expression::Expr, + input_schema: SchemaRef, + ) -> Result, ContribError>; + + /// Convert a slice of Spark struct fields (the proto representation of a Spark + /// schema) into an Arrow `SchemaRef`. This is a pure proto-to-arrow conversion -- + /// no side effects, no session state. + fn convert_spark_schema(&self, fields: &[spark_operator::SparkStructField]) -> SchemaRef; + + /// Register an object store on the runtime env for the given URL's scheme + bucket, + /// using `object_store_configs` for credentials / endpoint overrides. Returns the + /// canonical `ObjectStoreUrl` that the contrib should attach to its `PartitionedFile`s.
+ fn prepare_object_store( + &self, + any_file_url: String, + object_store_configs: &HashMap, + ) -> Result; + + /// Build a `DataSourceExec` over Comet's tuned `ParquetSource`. This is the single + /// most important method on the trait -- every file-scan contrib (Delta, Iceberg) + /// goes through here so the contrib doesn't have to rebuild Comet's parquet plumbing. + fn build_parquet_datasource_exec( + &self, + params: ParquetDatasourceParams<'_>, + ) -> Result, ContribError>; +} + +/// Error type returned by [`ContribOperatorPlanner::plan`] and the trait methods on +/// [`ContribPlannerContext`]. Kept distinct from core's `ExecutionError` so this crate +/// stays free of core's dependency tree. Core converts `ContribError` into its own +/// `ExecutionError` at the dispatch site. #[derive(Debug)] pub enum ContribError { /// Generic failure. Use this for cases that don't fit the more specific variants. @@ -139,6 +233,7 @@ pub fn registered_contrib_kinds() -> Vec { #[cfg(test)] mod tests { use super::*; + use datafusion::arrow::datatypes::Schema; use datafusion::physical_plan::empty::EmptyExec; use std::sync::Arc; @@ -146,12 +241,11 @@ mod tests { impl ContribOperatorPlanner for AlwaysEmpty { fn plan( &self, + _ctx: &dyn ContribPlannerContext, _payload: &[u8], _children: Vec>, ) -> Result, ContribError> { - Ok(Arc::new(EmptyExec::new(Arc::new( - datafusion::arrow::datatypes::Schema::empty(), - )))) + Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty())))) } } diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 445749ae4e..54d7235ef6 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1967,7 +1967,9 @@ impl PhysicalPlanner { // by the time we reach this arm the registry is already warm. Missing // registrations typically mean the JVM JAR is on the classpath but core // was built without the corresponding `contrib-` Cargo feature. 
- use crate::execution::planner::contrib::lookup_contrib_planner_by_kind; + use crate::execution::planner::contrib::{ + lookup_contrib_planner_by_kind, CorePlannerContext, + }; let kind = contrib_op.kind.as_str(); let planner = lookup_contrib_planner_by_kind(kind).ok_or_else(|| { GeneralError(format!( @@ -1992,8 +1994,9 @@ impl PhysicalPlanner { native_children.push(child_plan.native_plan.clone()); } + let ctx = CorePlannerContext { planner: self }; let exec = planner - .plan(&contrib_op.payload, native_children) + .plan(&ctx, &contrib_op.payload, native_children) .map_err(|e| GeneralError(format!("contrib planner {kind:?}: {e}")))?; Ok(( diff --git a/native/core/src/execution/planner/contrib.rs b/native/core/src/execution/planner/contrib.rs index b78d8b1d7a..834c57b0c2 100644 --- a/native/core/src/execution/planner/contrib.rs +++ b/native/core/src/execution/planner/contrib.rs @@ -15,20 +15,96 @@ // specific language governing permissions and limitations // under the License. -//! Convenience re-exports of the contrib SPI surface. +//! Re-exports + core-side `ContribPlannerContext` adapter. //! -//! The actual trait + registry live in the standalone `comet-contrib-spi` crate so both +//! The SPI trait + registry live in the standalone `comet-contrib-spi` crate so both //! core and contribs can depend on them without forming a dependency cycle (core links -//! contribs via Cargo feature flags, contribs need the SPI types). This module just -//! re-exports the surface so existing `crate::execution::planner::contrib::...` -//! imports inside core continue to resolve. - -// Re-export the parts of the SPI core itself uses (the dispatcher only needs -// `lookup_contrib_planner_by_kind`). The other helpers — `register_contrib_planner`, -// `registered_contrib_kinds`, `ContribError`, `ContribOperatorPlanner` — are exposed -// directly from the `comet_contrib_spi` crate so contribs import them from there. +//! contribs via Cargo feature flags, contribs need the SPI types). 
This module: +//! +//! 1. re-exports the parts of the SPI core itself imports, so existing +//! `crate::execution::planner::contrib::...` paths keep resolving; +//! 2. provides `CorePlannerContext`, a thin adapter that lets a `&PhysicalPlanner` be +//! passed to contribs as a `&dyn ContribPlannerContext`. + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::execution::context::SessionContext; +use datafusion::execution::object_store::ObjectStoreUrl; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_comet_proto::{spark_expression, spark_operator}; + pub use comet_contrib_spi::lookup_contrib_planner_by_kind; -#[allow(unused_imports)] // surfaced for tests + diagnostics; consumed in PR1.7 onwards +#[allow(unused_imports)] // surfaced for tests + diagnostics pub use comet_contrib_spi::{ register_contrib_planner, registered_contrib_kinds, ContribError, ContribOperatorPlanner, + ContribPlannerContext, ParquetDatasourceParams, }; + +use crate::execution::planner::PhysicalPlanner; +use crate::parquet::parquet_exec::init_datasource_exec; +use crate::parquet::parquet_support::prepare_object_store_with_configs; + +/// Adapter that exposes a `&PhysicalPlanner` (plus the session_ctx it carries) as a +/// `ContribPlannerContext`. Construction is cheap -- just borrows the planner. The +/// dispatcher creates one per ContribOp arm. 
+pub(crate) struct CorePlannerContext<'a> { + pub(crate) planner: &'a PhysicalPlanner, +} + +impl ContribPlannerContext for CorePlannerContext<'_> { + fn session_ctx(&self) -> &Arc { + self.planner.session_ctx() + } + + fn build_physical_expr( + &self, + expr: &spark_expression::Expr, + input_schema: SchemaRef, + ) -> Result, ContribError> { + self.planner + .create_expr(expr, input_schema) + .map_err(|e| ContribError::Plan(format!("create_expr: {e}"))) + } + + fn convert_spark_schema(&self, fields: &[spark_operator::SparkStructField]) -> SchemaRef { + super::convert_spark_types_to_arrow_schema(fields) + } + + fn prepare_object_store( + &self, + url: String, + configs: &HashMap, + ) -> Result { + prepare_object_store_with_configs(self.planner.session_ctx().runtime_env(), url, configs) + .map(|(url, _path)| url) + .map_err(|e| ContribError::Plan(format!("prepare_object_store_with_configs: {e}"))) + } + + fn build_parquet_datasource_exec( + &self, + params: ParquetDatasourceParams<'_>, + ) -> Result, ContribError> { + init_datasource_exec( + params.required_schema, + params.data_schema, + params.partition_schema, + params.object_store_url, + params.file_groups, + params.projection_vector, + params.data_filters, + params.default_values, + params.session_timezone, + params.case_sensitive, + params.return_null_struct_if_all_fields_missing, + self.planner.session_ctx(), + params.encryption_enabled, + params.use_field_id, + params.ignore_missing_field_id, + ) + .map(|e| e as Arc) + .map_err(|e| ContribError::Plan(format!("init_datasource_exec: {e}"))) + } +} From 8930b698cb17ea529d0fb6c35f2915d9657ad15d Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 11:13:08 -0400 Subject: [PATCH 11/27] feat(contrib): review-fix pass (B1-B6, I1-I10, nits, doc updates) Addresses every finding from the first review: Blockers - B1: Test isolation via ScopedContribPlannerRegistration RAII guard + _clear_for_test escape hatch (cfg-gated on test or "test-utils" feature). 
Negative-lookup test added. - B2: preTransform documented as V1-only; transformV2 explicitly does not receive a plan-tree reference. Trait + dispatcher docs aligned. - B3/B4: #[non_exhaustive] on ParquetDatasourceParams and ContribError. Constructor (`new`) + `with_*` setters on the params struct so contribs don't use struct-literal syntax. WrongChildCount.expected switched from &'static str to String. - B5: CometScanRule preTransform corruption guard -- log warning when an extension replaces a FileSourceScanExec whose relation it does not claim. - B6: Example contrib's #[ctor] wrapped in catch_unwind. Contributor guide documents panic semantics, the logger-not-ready issue (use eprintln!), cross-platform ctor-order nondeterminism. Important - I1: contrib-example removed from default features. Production cdylib has empty registered_contrib_kinds(). Build docs updated. - I2: CometExtensionRegistry.load() moved out of CometSparkSessionExtensions.apply into a lazy call at the top of CometScanRule._apply (after isCometLoaded). Sessions that never enable Comet pay zero ServiceLoader cost. - I3: CometExtensionRegistry.mergedSerdes pre-computed at load() time; CometExecRule now consults it via .get() instead of rebuilding the merged map per operator transform. Duplicate-class detector logs a warning when two contribs claim the same SparkPlan class. - I4: Multi-extension dispatch now loops over every matching extension and takes the first that returns Some; "matched but declined" continues to the next extension before falling back to core. Trait docs updated. - I5: Unit tests added: ParquetDatasourceParams constructor + setters with distinguishable bool tuple, CorePlannerContext smoke test that builds a DataSourceExec and verifies schema flow-through, session_ctx Arc identity, empty-schema conversion. 7 tests total in contrib-spi, 3 in core. 
- I6: prepare_object_store returns (ObjectStoreUrl, object_store::path::Path); contribs no longer have to reimplement URL parsing for PartitionedFile.location. - I7: preTransform fold gated on COMET_NATIVE_SCAN_ENABLED. Disabled-Comet + Delta JAR on classpath no longer strips load-bearing Catalyst wrappers. - I8: Display test for every ContribError variant verifies the dispatcher's format!("contrib planner {kind:?}: {e}") preserves variant-discriminating info. - I9: Dispatcher rejects ContribOp payloads larger than 16 MiB with a descriptive error. - I10: CometExtensionRegistry.load() logs a positive INFO message when no extensions are discovered, so users get a signal in deploy modes where the context classloader doesn't see the contrib JAR. Nits - N2: ConstantScanPlanner log moved from info! to debug!. - N4: Dead doc link to docs/contrib-delta-migration-plan.md removed. - N5: ExampleScanRuleExtensionSuite no longer calls SparkSession.stop() in finally (that tears down the JVM-wide singleton). - N8: Trimmed comet-contrib-example crate description. - N9: operator_registry test asserts ContribOp returns None from get_operator_type. - N10: row_count=0 covered by an additional unit test in the example contrib. Open-question documentation (contributor guide) - Send+Sync asymmetry between ContribOperatorPlanner and ContribPlannerContext. - SPI is alpha-stable; #[non_exhaustive] markers make additive changes minor. - &[u8] vs Bytes rationale. - --no-default-features verification + CI matrix suggestion. - Thin JAR convention + shading guidance. - Registry-primitive note (may switch to ArcSwap; API unchanged). - WrongChildCount.expected convention (free-form phrase). Verified - cargo check --no-default-features and cargo check (default features) green. - cargo test -p comet-contrib-spi -p comet-contrib-example: 10 tests pass. - cargo test -p datafusion-comet --lib (filtered): 4 tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/native/Cargo.toml | 2 +- contrib/example/native/src/lib.rs | 49 +- .../ExampleScanRuleExtensionSuite.scala | 40 +- .../contributor-guide/contrib-extensions.md | 137 +++- native/Cargo.lock | 745 ++++-------------- native/contrib-spi/Cargo.toml | 9 + native/contrib-spi/src/lib.rs | 289 ++++++- native/core/Cargo.toml | 10 +- native/core/src/execution/planner.rs | 17 + native/core/src/execution/planner/contrib.rs | 62 +- .../execution/planner/operator_registry.rs | 23 + .../comet/CometSparkSessionExtensions.scala | 7 +- .../apache/comet/rules/CometExecRule.scala | 16 +- .../apache/comet/rules/CometScanRule.scala | 86 +- .../comet/spi/CometExtensionRegistry.scala | 56 ++ .../comet/spi/CometScanRuleExtension.scala | 46 +- 16 files changed, 876 insertions(+), 718 deletions(-) diff --git a/contrib/example/native/Cargo.toml b/contrib/example/native/Cargo.toml index e9b23e2ff0..698814c603 100644 --- a/contrib/example/native/Cargo.toml +++ b/contrib/example/native/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "comet-contrib-example" -description = "Worked reference implementation of a Comet contrib extension. Registers a no-op ContribOperatorPlanner under kind=\"example-no-op\" so the SPI dispatch path can be exercised end-to-end in tests." +description = "Worked reference implementation of a Comet contrib extension. Not published; bundled as a SPI dispatch test fixture." # Contrib crates live OUTSIDE the workspace root directory (`native/`) but are listed as # workspace members in `native/Cargo.toml`. 
Cargo's auto-discovery walks up the directory # tree, so without the explicit pointer it can't find `native/Cargo.toml` from diff --git a/contrib/example/native/src/lib.rs b/contrib/example/native/src/lib.rs index 46a0fd4246..24061f8f53 100644 --- a/contrib/example/native/src/lib.rs +++ b/contrib/example/native/src/lib.rs @@ -101,7 +101,7 @@ impl ContribOperatorPlanner for ConstantScanPlanner { "ExampleConstantScan: decode failed: {e}" )) })?; - log::info!( + log::debug!( "comet-contrib-example: ConstantScanPlanner produces {} synthetic rows", msg.row_count ); @@ -113,17 +113,34 @@ impl ContribOperatorPlanner for ConstantScanPlanner { } /// Registers all of the example contrib's planners against the contrib registry at -/// library-init time. `#[ctor::ctor]` runs this constructor before -/// `main`/`JNI_OnLoad`. Comet's `libcomet` cdylib is the single library the JVM loads; -/// this constructor runs during that one library's init. +/// library-init time. `#[ctor::ctor]` runs this constructor before `main`/`JNI_OnLoad`. +/// Comet's `libcomet` cdylib is the single library the JVM loads; this constructor runs +/// during that one library's init. +/// +/// # Panic safety +/// +/// The body is wrapped in `catch_unwind` and writes to stderr on failure. A panic inside +/// `#[ctor]` aborts the entire JVM process before `JNI_OnLoad` runs and produces no +/// diagnostic on macOS/Linux without this wrapper. Every contrib's `#[ctor]` should +/// follow the same pattern; see `docs/source/contributor-guide/contrib-extensions.md`. +/// +/// # Logging +/// +/// `log::*!` macros inside `#[ctor]` are no-ops because Comet's logger is initialised +/// later, in `Java_org_apache_comet_NativeBase_init`. Use `eprintln!` (or nothing) for +/// any ctor diagnostics that must be visible. 
#[ctor::ctor] fn register() { - log::info!( - "comet-contrib-example: registering ContribOperatorPlanners \ - (no-op={EXAMPLE_NO_OP_KIND:?}, constant-scan={EXAMPLE_CONSTANT_SCAN_KIND:?})" - ); - register_contrib_planner(EXAMPLE_NO_OP_KIND, Arc::new(NoOpPlanner)); - register_contrib_planner(EXAMPLE_CONSTANT_SCAN_KIND, Arc::new(ConstantScanPlanner)); + let _ = std::panic::catch_unwind(|| { + register_contrib_planner(EXAMPLE_NO_OP_KIND, Arc::new(NoOpPlanner)); + register_contrib_planner(EXAMPLE_CONSTANT_SCAN_KIND, Arc::new(ConstantScanPlanner)); + }) + .map_err(|panic| { + eprintln!( + "comet-contrib-example: #[ctor] panicked during planner registration; \ + contrib will not be available. panic={panic:?}" + ); + }); } #[cfg(test)] @@ -164,7 +181,7 @@ mod tests { &self, _url: String, _configs: &HashMap, - ) -> Result { + ) -> Result<(ObjectStoreUrl, datafusion::object_store::path::Path), ContribError> { unimplemented!("TestCtx: prepare_object_store not used by this test") } fn build_parquet_datasource_exec( @@ -197,6 +214,16 @@ mod tests { assert!(plan.schema().fields().is_empty()); } + #[test] + fn constant_scan_handles_zero_rows() { + // Worked-example coverage: row_count = 0 must not be a special case. 
+ let payload = proto::ExampleConstantScan { row_count: 0 }.encode_to_vec(); + let planner = ConstantScanPlanner; + let ctx = test_ctx(); + let plan = planner.plan(&ctx, &payload, vec![]).expect("decode + build"); + assert!(plan.schema().fields().is_empty()); + } + #[test] fn constant_scan_surfaces_bad_payload() { let planner = ConstantScanPlanner; diff --git a/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala b/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala index 314acd9107..7de0fd6b34 100644 --- a/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala +++ b/contrib/example/src/test/scala/org/apache/comet/contrib/example/ExampleScanRuleExtensionSuite.scala @@ -54,31 +54,31 @@ class ExampleScanRuleExtensionSuite extends AnyFunSuite { // We construct a minimal HadoopFsRelation just enough to call matchesV1. The trait // method only reads `relation.options` so we don't need a real file format/schema. + // + // Use getOrCreate so the test reuses any already-running singleton SparkSession + // (e.g., from another suite). Critically, DO NOT call `stop()` in a finally block: + // stop() tears down the JVM-wide singleton and breaks every other test sharing it. 
val sparkSession = org.apache.spark.sql.SparkSession .builder() .master("local[1]") .appName("ExampleScanRuleExtensionSuite") .getOrCreate() - try { - val relationWithoutMarker = new org.apache.spark.sql.execution.datasources.HadoopFsRelation( - location = new org.apache.spark.sql.execution.datasources.InMemoryFileIndex( - sparkSession, - Seq.empty, - Map.empty, - None), - partitionSchema = new org.apache.spark.sql.types.StructType(), - dataSchema = new org.apache.spark.sql.types.StructType(), - bucketSpec = None, - fileFormat = new org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat(), - options = Map.empty)(sparkSession) - assert(!ext.matchesV1(relationWithoutMarker), "no marker -> no match") + val relationWithoutMarker = new org.apache.spark.sql.execution.datasources.HadoopFsRelation( + location = new org.apache.spark.sql.execution.datasources.InMemoryFileIndex( + sparkSession, + Seq.empty, + Map.empty, + None), + partitionSchema = new org.apache.spark.sql.types.StructType(), + dataSchema = new org.apache.spark.sql.types.StructType(), + bucketSpec = None, + fileFormat = new org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat(), + options = Map.empty)(sparkSession) + assert(!ext.matchesV1(relationWithoutMarker), "no marker -> no match") - val relationWithMarker = relationWithoutMarker.copy(options = Map( - ExampleScanRuleExtension.MarkerOptionKey -> - ExampleScanRuleExtension.MarkerOptionValue))(sparkSession) - assert(ext.matchesV1(relationWithMarker), "marker present -> match") - } finally { - sparkSession.stop() - } + val relationWithMarker = relationWithoutMarker.copy(options = Map( + ExampleScanRuleExtension.MarkerOptionKey -> + ExampleScanRuleExtension.MarkerOptionValue))(sparkSession) + assert(ext.matchesV1(relationWithMarker), "marker present -> match") } } diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index cc3741ab4c..7061069f73 100644 --- 
a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -29,6 +29,15 @@ context. This document covers how the SPI is shaped, which integration points are available, and the concrete files a new contrib has to ship. +## SPI stability + +The contrib SPI surface is currently **alpha** — minor versions may carry breaking +changes during the early-adopter period. Public types in `comet-contrib-spi` and the +Scala SPI traits are marked `#[non_exhaustive]` (or open for inheritance) so additive +changes are minor bumps. Removals and renames will be called out in release notes. Lock +your contrib to a specific Comet patch version until the SPI is declared stable in a +later release. + ## Architecture at a glance Each contrib has two halves that ship as separate artifacts but are wired together at @@ -54,9 +63,26 @@ writes into the proto. | Trait / Object | Purpose | |---|---| -| `CometScanRuleExtension` | Intercept scan-tree transformation. Override `preTransform` for tree-level rewrites (e.g., undoing your format's own Catalyst strategy); `matchesV1` / `transformV1` for V1 `FileSourceScanExec`; `matchesV2` / `transformV2` for V2 `BatchScanExec`. The first matching extension wins, returning `None` falls back to core's existing file-format dispatch. | -| `CometOperatorSerdeExtension` | Contribute additional `SparkPlan` class → `CometOperatorSerde` mappings to `CometExecRule`. Used when the contrib has its own physical operator (e.g. a contrib-specific scan exec) that needs native serialization. | -| `CometExtensionRegistry` | Process-wide singleton. `load()` is called once during `CometSparkSessionExtensions.apply`; subsequent calls are no-ops. Test-only `resetForTesting()` for unit tests that need a clean registry. | +| `CometScanRuleExtension` | Intercept scan-tree transformation. 
Override `preTransform` for tree-level rewrites (V1 only — see below); `matchesV1` / `transformV1` for V1 `FileSourceScanExec`; `matchesV2` / `transformV2` for V2 `BatchScanExec`. Dispatch iterates registered extensions in order; the first one whose `match*` returns `true` AND `transform*` returns `Some` wins. `None` means "decline this instance" and dispatch continues to the next matching extension before falling back to core. | +| `CometOperatorSerdeExtension` | Contribute additional `SparkPlan` class → `CometOperatorSerde` mappings to `CometExecRule`. The merged map is computed once at registry load time. Used when the contrib has its own physical operator (e.g., a contrib-specific scan exec) that needs native serialization. Duplicate class keys across contribs are logged as a warning at load. | +| `CometExtensionRegistry` | Process-wide singleton. `load()` is invoked lazily from `CometScanRule._apply` / `CometExecRule.apply` the first time Comet runs against a Comet-enabled session — so Spark sessions that never enable Comet pay zero ServiceLoader cost. Subsequent calls are no-ops. Test-only `resetForTesting()` exists for unit tests that need a clean registry. | + +### `preTransform` is V1-only and disabled when scan is off + +`CometScanRule` folds every registered extension's `preTransform` over the plan tree +once, before per-scan dispatch begins. The rewritten subtree is what `transformV1` +receives. `transformV2` does **not** receive a plan reference — V2 contribs that need +wrapper-stripping must do that work inside `transformV2` against `scanExec.scan` and +`scanExec.children` directly. + +The fold is skipped entirely when `spark.comet.scan.enabled=false`. A contrib's own +Catalyst wrappers (Delta's DV filter, etc.) become load-bearing when Comet's scan is +disabled; stripping them turns into a correctness bug. 
+ +`CometScanRule` also logs a warning when a `FileSourceScanExec` is replaced by an +extension whose `matchesV1` returns false against the original scan's relation — a +contrib that trips this warning is rewriting scans it doesn't recognise and may corrupt +other formats' plans. Narrow your pattern match. ### Convention: define your own SparkPlan subclass for serde dispatch @@ -85,14 +111,46 @@ changes to support. | Item | Purpose | |---|---| -| `trait ContribOperatorPlanner` | Implemented by the contrib's native crate. The `plan(payload, children) -> Arc<dyn ExecutionPlan>` method receives the contrib-private payload bytes from the ContribOp envelope and the already-built native children. | +| `trait ContribOperatorPlanner` | Implemented by the contrib's native crate. The `plan(ctx, payload, children) -> Arc<dyn ExecutionPlan>` method receives a `&dyn ContribPlannerContext` (handle to core's planner services), the contrib-private payload bytes from the `ContribOp` envelope, and the already-built native children. | +| `trait ContribPlannerContext` | Implemented by core. Exposes the parquet exec builder (`build_parquet_datasource_exec`), expression planner (`build_physical_expr`), schema conversion (`convert_spark_schema`), object-store registration (`prepare_object_store`), and the `SessionContext` itself. Contribs reach into core through this trait rather than depending on `datafusion-comet` directly. | +| `struct ParquetDatasourceParams` | `#[non_exhaustive]` argument bundle for the parquet exec builder. Construct via `ParquetDatasourceParams::new(required_schema, object_store_url, file_groups)` and chain `with_*` setters. Adding fields in future is a minor SemVer bump. | | `register_contrib_planner(kind, planner)` | Process-wide registry. Called from the contrib's `#[ctor::ctor]` at library load. | | `lookup_contrib_planner_by_kind(kind)` | Used by core's planner; contribs rarely call directly. | -| `ContribError` | Minimal error type. Core converts to its own `ExecutionError` at the dispatch site.
| +| `ContribError` | `#[non_exhaustive]` minimal error type. Core converts to its own `ExecutionError` at the dispatch site. Variants: `Plan(String)`, `BadPayload(String)`, `WrongChildCount { expected: String, actual: usize }`. Pattern matches MUST include a wildcard arm so future variants don't break consumers. | +| `ScopedContribPlannerRegistration` | `#[cfg(any(test, feature = "test-utils"))]` RAII guard for tests that register a planner without polluting the global registry. Drop restores the previous planner. Pair with `#[serial_test::serial]` if your test asserts on `registered_contrib_kinds()`. | + +The SPI crate is intentionally a thin leaf: it depends only on `datafusion`, +`datafusion-comet-proto`, and `object_store`. This is what breaks the would-be cyclic +dependency (core links contribs via Cargo feature flags; contribs need the SPI types — +both depend on a third leaf crate instead of each other). No core-typed values cross +the trait boundary. -The SPI crate is intentionally a thin leaf: it has no dependencies on core. This is what -breaks the would-be cyclic dependency (core links contribs via Cargo feature flags; -contribs need the SPI types — both depend on a third leaf crate instead of each other). +### Why `ContribOperatorPlanner` is `Send + Sync` but `ContribPlannerContext` isn't + +The planner trait is stored in an `Arc` inside a process-wide registry shared across +threads, so `Send + Sync` is load-bearing. The context is short-lived: a `&dyn` +reference passed for the duration of one synchronous `plan()` call, so the bound would +only restrict implementations without adding safety. Notably, core's `PhysicalPlanner` +carries JNI handles that aren't `Send`; requiring `Send` on the context would force an +awkward `Arc<Mutex<_>>` dance for no gain. + +Contribs that want to spawn async work during `plan()` must capture only the +`Arc<SessionContext>` (which **is** `Send + Sync`) before crossing a thread boundary — +not the `&dyn ContribPlannerContext` itself.
+ +### Why `payload: &[u8]` instead of `Bytes` + +The dispatcher already owns the decoded `ContribOp` proto; passing `&[u8]` is zero-copy +and avoids forcing every contrib to depend on the `bytes` crate. `prost::Message::decode` +accepts `&[u8]` directly. Contribs that want `Bytes` for downstream zero-copy work can +convert with `bytes::Bytes::copy_from_slice(payload)` — a single allocation, at most +once per plan call. + +### `ContribError::WrongChildCount` convention + +`expected` is a free-form human description; conventionally a phrase like `"exactly 1"` +or `"0 or 1"` so the displayed error reads: +`wrong child count: expected exactly 1, got 2`. ## Required files (mirror `contrib/example/` exactly) @@ -150,23 +208,78 @@ Plus three edits to existing files: `Arc`. 7. Core wraps the result in a `SparkPlan` and continues planning. +## `#[ctor]` registration: panic safety + logging + +The contrib's native crate registers its planners during library init via +`#[ctor::ctor]`. Two important quirks to get right: + +**Panics in `#[ctor]` abort the JVM process** before `JNI_OnLoad` runs, with no +diagnostic on macOS/Linux. Wrap every ctor body in `std::panic::catch_unwind` and emit +a stderr message on failure: + +```rust +#[ctor::ctor] +fn register() { + let _ = std::panic::catch_unwind(|| { + register_contrib_planner(MY_KIND, Arc::new(MyPlanner)); + }) + .map_err(|panic| { + eprintln!("comet-contrib-myname: #[ctor] panicked: {panic:?}"); + }); +} +``` + +**`log::*!` macros inside `#[ctor]` are no-ops.** Comet's logger is initialised later, +in `Java_org_apache_comet_NativeBase_init`. Any diagnostic you need from the ctor body +must go through `eprintln!`. The example contrib follows both patterns. + +**Cross-platform caveats.** `#[ctor::ctor]` works on Linux / macOS / Windows MSVC, but +the order of ctor execution across rlibs is link-order dependent and not guaranteed +across compiler versions. 
Your contrib's ctor **MUST NOT** depend on another contrib +already being registered. + ## Cargo feature gate Each contrib's native rlib is wired into core via a feature flag. Build core with: ```bash -# Default release build: all in-tree contribs enabled (contrib-example, future ones too) +# Default release build: zero contrib surface. registered_contrib_kinds() is empty. cargo build -# Slim build: zero contrib code in libcomet +# Enable a specific contrib explicitly: +cargo build --features contrib-example +# or +cargo build --features contrib-example,contrib-delta + +# Verify the slim build path: cargo build --no-default-features ``` +`registered_contrib_kinds()` in a default release build is empty — production +deployments only see the contribs they explicitly opted into. The CI matrix should include +a `--no-default-features` row to catch any accidental contrib leakage into core. + The JVM side is **always** conditional: the contrib JAR is its own artifact, and Spark -only picks it up when it's on the classpath. So even with the Cargo feature on, a user +only picks it up when it's on the classpath. Even with the Cargo feature on, a user who doesn't add the contrib JAR sees no behaviour change — the contrib's native planner sits dormant in the registry, waiting for a JVM serde that never calls it. +## Maven JAR packaging + +The example contrib ships a thin JAR (no shading). Real contribs SHOULD prefer thin +JARs too. If your contrib must include a third-party library that conflicts with core's +classpath (e.g., a different protobuf-java version), shade the conflicting classes +under your contrib's package prefix (`org.apache.comet.contrib.<name>.shaded.*`) so +classloader collisions stay local. Do not shade `comet-spark` or its transitive +dependencies — those are `provided` scope and the user supplies them. + +## Registry implementation note + +The native contrib planner registry is currently a `RwLock<HashMap<String, Arc<dyn ContribOperatorPlanner>>>`.
+Lookups happen once per `ContribOp` plan call; writes happen only during library init. +The implementation may switch to a lock-free primitive (`ArcSwap`) in a future release +if profiling shows the read path matters; the public API stays unchanged either way. + ## Testing `contrib/example/`'s test suite demonstrates the recommended pattern: @@ -189,8 +302,6 @@ test fixture, so PR1's CI doubles as smoke coverage for any future contribs. ## See also -- [`docs/contrib-delta-migration-plan.md`](../../../contrib-delta-migration-plan.md) — - the architectural rationale + the two-PR plan that introduced the SPI. - [`contrib/example/`](https://github.com/apache/datafusion-comet/tree/main/contrib/example) — the worked reference. - [`native/contrib-spi/`](https://github.com/apache/datafusion-comet/tree/main/native/contrib-spi) — diff --git a/native/Cargo.lock b/native/Cargo.lock index 6e4ec5e6f7..289d1ff095 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -228,60 +228,25 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "arrow" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd47f2a6ddc39244bd722a27ee5da66c03369d087b9e024eafdb03e98b98ea7" -dependencies = [ - "arrow-arith 57.3.1", - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-cast 57.3.1", - "arrow-csv 57.3.1", - "arrow-data 57.3.1", - "arrow-ipc 57.3.1", - "arrow-json 57.3.1", - "arrow-ord 57.3.1", - "arrow-row 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "arrow-string 57.3.1", -] - [[package]] name = "arrow" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "607e64bb911ee4f90483e044fe78f175989148c2892e659a2cd25429e782ec54" dependencies = [ - "arrow-arith 58.2.0", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-cast 58.2.0", - "arrow-csv 58.2.0", - 
"arrow-data 58.2.0", - "arrow-ipc 58.2.0", - "arrow-json 58.2.0", - "arrow-ord 58.2.0", - "arrow-row 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", - "arrow-string 58.2.0", -] - -[[package]] -name = "arrow-arith" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7bbd679c5418b8639b92be01f361d60013c4906574b578b77b63c78356594c" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "chrono", - "num-traits", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", ] [[package]] @@ -290,33 +255,14 @@ version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e754319ed8a85d817fe7adf183227e0b5308b82790a737b426c1124626b48118" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "num-traits", ] -[[package]] -name = "arrow-array" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8a4ab47b3f3eac60f7fd31b81e9028fda018607bcc63451aca4f2b755269862" -dependencies = [ - "ahash", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "chrono", - "chrono-tz", - "half", - "hashbrown 0.16.1", - "num-complex", - "num-integer", - "num-traits", -] - [[package]] name = "arrow-array" version = "58.2.0" @@ -324,9 +270,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841321891f247aa86c6112c80d83d89cb36e0addd020fa2425085b8eb6c3f579" dependencies = [ "ahash", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "chrono-tz", "half", @@ -336,18 +282,6 @@ 
dependencies = [ "num-traits", ] -[[package]] -name = "arrow-buffer" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d18b89b4c4f4811d0858175e79541fe98e33e18db3b011708bc287b1240593f" -dependencies = [ - "bytes", - "half", - "num-bigint", - "num-traits", -] - [[package]] name = "arrow-buffer" version = "58.2.0" @@ -360,40 +294,18 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-cast" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "722b5c41dd1d14d0a879a1bce92c6fe33f546101bb2acce57a209825edd075b3" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-ord 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "atoi", - "base64", - "chrono", - "comfy-table", - "half", - "lexical-core", - "num-traits", - "ryu", -] - [[package]] name = "arrow-cast" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca5e686972523798f76bef355145bc1ae25a84c731e650268d31ab763c701663" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-ord 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", "atoi", "base64", "chrono", @@ -404,113 +316,47 @@ dependencies = [ "ryu", ] -[[package]] -name = "arrow-csv" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ddb80a4848e03b1655af496d5ac2563a779e5742fcb48f2ca2e089c9cd2197" -dependencies = [ - "arrow-array 57.3.1", - "arrow-cast 57.3.1", - "arrow-schema 57.3.1", - "chrono", - "csv", - "csv-core", - "regex", -] - [[package]] name = "arrow-csv" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86c276756867fc8186ec380c72c290e6e3b23a1d4fb05df6b1d62d2e62666d48" dependencies = [ - "arrow-array 58.2.0", - 
"arrow-cast 58.2.0", - "arrow-schema 58.2.0", + "arrow-array", + "arrow-cast", + "arrow-schema", "chrono", "csv", "csv-core", "regex", ] -[[package]] -name = "arrow-data" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1683705c63dcf0d18972759eda48489028cbbff67af7d6bef2c6b7b74ab778a" -dependencies = [ - "arrow-buffer 57.3.1", - "arrow-schema 57.3.1", - "half", - "num-integer", - "num-traits", -] - [[package]] name = "arrow-data" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db3b5846209775b6dc8056d77ff9a032b27043383dd5488abd0b663e265b9373" dependencies = [ - "arrow-buffer 58.2.0", - "arrow-schema 58.2.0", + "arrow-buffer", + "arrow-schema", "half", "num-integer", "num-traits", ] -[[package]] -name = "arrow-ipc" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cf72d04c07229fbf4dbebe7145cac37d7cf7ec582fe705c6b92cb314af096ab" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "flatbuffers", -] - [[package]] name = "arrow-ipc" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd8907ddd8f9fbabf91ec2c85c1d81fe2874e336d2443eb36373595e28b98dd5" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "flatbuffers", - "lz4_flex 0.13.0", -] - -[[package]] -name = "arrow-json" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a84a905f41fedfcd7679813c89a61dc369c0f932b27aa8dcc6aa051cc781a97d" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-cast 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "chrono", - "half", - "indexmap 2.14.0", - "itoa", - 
"lexical-core", - "memchr", - "num-traits", - "ryu", - "serde_core", - "serde_json", - "simdutf8", + "lz4_flex", ] [[package]] @@ -519,12 +365,12 @@ version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4518c59acc501f10d7dcae397fe12b8db3d81bc7de94456f8a58f9165d6f502" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-cast 58.2.0", - "arrow-ord 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", "chrono", "half", "indexmap 2.14.0", @@ -538,43 +384,17 @@ dependencies = [ "simdutf8", ] -[[package]] -name = "arrow-ord" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "082342947d4e5a2bcccf029a0a0397e21cb3bb8421edd9571d34fb5dd2670256" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", -] - [[package]] name = "arrow-ord" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efa70d9d6b1356f1fb9f1f651b84a725b7e0abb93f188cf7d31f14abfa2f2e6f" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", -] - -[[package]] -name = "arrow-row" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3a931b520a2a5e22033e01a6f2486b4cdc26f9106b759abeebc320f125e94d7" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "half", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] [[package]] @@ -583,22 +403,13 @@ version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faec88a945338192beffbbd4be0def70135422930caa244ac3cec0cd213b26b4" dependencies = [ - "arrow-array 58.2.0", - 
"arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", ] -[[package]] -name = "arrow-schema" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4cf0d4a6609679e03002167a61074a21d7b1ad9ea65e462b2c0a97f8a3b2bc6" -dependencies = [ - "bitflags 2.11.1", -] - [[package]] name = "arrow-schema" version = "58.2.0" @@ -610,20 +421,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "arrow-select" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b320d86a9806923663bb0fd9baa65ecaba81cb0cd77ff8c1768b9716b4ef891" -dependencies = [ - "ahash", - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "num-traits", -] - [[package]] name = "arrow-select" version = "58.2.0" @@ -631,41 +428,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a657ab5132e9c8ca3b24eb15a823d0ced38017fe3930ff50167466b02e2d592c" dependencies = [ "ahash", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num-traits", ] -[[package]] -name = "arrow-string" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b493e99162e5764077e7823e50ba284858d365922631c7aaefe9487b1abd02c2" -dependencies = [ - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-data 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "memchr", - "num-traits", - "regex", - "regex-syntax", -] - [[package]] name = "arrow-string" version = "58.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6de2efbbd1a9f9780ceb8d1ff5d20421b35863b361e3386b4f571f1fc69fcb8" dependencies = [ - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-schema 58.2.0", - 
"arrow-select 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "memchr", "num-traits", "regex", @@ -1705,34 +1485,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "comet-contrib-delta" -version = "0.17.0" -dependencies = [ - "arrow 58.2.0", - "chrono", - "chrono-tz", - "comet-contrib-spi", - "ctor 0.4.3", - "datafusion", - "datafusion-comet-jni-bridge", - "datafusion-comet-proto", - "delta_kernel", - "futures", - "jni 0.22.4", - "log", - "object_store 0.12.5", - "object_store 0.13.2", - "parquet 58.1.0", - "prost", - "prost-build", - "roaring 0.10.12", - "tempfile", - "thiserror 2.0.18", - "tokio", - "url", -] - [[package]] name = "comet-contrib-example" version = "0.17.0" @@ -1753,6 +1505,7 @@ dependencies = [ "datafusion", "datafusion-comet-proto", "log", + "object_store", ] [[package]] @@ -1761,7 +1514,6 @@ version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ - "crossterm", "unicode-segmentation", "unicode-width", ] @@ -1877,21 +1629,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = "3.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" -dependencies = [ - "crc-catalog", -] - -[[package]] -name = "crc-catalog" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" - [[package]] name = "crc32c" version = "0.6.8" @@ -1979,29 +1716,6 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" -[[package]] -name = "crossterm" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" -dependencies = [ - "bitflags 2.11.1", - "crossterm_winapi", - "document-features", - "parking_lot", - "rustix 1.1.4", - "winapi", -] - -[[package]] -name = "crossterm_winapi" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" -dependencies = [ - "winapi", -] - [[package]] name = "crunchy" version = "0.2.4" @@ -2188,8 +1902,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ - "arrow 58.2.0", - "arrow-schema 58.2.0", + "arrow", + "arrow-schema", "async-trait", "bytes", "chrono", @@ -2221,9 +1935,9 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.13.2", + "object_store", "parking_lot", - "parquet 58.1.0", + "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -2239,7 +1953,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "dashmap", "datafusion-common", @@ -2253,7 +1967,7 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.13.2", + "object_store", "parking_lot", "tokio", ] @@ -2264,7 +1978,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2278,20 +1992,19 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.13.2", + "object_store", ] [[package]] name = "datafusion-comet" version = "0.17.0" dependencies = [ - "arrow 58.2.0", + "arrow", "assertables", "async-trait", "aws-config", 
"aws-credential-types", "bytes", - "comet-contrib-delta", "comet-contrib-example", "comet-contrib-spi", "criterion", @@ -2319,12 +2032,12 @@ dependencies = [ "log4rs", "mimalloc", "num", - "object_store 0.13.2", + "object_store", "object_store_opendal", "once_cell", "opendal 0.56.0", "parking_lot", - "parquet 58.1.0", + "parquet", "paste", "pprof", "procfs", @@ -2344,7 +2057,7 @@ dependencies = [ name = "datafusion-comet-common" version = "0.17.0" dependencies = [ - "arrow 58.2.0", + "arrow", "datafusion", "serde", "serde_json", @@ -2370,14 +2083,14 @@ dependencies = [ name = "datafusion-comet-jni-bridge" version = "0.17.0" dependencies = [ - "arrow 58.2.0", + "arrow", "assertables", "datafusion", "datafusion-comet-common", "jni 0.22.4", "lazy_static", "once_cell", - "parquet 58.1.0", + "parquet", "paste", "prost", "regex", @@ -2394,7 +2107,7 @@ dependencies = [ "datafusion-comet-fs-hdfs3", "fs-hdfs3", "futures", - "object_store 0.13.2", + "object_store", "tokio", ] @@ -2410,7 +2123,7 @@ dependencies = [ name = "datafusion-comet-shuffle" version = "0.17.0" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "bytes", "clap", @@ -2425,8 +2138,8 @@ dependencies = [ "itertools 0.14.0", "jni 0.21.1", "log", - "lz4_flex 0.13.0", - "parquet 58.1.0", + "lz4_flex", + "parquet", "simd-adler32", "snap", "tempfile", @@ -2438,7 +2151,7 @@ dependencies = [ name = "datafusion-comet-spark-expr" version = "0.17.0" dependencies = [ - "arrow 58.2.0", + "arrow", "base64", "chrono", "chrono-tz", @@ -2464,8 +2177,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", - "arrow 58.2.0", - "arrow-ipc 58.2.0", + "arrow", + "arrow-ipc", "chrono", "half", "hashbrown 0.16.1", @@ -2474,8 +2187,8 @@ dependencies = [ "itertools 0.14.0", "libc", "log", - "object_store 0.13.2", - "parquet 58.1.0", + "object_store", + "parquet", "paste", "sqlparser", "tokio", @@ -2499,7 
+2212,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ - "arrow 58.2.0", + "arrow", "async-compression", "async-trait", "bytes", @@ -2520,7 +2233,7 @@ dependencies = [ "itertools 0.14.0", "liblzma", "log", - "object_store 0.13.2", + "object_store", "rand 0.9.4", "tokio", "tokio-util", @@ -2534,8 +2247,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ - "arrow 58.2.0", - "arrow-ipc 58.2.0", + "arrow", + "arrow-ipc", "async-trait", "bytes", "datafusion-common", @@ -2548,7 +2261,7 @@ dependencies = [ "datafusion-session", "futures", "itertools 0.14.0", - "object_store 0.13.2", + "object_store", "tokio", ] @@ -2558,7 +2271,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "bytes", "datafusion-common", @@ -2570,7 +2283,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store 0.13.2", + "object_store", "regex", "tokio", ] @@ -2581,7 +2294,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "bytes", "datafusion-common", @@ -2593,7 +2306,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "object_store 0.13.2", + "object_store", "serde_json", "tokio", "tokio-stream", @@ -2605,7 +2318,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ - "arrow 58.2.0", + 
"arrow", "async-trait", "bytes", "datafusion-common", @@ -2623,9 +2336,9 @@ dependencies = [ "futures", "itertools 0.14.0", "log", - "object_store 0.13.2", + "object_store", "parking_lot", - "parquet 58.1.0", + "parquet", "tokio", ] @@ -2641,8 +2354,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ - "arrow 58.2.0", - "arrow-buffer 58.2.0", + "arrow", + "arrow-buffer", "async-trait", "chrono", "dashmap", @@ -2651,9 +2364,9 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "log", - "object_store 0.13.2", + "object_store", "parking_lot", - "parquet 58.1.0", + "parquet", "rand 0.9.4", "tempfile", "url", @@ -2665,7 +2378,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "chrono", "datafusion-common", @@ -2687,7 +2400,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ - "arrow 58.2.0", + "arrow", "datafusion-common", "indexmap 2.14.0", "itertools 0.14.0", @@ -2700,8 +2413,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ - "arrow 58.2.0", - "arrow-buffer 58.2.0", + "arrow", + "arrow-buffer", "base64", "blake2", "blake3", @@ -2733,7 +2446,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2755,7 +2468,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", @@ -2767,8 +2480,8 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ - "arrow 58.2.0", - "arrow-ord 58.2.0", + "arrow", + "arrow-ord", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2792,7 +2505,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ - "arrow 58.2.0", + "arrow", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2808,7 +2521,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2847,7 +2560,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ - "arrow 58.2.0", + "arrow", "chrono", "datafusion-common", "datafusion-expr", @@ -2867,7 +2580,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -2889,7 +2602,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-expr", "datafusion-functions", @@ -2905,7 +2618,7 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", - "arrow 58.2.0", + "arrow", "chrono", "datafusion-common", "datafusion-expr-common", @@ -2921,7 +2634,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -2940,9 +2653,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", - "arrow 58.2.0", - "arrow-ord 58.2.0", - "arrow-schema 58.2.0", + "arrow", + "arrow-ord", + "arrow-schema", "async-trait", "datafusion-common", "datafusion-common-runtime", @@ -2971,7 +2684,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ - "arrow 58.2.0", + "arrow", "datafusion-common", "datafusion-datasource", "datafusion-expr-common", @@ -3002,7 +2715,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" dependencies = [ - "arrow 58.2.0", + "arrow", "bigdecimal", "chrono", "crc32fast", @@ -3029,7 +2742,7 @@ version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ - "arrow 58.2.0", + "arrow", "bigdecimal", "chrono", "datafusion-common", @@ -3050,48 +2763,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "delta_kernel" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"06f7fc164b1557731fcc68a198e813811a000efade0f112d4f0a002e65042b83" -dependencies = [ - "arrow 57.3.1", - "bytes", - "chrono", - "comfy-table", - "crc", - "delta_kernel_derive", - "futures", - "indexmap 2.14.0", - "itertools 0.14.0", - "object_store 0.12.5", - "parquet 57.3.1", - "reqwest 0.12.28", - "roaring 0.11.3", - "rustc_version", - "serde", - "serde_json", - "strum", - "thiserror 2.0.18", - "tokio", - "tracing", - "url", - "uuid", - "z85", -] - -[[package]] -name = "delta_kernel_derive" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86815a2c475835751ffa9b8d9ac8ed86cf86294304c42bedd1103d54f25ecbfe" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "der" version = "0.7.10" @@ -3222,15 +2893,6 @@ dependencies = [ "const-random", ] -[[package]] -name = "document-features" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" -dependencies = [ - "litrs", -] - [[package]] name = "dtor" version = "0.0.6" @@ -3967,14 +3629,14 @@ dependencies = [ "anyhow", "apache-avro", "array-init", - "arrow-arith 58.2.0", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-cast 58.2.0", - "arrow-ord 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", - "arrow-string 58.2.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", + "arrow-string", "as-any", "async-trait", "backon", @@ -3993,10 +3655,10 @@ dependencies = [ "murmur3", "once_cell", "ordered-float 4.6.0", - "parquet 58.1.0", + "parquet", "rand 0.9.4", "reqwest 0.12.28", - "roaring 0.11.3", + "roaring", "serde", "serde_bytes", "serde_derive", @@ -4597,12 +4259,6 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" -[[package]] -name = 
"litrs" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" - [[package]] name = "lock_api" version = "0.4.14" @@ -4663,15 +4319,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" -[[package]] -name = "lz4_flex" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90071f8077f8e40adfc4b7fe9cd495ce316263f19e75c2211eeff3fdf475a3d9" -dependencies = [ - "twox-hash", -] - [[package]] name = "lz4_flex" version = "0.13.0" @@ -4926,44 +4573,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "object_store" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" -dependencies = [ - "async-trait", - "base64", - "bytes", - "chrono", - "form_urlencoded", - "futures", - "http 1.4.0", - "http-body-util", - "httparse", - "humantime", - "hyper", - "itertools 0.14.0", - "md-5", - "parking_lot", - "percent-encoding", - "quick-xml 0.38.4", - "rand 0.9.4", - "reqwest 0.12.28", - "ring", - "rustls-pemfile", - "serde", - "serde_json", - "serde_urlencoded", - "thiserror 2.0.18", - "tokio", - "tracing", - "url", - "walkdir", - "wasm-bindgen-futures", - "web-time", -] - [[package]] name = "object_store" version = "0.13.2" @@ -5015,7 +4624,7 @@ dependencies = [ "chrono", "futures", "mea", - "object_store 0.13.2", + "object_store", "opendal 0.56.0", "pin-project", "tokio", @@ -5244,43 +4853,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "57.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e832c6aa20310fc6de7ea5a3f4e20d34fd83e3b43229d32b81ffe5c14d74692" -dependencies = [ - "ahash", - "arrow-array 57.3.1", - "arrow-buffer 57.3.1", - "arrow-cast 
57.3.1", - "arrow-data 57.3.1", - "arrow-ipc 57.3.1", - "arrow-schema 57.3.1", - "arrow-select 57.3.1", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.16.1", - "lz4_flex 0.12.2", - "num-bigint", - "num-integer", - "num-traits", - "object_store 0.12.5", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - [[package]] name = "parquet" version = "58.1.0" @@ -5288,12 +4860,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" dependencies = [ "ahash", - "arrow-array 58.2.0", - "arrow-buffer 58.2.0", - "arrow-data 58.2.0", - "arrow-ipc 58.2.0", - "arrow-schema 58.2.0", - "arrow-select 58.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", "base64", "brotli", "bytes", @@ -5302,11 +4874,11 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex 0.13.0", + "lz4_flex", "num-bigint", "num-integer", "num-traits", - "object_store 0.13.2", + "object_store", "parquet-variant", "parquet-variant-compute", "parquet-variant-json", @@ -5327,7 +4899,7 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf493f3c9ddd984d0efb019f67343e4aa4bab893931f6a14b82083065dc3d28" dependencies = [ - "arrow-schema 58.2.0", + "arrow-schema", "chrono", "half", "indexmap 2.14.0", @@ -5341,8 +4913,8 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac038d46a503a7d563b4f5df5802c4315d5343d009feab195d15ac512b4cb27" dependencies = [ - "arrow 58.2.0", - "arrow-schema 58.2.0", + "arrow", + "arrow-schema", "chrono", "half", "indexmap 2.14.0", @@ -5358,7 +4930,7 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "015a09c2ffe5108766c7c1235c307b8a3c2ea64eca38455ba1a7f3a7f32f16e2" dependencies = [ - 
"arrow-schema 58.2.0", + "arrow-schema", "base64", "chrono", "parquet-variant", @@ -6179,16 +5751,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "roaring" -version = "0.10.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b" -dependencies = [ - "bytemuck", - "byteorder", -] - [[package]] name = "roaring" version = "0.11.3" @@ -6310,15 +5872,6 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" version = "1.14.0" @@ -7206,7 +6759,6 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ - "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -7405,7 +6957,6 @@ checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", - "rand 0.10.1", "serde_core", "wasm-bindgen", ] @@ -8080,12 +7631,6 @@ dependencies = [ "synstructure", ] -[[package]] -name = "z85" -version = "3.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6e61e59a957b7ccee15d2049f86e8bfd6f66968fcd88f018950662d9b86e675" - [[package]] name = "zerocopy" version = "0.8.48" diff --git a/native/contrib-spi/Cargo.toml b/native/contrib-spi/Cargo.toml index 08e1b2662a..29fde99b5c 100644 --- a/native/contrib-spi/Cargo.toml +++ b/native/contrib-spi/Cargo.toml @@ -29,4 +29,13 @@ edition = { workspace = true } # Public types in the SPI reference these crates. Pinning matches core via workspace. 
datafusion = { workspace = true } datafusion-comet-proto = { workspace = true } +# Surface the `Path` type on the SPI's prepare_object_store return value. +object_store = { workspace = true } log = "0.4" + +[features] +# Off by default. When enabled, the crate exposes `ScopedContribPlannerRegistration` and +# `_clear_for_test` for downstream test code that needs to register planners without +# polluting the process-wide registry. The same surfaces are unconditionally available +# under `#[cfg(test)]` for the SPI's own unit tests. +test-utils = [] diff --git a/native/contrib-spi/src/lib.rs b/native/contrib-spi/src/lib.rs index f40f7bbe2c..9f6bf4959c 100644 --- a/native/contrib-spi/src/lib.rs +++ b/native/contrib-spi/src/lib.rs @@ -80,9 +80,10 @@ pub trait ContribOperatorPlanner: Send + Sync { /// core's internal `init_datasource_exec` signature one-to-one, so the trait method is a /// thin forward. /// -/// Held by value rather than `&self`/builder pattern because contribs build it once per -/// plan call -- the verbose layout is easier to read at the call site than a builder -/// would be. +/// `#[non_exhaustive]` so adding fields in future is a minor SemVer bump, not a break. +/// Contribs construct via [`ParquetDatasourceParams::new`] (required fields only) + +/// `with_*` builder setters; never by struct-literal syntax. +#[non_exhaustive] pub struct ParquetDatasourceParams<'a> { pub required_schema: SchemaRef, pub data_schema: Option, @@ -100,6 +101,79 @@ pub struct ParquetDatasourceParams<'a> { pub ignore_missing_field_id: bool, } +impl<'a> ParquetDatasourceParams<'a> { + /// Minimal constructor with the parameters every parquet scan needs. All `Option`s + /// default to `None`, all `bool`s to `false`, and `session_timezone` to `"UTC"`. Use + /// the `with_*` setters to populate the rest. 
+ pub fn new( + required_schema: SchemaRef, + object_store_url: ObjectStoreUrl, + file_groups: Vec>, + ) -> Self { + Self { + required_schema, + data_schema: None, + partition_schema: None, + object_store_url, + file_groups, + projection_vector: None, + data_filters: None, + default_values: None, + session_timezone: "UTC", + case_sensitive: false, + return_null_struct_if_all_fields_missing: false, + encryption_enabled: false, + use_field_id: false, + ignore_missing_field_id: false, + } + } + + pub fn with_data_schema(mut self, schema: SchemaRef) -> Self { + self.data_schema = Some(schema); + self + } + pub fn with_partition_schema(mut self, schema: SchemaRef) -> Self { + self.partition_schema = Some(schema); + self + } + pub fn with_projection_vector(mut self, projection: Vec) -> Self { + self.projection_vector = Some(projection); + self + } + pub fn with_data_filters(mut self, filters: Vec>) -> Self { + self.data_filters = Some(filters); + self + } + pub fn with_default_values(mut self, values: HashMap) -> Self { + self.default_values = Some(values); + self + } + pub fn with_session_timezone(mut self, tz: &'a str) -> Self { + self.session_timezone = tz; + self + } + pub fn with_case_sensitive(mut self, b: bool) -> Self { + self.case_sensitive = b; + self + } + pub fn with_return_null_struct_if_all_fields_missing(mut self, b: bool) -> Self { + self.return_null_struct_if_all_fields_missing = b; + self + } + pub fn with_encryption_enabled(mut self, b: bool) -> Self { + self.encryption_enabled = b; + self + } + pub fn with_use_field_id(mut self, b: bool) -> Self { + self.use_field_id = b; + self + } + pub fn with_ignore_missing_field_id(mut self, b: bool) -> Self { + self.ignore_missing_field_id = b; + self + } +} + /// Planner services exposed by core to contribs. 
Core implements this trait against its /// `PhysicalPlanner` + `SessionContext`; contribs receive a `&dyn ContribPlannerContext` /// in their [`ContribOperatorPlanner::plan`] call and reach into core through it. @@ -134,13 +208,15 @@ pub trait ContribPlannerContext { fn convert_spark_schema(&self, fields: &[spark_operator::SparkStructField]) -> SchemaRef; /// Register an object store on the runtime env for the given URL's scheme + bucket, - /// using `object_store_configs` for credentials / endpoint overrides. Returns the - /// canonical `ObjectStoreUrl` that the contrib should attach to its `PartitionedFile`s. + /// using `object_store_configs` for credentials / endpoint overrides. Returns + /// `(ObjectStoreUrl, Path)`: the URL the contrib attaches to its `PartitionedFile`s, + /// and the canonical path within that store (caller may discard if not needed -- + /// most file-scan contribs use it to set `partitioned_file.object_meta.location`). fn prepare_object_store( &self, any_file_url: String, object_store_configs: &HashMap, - ) -> Result; + ) -> Result<(ObjectStoreUrl, object_store::path::Path), ContribError>; /// Build a `DataSourceExec` over Comet's tuned `ParquetSource`. This is the single /// most important method on the trait -- every file-scan contrib (Delta, Iceberg) @@ -155,6 +231,10 @@ pub trait ContribPlannerContext { /// [`ContribPlannerContext`]. Kept distinct from core's `ExecutionError` so this crate /// stays free of core's dependency tree. Core converts `ContribError` into its own /// `ExecutionError` at the dispatch site. +/// +/// `#[non_exhaustive]` so adding variants in the future is a minor SemVer bump, not a +/// break. Pattern matchers in contribs MUST include a wildcard arm. +#[non_exhaustive] #[derive(Debug)] pub enum ContribError { /// Generic failure. Use this for cases that don't fit the more specific variants. 
@@ -162,12 +242,10 @@ pub enum ContribError { /// The contrib received a payload it couldn't decode (wrong proto schema, missing /// required field, etc.). BadPayload(String), - /// The contrib received a child count it can't handle (e.g. a binary operator wired - /// to one child). - WrongChildCount { - expected: &'static str, - actual: usize, - }, + /// The contrib received a child count it can't handle. `expected` is a free-form + /// human description, conventionally a phrase like `"exactly 1"` or `"0 or 1"` so + /// the error message reads `wrong child count: expected exactly 1, got 2`. + WrongChildCount { expected: String, actual: usize }, } impl std::fmt::Display for ContribError { @@ -178,6 +256,10 @@ impl std::fmt::Display for ContribError { ContribError::WrongChildCount { expected, actual } => { write!(f, "wrong child count: expected {expected}, got {actual}") } + // Wildcard arm so the match stays exhaustive after future #[non_exhaustive] + // additions. Reached only by `_` constructors that don't exist today. + #[allow(unreachable_patterns)] + _ => write!(f, "unknown contrib error"), } } } @@ -230,6 +312,61 @@ pub fn registered_contrib_kinds() -> Vec { kinds } +/// RAII guard that registers a planner for the lifetime of the guard and removes it on +/// drop. Use in tests that want a planner registered without polluting the process +/// registry for other tests running in parallel. +/// +/// Not `Send` because dropping it requires the registry write lock; tests using this +/// guard should mark themselves `#[serial_test::serial]` if they assert on +/// `registered_contrib_kinds()` (whose snapshot is affected by other threads' guards). +#[cfg(any(test, feature = "test-utils"))] +pub struct ScopedContribPlannerRegistration { + kind: String, + previous: Option>, +} + +#[cfg(any(test, feature = "test-utils"))] +impl ScopedContribPlannerRegistration { + /// Install `planner` under `kind` for the lifetime of the returned guard. 
The + /// previously-registered planner (if any) is restored on drop. + pub fn new(kind: impl Into, planner: Arc) -> Self { + let kind = kind.into(); + let mut guard = registry() + .write() + .expect("contrib planner registry poisoned"); + let previous = guard.insert(kind.clone(), planner); + Self { kind, previous } + } +} + +#[cfg(any(test, feature = "test-utils"))] +impl Drop for ScopedContribPlannerRegistration { + fn drop(&mut self) { + let mut guard = registry() + .write() + .expect("contrib planner registry poisoned"); + match self.previous.take() { + Some(prev) => { + guard.insert(self.kind.clone(), prev); + } + None => { + guard.remove(&self.kind); + } + } + } +} + +/// Clear the registry. **Test-only escape hatch.** Use [`ScopedContribPlannerRegistration`] +/// instead in any test that runs in parallel with other registry users -- this function +/// removes the entries every other concurrent test depends on. +#[cfg(any(test, feature = "test-utils"))] +pub fn _clear_for_test() { + let mut guard = registry() + .write() + .expect("contrib planner registry poisoned"); + guard.clear(); +} + #[cfg(test)] mod tests { use super::*; @@ -249,15 +386,127 @@ mod tests { } } + // Use globally-unique kinds so concurrent tests in the same binary don't collide + // through the process-wide registry. The `_test_` prefix is reserved for unit tests. + + #[test] + fn unknown_kind_returns_none() { + // Independent of any registrations: a kind no one ever registers stays None. + let probe = "_test_definitely_unregistered_a8f3c1e"; + assert!(lookup_contrib_planner_by_kind(probe).is_none()); + } + + #[test] + fn scoped_registration_round_trip() { + let kind = "_test_scoped_registration_a"; + assert!(lookup_contrib_planner_by_kind(kind).is_none()); + { + let _guard = ScopedContribPlannerRegistration::new(kind, Arc::new(AlwaysEmpty)); + assert!(lookup_contrib_planner_by_kind(kind).is_some()); + } + // Dropping the guard removes the entry. 
+ assert!(lookup_contrib_planner_by_kind(kind).is_none()); + } + + #[test] + fn scoped_registration_restores_previous() { + let kind = "_test_scoped_registration_b"; + let _outer = + ScopedContribPlannerRegistration::new(kind, Arc::new(AlwaysEmpty)); + { + // Inner guard temporarily replaces the outer planner; drop restores outer. + let _inner = + ScopedContribPlannerRegistration::new(kind, Arc::new(AlwaysEmpty)); + assert!(lookup_contrib_planner_by_kind(kind).is_some()); + } + assert!(lookup_contrib_planner_by_kind(kind).is_some()); + } + #[test] - fn register_and_lookup() { - register_contrib_planner("test-spi-kind-a", Arc::new(AlwaysEmpty)); - register_contrib_planner("test-spi-kind-b", Arc::new(AlwaysEmpty)); - assert!(lookup_contrib_planner_by_kind("test-spi-kind-a").is_some()); - assert!(lookup_contrib_planner_by_kind("test-spi-kind-b").is_some()); - assert!(lookup_contrib_planner_by_kind("test-spi-kind-c").is_none()); + fn parquet_datasource_params_constructor_defaults() { + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::execution::object_store::ObjectStoreUrl; + + let schema: SchemaRef = Arc::new(Schema::new(vec![Field::new( + "id", + DataType::Int64, + false, + )])); + let url = ObjectStoreUrl::parse("file://").unwrap(); + let p = ParquetDatasourceParams::new(Arc::clone(&schema), url, vec![]); + + assert_eq!(p.required_schema.fields().len(), 1); + assert!(p.data_schema.is_none()); + assert!(p.partition_schema.is_none()); + assert!(p.projection_vector.is_none()); + assert!(p.data_filters.is_none()); + assert!(p.default_values.is_none()); + assert_eq!(p.session_timezone, "UTC"); + assert!(!p.case_sensitive); + assert!(!p.return_null_struct_if_all_fields_missing); + assert!(!p.encryption_enabled); + assert!(!p.use_field_id); + assert!(!p.ignore_missing_field_id); + } + + #[test] + fn parquet_datasource_params_setters_apply() { + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use 
datafusion::execution::object_store::ObjectStoreUrl; + + let schema: SchemaRef = Arc::new(Schema::new(vec![Field::new( + "id", + DataType::Int64, + false, + )])); + let url = ObjectStoreUrl::parse("file://").unwrap(); + let p = ParquetDatasourceParams::new(Arc::clone(&schema), url, vec![]) + .with_data_schema(Arc::clone(&schema)) + .with_session_timezone("America/Los_Angeles") + .with_case_sensitive(true) + .with_use_field_id(true) + .with_ignore_missing_field_id(true) + .with_encryption_enabled(true); + + // Distinguishable bool tuple: a swap in `init_datasource_exec`'s arg order + // would fail this assertion in core's planner::contrib tests. + assert_eq!(p.session_timezone, "America/Los_Angeles"); + assert!(p.case_sensitive); + assert!(!p.return_null_struct_if_all_fields_missing); + assert!(p.encryption_enabled); + assert!(p.use_field_id); + assert!(p.ignore_missing_field_id); + assert!(p.data_schema.is_some()); + } + + #[test] + fn contrib_error_display_preserves_variant_info() { + // The dispatcher wraps `e` via Display: `format!("contrib planner {kind:?}: {e}")`. + // These cases assert each variant's discriminating info survives that path. 
+ let plan = ContribError::Plan("plan-context-message".into()).to_string(); + assert!(plan.contains("plan-context-message")); + + let bad = ContribError::BadPayload("decoding failed at field 7".into()).to_string(); + assert!(bad.starts_with("bad payload: ")); + assert!(bad.contains("decoding failed at field 7")); + + let wcc = ContribError::WrongChildCount { + expected: "exactly 1".into(), + actual: 3, + } + .to_string(); + assert!(wcc.contains("exactly 1")); + assert!(wcc.contains("got 3")); + } + + #[test] + fn registered_contrib_kinds_reflects_current_state() { + let kind = "_test_kinds_snapshot_only"; + let _guard = ScopedContribPlannerRegistration::new(kind, Arc::new(AlwaysEmpty)); let kinds = registered_contrib_kinds(); - assert!(kinds.contains(&"test-spi-kind-a".to_string())); - assert!(kinds.contains(&"test-spi-kind-b".to_string())); + assert!( + kinds.iter().any(|k| k == kind), + "expected snapshot to include {kind:?}, got {kinds:?}" + ); } } diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index b1bb2d30b7..3fdab96398 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -100,10 +100,12 @@ datafusion-functions-nested = { version = "53.1.0" } [features] backtrace = ["datafusion/backtrace"] -# `contrib-example` is on by default so released builds ship the example contrib's -# planner registered, and the worked-reference test in contrib/example exercises it. -# `cargo build --no-default-features` produces a cdylib with zero contrib code. -default = ["hdfs-opendal", "contrib-example"] +# Released cdylib ships with hdfs-opendal only -- no contrib surface. This keeps +# `registered_contrib_kinds()` empty in production so users see only the contribs they +# explicitly opted into (Delta, Iceberg, ...). CI / dev builds turn on `contrib-example` +# (and the example's unit tests run under its own crate's test profile, which always +# links the example regardless of this list). 
+default = ["hdfs-opendal"] hdfs = ["datafusion-comet-objectstore-hdfs"] hdfs-opendal = ["opendal", "object_store_opendal", "hdfs-sys"] jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"] diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 54d7235ef6..84fb941bff 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1971,6 +1971,23 @@ impl PhysicalPlanner { lookup_contrib_planner_by_kind, CorePlannerContext, }; let kind = contrib_op.kind.as_str(); + + // Payload-size guard. A malformed Spark-side serde could produce a + // multi-GB payload that the planner would happily allocate during + // proto decode. 16 MiB is comfortably above any plausible + // file-scan payload (Delta with 100k tasks weighs in around 3-4 MiB) + // and well below "we should be worried about heap pressure". + const MAX_CONTRIB_PAYLOAD_BYTES: usize = 16 * 1024 * 1024; + if contrib_op.payload.len() > MAX_CONTRIB_PAYLOAD_BYTES { + return Err(GeneralError(format!( + "ContribOp.kind={kind:?} payload size {} bytes exceeds limit \ + of {} bytes -- inspect the contrib's serde for accidental \ + data accumulation", + contrib_op.payload.len(), + MAX_CONTRIB_PAYLOAD_BYTES, + ))); + } + let planner = lookup_contrib_planner_by_kind(kind).ok_or_else(|| { GeneralError(format!( "No contrib planner registered for ContribOp.kind={kind:?}; \ diff --git a/native/core/src/execution/planner/contrib.rs b/native/core/src/execution/planner/contrib.rs index 834c57b0c2..44672b4704 100644 --- a/native/core/src/execution/planner/contrib.rs +++ b/native/core/src/execution/planner/contrib.rs @@ -77,9 +77,8 @@ impl ContribPlannerContext for CorePlannerContext<'_> { &self, url: String, configs: &HashMap, - ) -> Result { + ) -> Result<(ObjectStoreUrl, object_store::path::Path), ContribError> { prepare_object_store_with_configs(self.planner.session_ctx().runtime_env(), url, configs) - .map(|(url, _path)| url) .map_err(|e| 
ContribError::Plan(format!("prepare_object_store_with_configs: {e}"))) } @@ -108,3 +107,62 @@ impl ContribPlannerContext for CorePlannerContext<'_> { .map_err(|e| ContribError::Plan(format!("init_datasource_exec: {e}"))) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::execution::planner::PhysicalPlanner; + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::execution::context::SessionContext; + use datafusion::execution::object_store::ObjectStoreUrl; + + #[test] + fn core_planner_context_builds_parquet_exec_with_expected_schema() { + // Smoke test for the adapter: build a minimal DataSourceExec through the SPI + // trait method and verify the schema flowed through. Catches a coarse class of + // bugs where init_datasource_exec call-site args go out of order -- a swap that + // sent `required_schema` into the `data_schema` slot would produce a different + // output schema. + let session_ctx = Arc::new(SessionContext::new()); + let planner = PhysicalPlanner::new(Arc::clone(&session_ctx), 0); + let ctx = CorePlannerContext { planner: &planner }; + + let schema: SchemaRef = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + ])); + let url = ObjectStoreUrl::parse("file://").unwrap(); + let params = ParquetDatasourceParams::new(Arc::clone(&schema), url, vec![]) + .with_session_timezone("UTC") + .with_case_sensitive(true); + + let exec = ctx + .build_parquet_datasource_exec(params) + .expect("adapter should build a DataSourceExec"); + + // The exec's reported schema must equal the required_schema we passed in. 
+ let out_schema = exec.schema(); + assert_eq!(out_schema.fields().len(), 2); + assert_eq!(out_schema.field(0).name(), "id"); + assert_eq!(out_schema.field(1).name(), "name"); + } + + #[test] + fn core_planner_context_session_ctx_round_trip() { + let session_ctx = Arc::new(SessionContext::new()); + let planner = PhysicalPlanner::new(Arc::clone(&session_ctx), 0); + let ctx = CorePlannerContext { planner: &planner }; + // Arc identity check -- the contrib gets back the same SessionContext core was + // built with, not a copy. + assert!(Arc::ptr_eq(ctx.session_ctx(), &session_ctx)); + } + + #[test] + fn core_planner_context_converts_empty_schema() { + let session_ctx = Arc::new(SessionContext::new()); + let planner = PhysicalPlanner::new(Arc::clone(&session_ctx), 0); + let ctx = CorePlannerContext { planner: &planner }; + let schema = ctx.convert_spark_schema(&[]); + assert_eq!(schema.fields().len(), 0); + } +} diff --git a/native/core/src/execution/planner/operator_registry.rs b/native/core/src/execution/planner/operator_registry.rs index 81d5151717..302c3c9489 100644 --- a/native/core/src/execution/planner/operator_registry.rs +++ b/native/core/src/execution/planner/operator_registry.rs @@ -159,3 +159,26 @@ fn get_operator_type(spark_operator: &Operator) -> Option { OpStruct::ContribOp(_) => None, } } + +#[cfg(test)] +mod tests { + use super::*; + use datafusion_comet_proto::spark_operator::{operator::OpStruct, ContribOp, Operator}; + + #[test] + fn contrib_op_is_not_handled_by_in_tree_registry() { + // Guard against a future refactor that wires ContribOp into the in-tree + // operator registry by accident (which would double-dispatch contribs). 
+ let op = Operator { + op_struct: Some(OpStruct::ContribOp(ContribOp { + kind: "anything".into(), + payload: vec![], + })), + ..Default::default() + }; + assert!( + get_operator_type(&op).is_none(), + "ContribOp must not be mapped to an in-tree OperatorType" + ); + } +} diff --git a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala index 469fc0b409..311d2d2a6f 100644 --- a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala +++ b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala @@ -87,10 +87,9 @@ class CometSparkSessionExtensions with Logging with ShimCometSparkSessionExtensions { override def apply(extensions: SparkSessionExtensions): Unit = { - // Discover contrib extensions on the classpath BEFORE registering our rules so that - // CometScanRule / CometExecRule see the contribs the first time they run. Idempotent - // and safe to call multiple times across SparkSession instances within the same JVM. - org.apache.comet.spi.CometExtensionRegistry.load() + // Note: contrib extension discovery happens lazily inside CometScanRule / + // CometExecRule (the first time either runs against a Comet-enabled session). + // Sessions that never enable Comet pay zero ServiceLoader cost. 
extensions.injectColumnar { session => CometScanColumnar(session) } extensions.injectColumnar { session => CometExecColumnar(session) } diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index a1d324065f..ea81110470 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -351,15 +351,15 @@ case class CometExecRule(session: SparkSession) // registered handler for creating a fully native plan if (op.children.forall(_.isInstanceOf[CometNativeExec])) { // Contrib SPI: each registered CometOperatorSerdeExtension contributes a - // SparkPlan-class -> CometOperatorSerde map. We merge those over `allExecs` - // here so contrib operators (e.g. a future CometDeltaNativeScanExec from a - // delta contrib) get dispatched the same way built-in operators do. Contribs - // own classes that aren't in `allExecs`, so this merge never overrides a core - // mapping in practice. - val contribSerdes = - CometExtensionRegistry.serdeExtensions.flatMap(_.serdes).toMap - val handler = (allExecs ++ contribSerdes) + // SparkPlan-class -> CometOperatorSerde map. The merged map is pre-computed + // once at registry load time (CometExtensionRegistry.mergedSerdes) so we + // don't rebuild a HashMap on every operator transform. Contribs own classes + // that aren't in `allExecs`, so this merge never overrides a core mapping in + // practice; duplicate-class detection at load() time logs a warning if it + // does happen. 
+ val handler = allExecs .get(op.getClass) + .orElse(CometExtensionRegistry.mergedSerdes.get(op.getClass)) .map(_.asInstanceOf[CometOperatorSerde[SparkPlan]]) handler match { case Some(handler) => diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 20410faa0e..d0f98cd189 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -76,6 +76,11 @@ case class CometScanRule(session: SparkSession) private def _apply(plan: SparkPlan): SparkPlan = { if (!isCometLoaded(conf)) return plan + // Lazy contrib discovery: by the time we get here Comet is enabled. load() is + // idempotent so subsequent invocations across plans / sessions are free. Sessions + // that never reach this point pay zero ServiceLoader cost. + CometExtensionRegistry.load() + // Comet does not support structured streaming. The parallel guard in // CometExecRule only stops operator wrapping, so without this check we // would still rewrite scans to CometScanExec in a streaming plan. @@ -121,8 +126,38 @@ case class CometScanRule(session: SparkSession) // `PreprocessTableWithDVs` is the canonical case). Fold in registration order so // contribs see each other's outputs deterministically. Extensions that don't override // `preTransform` inherit the trait's identity default -- zero overhead. - val prepped = CometExtensionRegistry.scanExtensions - .foldLeft(plan)((p, ext) => ext.preTransform(p, session)) + // + // Gated on COMET_NATIVE_SCAN_ENABLED: if the user has disabled Comet scan, the + // contribs' Catalyst wrappers (Delta's DV filter, etc.) are load-bearing and stripping + // them turns into a correctness bug. Leave the plan tree as Spark wrote it. 
+ // + // Corruption guard: snapshot scan classes before each extension's pass and after; if + // a non-matching scan's class identity changed, log a warning naming the extension. + // Contribs' `preTransform` MUST only rewrite scans they recognise; this guard catches + // the common violation early. Light overhead (one collect per extension); only fires + // a warning when the contract is broken. + val prepped = + if (!CometConf.COMET_NATIVE_SCAN_ENABLED.get(conf)) { + plan + } else { + CometExtensionRegistry.scanExtensions.foldLeft(plan) { (p, ext) => + val before = p.collect { case s: FileSourceScanExec => s } + val after = ext.preTransform(p, session) + val afterScans = after.collect { case s: FileSourceScanExec => s } + if (before.size == afterScans.size) { + before.zip(afterScans).foreach { case (b, a) => + if ((b ne a) && b.getClass == a.getClass && !ext.matchesV1(b.relation)) { + logWarning( + s"CometScanRuleExtension '${ext.name}'.preTransform replaced a " + + s"FileSourceScanExec it does not claim (matchesV1=false). This is a " + + s"contract violation -- preTransform must only rewrite scans the " + + s"extension recognises. See CometScanRuleExtension.preTransform doc.") + } + } + } + after + } + } val fullPlan = prepped @@ -172,21 +207,22 @@ case class CometScanRule(session: SparkSession) } // Contrib SPI dispatch: offer the scan to every registered CometScanRuleExtension - // before core's built-in file-format logic. The first extension whose `matchesV1` - // returns true gets `transformV1` called -- if that returns Some, the result replaces - // the scan branch entirely. Returning None means "I matched but ultimately can't - // accelerate this one", and core's existing logic handles it. Iterating in - // registration order makes contrib selection deterministic. + // before core's built-in file-format logic. 
Loop in registration order; the FIRST + // extension whose `matchesV1` returns true AND whose `transformV1` returns Some(_) + // wins -- its replacement plan is returned. An extension that returns None from + // `transformV1` means "I match this scan shape but decline to accelerate this + // specific instance"; the loop continues to the next extension before falling back + // to core's built-in file-format logic. This lets multiple contribs coexist (e.g. + // Iceberg + Delta both loaded) without one's decline silently masking another. scanExec.relation match { case r: HadoopFsRelation => - val matched = CometExtensionRegistry.scanExtensions.find(_.matchesV1(r)) - matched match { - case Some(ext) => - ext.transformV1(plan, scanExec, session) match { - case Some(replacement) => return replacement - case None => // extension matched but declined; fall through - } - case None => // no extension matched; fall through + val replacement = CometExtensionRegistry.scanExtensions.iterator + .filter(_.matchesV1(r)) + .flatMap(ext => ext.transformV1(plan, scanExec, session)) + .nextOption() + replacement match { + case Some(plan) => return plan + case None => // no extension produced a replacement; fall through } case _ => // SPI only operates on HadoopFsRelation V1 scans } @@ -289,16 +325,16 @@ case class CometScanRule(session: SparkSession) private def transformV2Scan(scanExec: BatchScanExec): SparkPlan = { - // Contrib SPI dispatch (V2): same shape as transformV1Scan above. First matching - // extension wins; None return falls through to core's logic. - val matched = CometExtensionRegistry.scanExtensions.find(_.matchesV2(scanExec)) - matched match { - case Some(ext) => - ext.transformV2(scanExec, session) match { - case Some(replacement) => return replacement - case None => // extension matched but declined; fall through - } - case None => // no extension matched; fall through + // Contrib SPI dispatch (V2): mirrors transformV1Scan. 
Loop in registration order; + // first matching extension whose transformV2 returns Some wins. Decline = continue + // to next extension. + val replacement = CometExtensionRegistry.scanExtensions.iterator + .filter(_.matchesV2(scanExec)) + .flatMap(ext => ext.transformV2(scanExec, session)) + .nextOption() + replacement match { + case Some(plan) => return plan + case None => // no extension produced a replacement; fall through } scanExec.scan match { diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index 5d17e0468e..b262d80785 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -60,7 +60,19 @@ object CometExtensionRegistry extends Logging { s"Comet contrib extensions loaded: " + s"scan=[${scanExts.map(_.name).mkString(", ")}], " + s"serde=[${serdeExts.map(_.name).mkString(", ")}]") + detectDuplicateSerdeClasses(serdeExts) + } else { + // Positive signal that discovery ran. Some Spark deploy modes (Ivy `--packages`, + // isolated UDF classloaders) put Comet on a classloader that the TCCL fallback + // doesn't see; absent extensions go silent without this line. + logInfo( + "Comet contrib extensions: none discovered on classpath " + + "(no META-INF/services entries for CometScanRuleExtension or " + + "CometOperatorSerdeExtension)") } + // Build the merged exec map once at load time. CometExecRule reads it on every + // operator transform; rebuilding per-call would be wasteful. + mergedSerdesCache = serdeExts.flatMap(_.serdes).toMap } } @@ -70,6 +82,49 @@ object CometExtensionRegistry extends Logging { /** Registered operator-serde extensions, in classpath discovery order. 
*/ def serdeExtensions: Seq[CometOperatorSerdeExtension] = serdeExts + /** + * Pre-merged serde map across all registered extensions, keyed by the `Class[_ <: SparkPlan]` + * the contrib uses for class-keyed dispatch in `CometExecRule`. Computed once at `load()` time; + * an empty map until `load()` has run. + */ + def mergedSerdes: Map[Class[_ <: org.apache.spark.sql.execution.SparkPlan], + org.apache.comet.serde.CometOperatorSerde[_]] = mergedSerdesCache + + @volatile private var mergedSerdesCache + : Map[Class[_ <: org.apache.spark.sql.execution.SparkPlan], + org.apache.comet.serde.CometOperatorSerde[_]] = Map.empty + + /** + * Log a warning when two registered contribs claim the same `Class[_ <: SparkPlan]` for serde + * dispatch. The convention documented in `contrib-extensions.md` is that each contrib defines + * its own exec class and registers a serde keyed on that class; a collision usually means a + * contrib subclassed a core exec by mistake. + * + * Detection only -- the last-write-wins toMap behavior stands. We log so the user has a chance + * to notice; preventing the override would be a harder migration path (silent drop of one + * contrib's exec). + */ + private def detectDuplicateSerdeClasses(exts: Seq[CometOperatorSerdeExtension]): Unit = { + val perClassOwners = scala.collection.mutable.Map + .empty[Class[_ <: org.apache.spark.sql.execution.SparkPlan], scala.collection.mutable.ArrayBuffer[String]] + exts.foreach { ext => + ext.serdes.keys.foreach { cls => + perClassOwners + .getOrElseUpdate(cls, scala.collection.mutable.ArrayBuffer.empty) + .+=(ext.name) + } + } + perClassOwners.foreach { case (cls, owners) => + if (owners.size > 1) { + logWarning( + s"Multiple Comet contrib extensions claim the same exec class " + + s"${cls.getName}: [${owners.mkString(", ")}]. 
Last-write-wins; " + + s"this usually indicates a contrib has subclassed a core or " + + s"another contrib's exec instead of defining its own.") + } + } + } + /** * Test-only: reset the registry to the empty state. Lets unit tests re-run discovery with a * different classpath / overridden services. Not for production use. @@ -78,6 +133,7 @@ object CometExtensionRegistry extends Logging { loaded.set(false) scanExts = Seq.empty serdeExts = Seq.empty + mergedSerdesCache = Map.empty } private def loadOne[T](label: String)(implicit ct: scala.reflect.ClassTag[T]): Seq[T] = { diff --git a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala index 376607d518..db57d17eb2 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala @@ -31,9 +31,12 @@ import org.apache.spark.sql.execution.datasources.v2.BatchScanExec * * `CometScanRule` discovers implementations via `CometExtensionRegistry.scanExtensions` * (ServiceLoader-backed) and offers each candidate scan to every registered extension in - * registration order. The first extension whose [[matches]] returns `true` wins -- its - * [[transformV1]] / [[transformV2]] is called and the returned plan replaces the scan branch. If - * no extension matches, the core's existing file-format dispatch handles the scan as before. + * registration order. The first extension whose [[matchesV1]] (or [[matchesV2]]) returns true + * AND whose [[transformV1]] (or [[transformV2]]) returns `Some(_)` wins -- its returned plan + * replaces the scan subtree. An extension whose `matches` is true but whose `transform` returns + * `None` is treated as "declined this instance"; dispatch continues to the next matching + * extension. After every matching extension has declined, core's built-in file-format dispatch + * handles the scan as before. 
* * Contribs are discovered via the standard Java ServiceLoader. Each contrib JAR ships a * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` resource listing its extension @@ -60,14 +63,28 @@ trait CometScanRuleExtension { * produced and the downstream `Filter` silently drops every row. The Delta contrib's * `preTransform` strips the wrapper so the clean scan reaches per-scan dispatch. * - * Implementations MUST NOT modify scans they don't recognise. Multiple registered + * '''V1-only.''' `preTransform` runs once for the whole plan and the rewritten tree is + * what later `transformV1` calls see via their `plan` argument. `transformV2` does NOT + * receive a plan-tree reference -- only the matched `BatchScanExec`. V2 contribs that need + * wrapper-stripping must do that work inside `transformV2` against `scanExec.scan` / + * `scanExec.children` directly. + * + * '''Disabled when scan conversion is off.''' `CometScanRule` skips the entire preTransform + * fold when `spark.comet.scan.enabled=false`. A contrib's own wrappers (Delta's DV filter, + * etc.) are load-bearing in that case; stripping them turns into a correctness bug. + * + * '''MUST NOT modify scans the extension does not recognise.''' Multiple registered * extensions are folded over the plan in registration order; an extension that rewrites * scans outside its format's domain will silently corrupt other formats' plans. + * `CometScanRule` logs a warning when a `FileSourceScanExec` is replaced by an extension + * whose `matchesV1` returns false against the original scan's relation -- contribs that + * trip this warning should narrow their pattern match. * - * Shared state between this pre-pass and later `transformV1` / `transformV2` calls is the - * contrib's problem. The recommended pattern is to attach a Spark `TreeNodeTag` to nodes - * during `preTransform` and read it during `transformV1`. Spark's tag mechanism is - * tree-immutable-safe and survives plan transformations. 
+ * '''State sharing.''' Shared state between this pre-pass and later `transformV1` calls + * is the contrib's problem. The recommended pattern is to attach a Spark `TreeNodeTag` + * to nodes during `preTransform` and read it during `transformV1`. Spark's tag mechanism + * is tree-immutable-safe and survives plan transformations -- preferred over external + * mutable state which leaks across plans. */ def preTransform(plan: SparkPlan, session: SparkSession): SparkPlan = plan @@ -83,8 +100,11 @@ trait CometScanRuleExtension { /** * Transform the matched V1 scan. Called only when `matchesV1` returned true. * - * Returning `None` means "I matched but ultimately can't accelerate this one" -- the core falls - * back to its existing file-format dispatch. Returning `Some(plan)` replaces the scan subtree. + * Returning `None` means "I matched the scan shape but ultimately can't accelerate this + * specific instance" -- `CometScanRule` then continues to the NEXT registered extension + * whose `matchesV1` is true, falling back to core's built-in file-format dispatch only + * after every matching extension has declined. Returning `Some(plan)` ends dispatch and + * replaces the scan subtree with `plan`. */ def transformV1( plan: SparkPlan, @@ -100,6 +120,12 @@ trait CometScanRuleExtension { /** * Transform the matched V2 scan. Called only when `matchesV2` returned true. + * + * Same semantics as `transformV1`: `None` falls through to the next matching extension; + * `Some(plan)` ends dispatch. Note that unlike `transformV1`, this method does NOT + * receive a plan-tree reference -- `preTransform` rewrites are not visible here. V2 + * contribs that need wrapper-stripping must operate on `scanExec.scan` / + * `scanExec.children` directly. 
*/ def transformV2(scanExec: BatchScanExec, session: SparkSession): Option[SparkPlan] = None } From 68fff43f33a14e5fe88753300b8b3a4c506f46e1 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 12:08:49 -0400 Subject: [PATCH 12/27] feat(contrib): second-pass review fixes (R1-R7, N-NEW-1/2/7) Regressions - R1: CometExecRule._apply now calls CometExtensionRegistry.load() at the top (after isCometLoaded). Previously only CometScanRule.load()ed; rule-injection order changes or partial injection would have left mergedSerdes empty and silently un-dispatched contribs. - R2: CometExtensionRegistry class docstring updated to match the lazy-load semantics introduced in the first-pass fix. - R3: Three remaining dead references to docs/contrib-delta-migration-plan.md rewritten to point at contrib-extensions.md (native/core/Cargo.toml, root pom.xml, PR1-description.md). - R4: ContribError::Display wildcard arm now emits {self:?} instead of "unknown contrib error" so future variants under #[non_exhaustive] keep their debug-repr message through the dispatcher's format chain. - R5/R6: preTransform corruption guard rewritten to scan-identity check. Snapshots every FileSourceScanExec the extension does NOT claim before the pass, then verifies each one survives in the rewritten tree by reference equality. Catches class-changing replacements (which the old check missed) and is robust to plan-tree reordering (which the old zip-by-position was not). - R7: New unit test core_planner_context_encryption_flag_reaches_init_datasource_exec uses the encryption_enabled asymmetry (true triggers a factory lookup that fails when no factory is configured; false silently succeeds) to verify that bool reaches the right positional slot in init_datasource_exec. A swap with case_sensitive / use_field_id / etc. would now fail this test. Surface tweak - N-NEW-1: ParquetDatasourceParams::session_timezone switched from &'a str to owned String. 
with_session_timezone now accepts `impl Into<String>` so contribs can pass runtime-computed timezones (from a session config lookup) without juggling lifetimes. ParquetDatasourceParams loses its lifetime parameter entirely. Dispatcher - N-NEW-2: Payload-size guard moved to AFTER the planner lookup. A bogus kind now produces the "not registered" error rather than misleadingly blaming an oversized payload. CI / regression guard - N-NEW-7: New unit test production_build_has_no_contrib_planners_registered, gated on `#[cfg(not(feature = "contrib-example"))]`, asserts the default cdylib carries zero contrib surface. Catches an accidental re-introduction of a contrib into core's `default = [...]` feature set. Verified - cargo check (default features): green. - cargo test -p datafusion-comet --lib --no-default-features: 135 tests pass including the new production-canary. - cargo test -p datafusion-comet --lib -- execution::planner::contrib: 4 tests pass including the encryption-flag witness. - cargo test -p comet-contrib-spi -p comet-contrib-example: 10 tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/native/src/lib.rs | 2 +- native/contrib-spi/src/lib.rs | 27 ++++--- native/core/Cargo.toml | 6 +- native/core/src/execution/planner.rs | 27 ++++--- native/core/src/execution/planner/contrib.rs | 73 ++++++++++++++++++- pom.xml | 6 +- .../apache/comet/rules/CometExecRule.scala | 6 ++ .../apache/comet/rules/CometScanRule.scala | 41 +++++++---- .../comet/spi/CometExtensionRegistry.scala | 5 +- 9 files changed, 147 insertions(+), 46 deletions(-) diff --git a/contrib/example/native/src/lib.rs b/contrib/example/native/src/lib.rs index 24061f8f53..8bd0753fe2 100644 --- a/contrib/example/native/src/lib.rs +++ b/contrib/example/native/src/lib.rs @@ -186,7 +186,7 @@ mod tests { } fn build_parquet_datasource_exec( &self, - _params: ParquetDatasourceParams<'_>, + _params: ParquetDatasourceParams, ) -> Result, ContribError> { unimplemented!("TestCtx: build_parquet_datasource_exec not used by this test") } diff --git a/native/contrib-spi/src/lib.rs b/native/contrib-spi/src/lib.rs index 9f6bf4959c..02881a263a 100644 --- a/native/contrib-spi/src/lib.rs +++ b/native/contrib-spi/src/lib.rs @@ -83,8 +83,12 @@ pub trait ContribOperatorPlanner: Send + Sync { /// `#[non_exhaustive]` so adding fields in future is a minor SemVer bump, not a break. /// Contribs construct via [`ParquetDatasourceParams::new`] (required fields only) + /// `with_*` builder setters; never by struct-literal syntax. +/// +/// `session_timezone` is owned (`String`) so contribs can pass a runtime-computed value +/// (from a session config lookup) without juggling lifetimes. The string is one-time +/// per plan call, so the allocation is negligible. 
#[non_exhaustive] -pub struct ParquetDatasourceParams<'a> { +pub struct ParquetDatasourceParams { pub required_schema: SchemaRef, pub data_schema: Option, pub partition_schema: Option, @@ -93,7 +97,7 @@ pub struct ParquetDatasourceParams<'a> { pub projection_vector: Option>, pub data_filters: Option>>, pub default_values: Option>, - pub session_timezone: &'a str, + pub session_timezone: String, pub case_sensitive: bool, pub return_null_struct_if_all_fields_missing: bool, pub encryption_enabled: bool, @@ -101,7 +105,7 @@ pub struct ParquetDatasourceParams<'a> { pub ignore_missing_field_id: bool, } -impl<'a> ParquetDatasourceParams<'a> { +impl ParquetDatasourceParams { /// Minimal constructor with the parameters every parquet scan needs. All `Option`s /// default to `None`, all `bool`s to `false`, and `session_timezone` to `"UTC"`. Use /// the `with_*` setters to populate the rest. @@ -119,7 +123,7 @@ impl<'a> ParquetDatasourceParams<'a> { projection_vector: None, data_filters: None, default_values: None, - session_timezone: "UTC", + session_timezone: "UTC".to_string(), case_sensitive: false, return_null_struct_if_all_fields_missing: false, encryption_enabled: false, @@ -148,8 +152,10 @@ impl<'a> ParquetDatasourceParams<'a> { self.default_values = Some(values); self } - pub fn with_session_timezone(mut self, tz: &'a str) -> Self { - self.session_timezone = tz; + /// Accepts anything that can be turned into a `String` -- string literals, + /// `&str` borrowed from session config, owned `String`s -- without lifetime games. + pub fn with_session_timezone(mut self, tz: impl Into) -> Self { + self.session_timezone = tz.into(); self } pub fn with_case_sensitive(mut self, b: bool) -> Self { @@ -223,7 +229,7 @@ pub trait ContribPlannerContext { /// goes through here so the contrib doesn't have to rebuild Comet's parquet plumbing. 
fn build_parquet_datasource_exec( &self, - params: ParquetDatasourceParams<'_>, + params: ParquetDatasourceParams, ) -> Result, ContribError>; } @@ -256,10 +262,11 @@ impl std::fmt::Display for ContribError { ContribError::WrongChildCount { expected, actual } => { write!(f, "wrong child count: expected {expected}, got {actual}") } - // Wildcard arm so the match stays exhaustive after future #[non_exhaustive] - // additions. Reached only by `_` constructors that don't exist today. + // Wildcard for future variants added under #[non_exhaustive]. Use the Debug + // repr so the dispatcher's `format!("contrib planner ...: {e}")` carries a + // useful message rather than swallowing the variant. #[allow(unreachable_patterns)] - _ => write!(f, "unknown contrib error"), + other => write!(f, "{other:?}"), } } } diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml index 3fdab96398..0f42b2a6e1 100644 --- a/native/core/Cargo.toml +++ b/native/core/Cargo.toml @@ -112,9 +112,9 @@ jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"] # Contrib feature flags. Each flag pulls a contrib rlib into core's cdylib so contrib # Rust code is linked into the single libcomet at build time; the contrib's #[ctor] -# registers its operator planners during library init. See -# docs/contrib-delta-migration-plan.md for the architectural rationale (single cdylib -# instead of separate cdylib per contrib). +# registers its operator planners during library init. The single-cdylib architecture +# (rather than separate cdylib per contrib) is documented in +# docs/source/contributor-guide/contrib-extensions.md. 
contrib-example = ["dep:comet-contrib-example"] # exclude optional packages from cargo machete verifications diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 84fb941bff..6e5e1785f8 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1972,11 +1972,22 @@ impl PhysicalPlanner { }; let kind = contrib_op.kind.as_str(); + // Look up the planner first so a bogus kind produces the "not registered" + // error rather than a misleading "payload too big" one (in case the kind + // is garbage and the payload also happens to be oversized). + let planner = lookup_contrib_planner_by_kind(kind).ok_or_else(|| { + GeneralError(format!( + "No contrib planner registered for ContribOp.kind={kind:?}; \ + did you build core with the corresponding `contrib-{kind}` \ + Cargo feature (or its workspace equivalent)?" + )) + })?; + // Payload-size guard. A malformed Spark-side serde could produce a - // multi-GB payload that the planner would happily allocate during - // proto decode. 16 MiB is comfortably above any plausible - // file-scan payload (Delta with 100k tasks weighs in around 3-4 MiB) - // and well below "we should be worried about heap pressure". + // multi-GB payload that the planner would happily allocate during proto + // decode. 16 MiB is comfortably above any plausible file-scan payload + // (Delta with 100k tasks weighs in around 3-4 MiB) and well below "we + // should be worried about heap pressure". const MAX_CONTRIB_PAYLOAD_BYTES: usize = 16 * 1024 * 1024; if contrib_op.payload.len() > MAX_CONTRIB_PAYLOAD_BYTES { return Err(GeneralError(format!( @@ -1988,14 +1999,6 @@ impl PhysicalPlanner { ))); } - let planner = lookup_contrib_planner_by_kind(kind).ok_or_else(|| { - GeneralError(format!( - "No contrib planner registered for ContribOp.kind={kind:?}; \ - did you build core with the corresponding `contrib-{kind}` \ - Cargo feature (or its workspace equivalent)?" 
- )) - })?; - // Recursively build native children. The contrib gets them as // `Arc` rather than the richer `SparkPlan` because the // SPI is intentionally minimal — contribs only need the DataFusion-level diff --git a/native/core/src/execution/planner/contrib.rs b/native/core/src/execution/planner/contrib.rs index 44672b4704..1a3b742611 100644 --- a/native/core/src/execution/planner/contrib.rs +++ b/native/core/src/execution/planner/contrib.rs @@ -84,7 +84,7 @@ impl ContribPlannerContext for CorePlannerContext<'_> { fn build_parquet_datasource_exec( &self, - params: ParquetDatasourceParams<'_>, + params: ParquetDatasourceParams, ) -> Result, ContribError> { init_datasource_exec( params.required_schema, @@ -95,7 +95,7 @@ impl ContribPlannerContext for CorePlannerContext<'_> { params.projection_vector, params.data_filters, params.default_values, - params.session_timezone, + &params.session_timezone, params.case_sensitive, params.return_null_struct_if_all_fields_missing, self.planner.session_ctx(), @@ -116,6 +116,24 @@ mod tests { use datafusion::execution::context::SessionContext; use datafusion::execution::object_store::ObjectStoreUrl; + /// Production-build assertion: when no contrib feature is enabled, the registry + /// must be empty. Catches an accidental re-introduction of a contrib into core's + /// `default = [...]` feature set. Compiled out under `--features contrib-example` + /// (the test binary always links its crate's dependencies, so this assertion would + /// be wrong under that flag). + #[cfg(not(feature = "contrib-example"))] + #[test] + fn production_build_has_no_contrib_planners_registered() { + // Direct read through the SPI's public API. This test is the canary for + // the contributor-guide claim that release builds carry zero contrib surface. + let kinds = comet_contrib_spi::registered_contrib_kinds(); + assert!( + kinds.is_empty(), + "default cdylib leaked contrib planners: {kinds:?}.
\ + Check native/core/Cargo.toml's `default = [...]` for contrib features." + ); + } + #[test] fn core_planner_context_builds_parquet_exec_with_expected_schema() { // Smoke test for the adapter: build a minimal DataSourceExec through the SPI @@ -165,4 +183,55 @@ mod tests { let schema = ctx.convert_spark_schema(&[]); assert_eq!(schema.fields().len(), 0); } + + #[test] + fn core_planner_context_encryption_flag_reaches_init_datasource_exec() { + // Cross-crate positional-arg-swap guard. `init_datasource_exec` takes five `bool` + // parameters in a row (case_sensitive, return_null_struct_..., encryption_enabled, + // use_field_id, ignore_missing_field_id); a swap of two of them at the call site + // in `build_parquet_datasource_exec` would compile fine and break silently. We + // exploit the asymmetry that `encryption_enabled=true` triggers an encryption- + // factory lookup that fails when no factory is registered, while every other + // bool being `true` keeps the call succeeding. So: + // * Default (all bools false) -> Ok + // * Same call with `encryption_enabled=true` -> Err on factory lookup + // If a swap accidentally routed e.g. `use_field_id` into the encryption slot, the + // "default" variant below would fail (because use_field_id is true here in the + // params struct, and the swapped slot would now enable encryption). + let session_ctx = Arc::new(SessionContext::new()); + let planner = PhysicalPlanner::new(Arc::clone(&session_ctx), 0); + let ctx = CorePlannerContext { planner: &planner }; + + let schema: SchemaRef = Arc::new(Schema::new(vec![Field::new( + "id", + DataType::Int64, + false, + )])); + let url = ObjectStoreUrl::parse("file://").unwrap(); + + // Witness #1: all five bools `true` EXCEPT encryption_enabled. Must succeed -- + // confirms case_sensitive / use_field_id / etc. are NOT routed into the + // encryption slot. 
+ let no_encryption = ParquetDatasourceParams::new(Arc::clone(&schema), url.clone(), vec![]) + .with_case_sensitive(true) + .with_return_null_struct_if_all_fields_missing(true) + .with_use_field_id(true) + .with_ignore_missing_field_id(true) + .with_encryption_enabled(false); + ctx.build_parquet_datasource_exec(no_encryption) + .expect("encryption_enabled=false must not trigger factory lookup"); + + // Witness #2: only encryption_enabled is true. Must fail with the encryption-factory + // not-found error. Confirms encryption_enabled actually reaches the encryption slot. + let with_encryption = + ParquetDatasourceParams::new(Arc::clone(&schema), url, vec![]).with_encryption_enabled(true); + let err = ctx + .build_parquet_datasource_exec(with_encryption) + .expect_err("encryption_enabled=true should fail without a factory"); + let msg = format!("{err}"); + assert!( + msg.contains("encryption") || msg.contains("Encryption") || msg.contains("factory"), + "expected encryption-factory error, got: {msg}" + ); + } } diff --git a/pom.xml b/pom.xml index 7660b1976c..685e474d59 100644 --- a/pom.xml +++ b/pom.xml @@ -41,9 +41,9 @@ under the License. contrib/example diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index ea81110470..94d7465938 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -554,6 +554,12 @@ case class CometExecRule(session: SparkSession) // We shouldn't transform Spark query plan if Comet is not loaded. if (!isCometLoaded(conf)) return plan + // Lazy contrib discovery. Mirrors the call in CometScanRule._apply -- either rule may + // be the first to run depending on which path of the plan tree fires first. 
load() is + // idempotent (AtomicBoolean gate), so the duplicate call is a no-op in steady state + // but makes each rule self-contained instead of relying on CometScanRule running first. + CometExtensionRegistry.load() + // Comet does not support structured streaming. Fall back to Spark for any plan that // belongs to a streaming query (detected via StreamSourceAwareSparkPlan.getStream). if (ShimCometStreaming.isStreamingPlan(plan)) return plan diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index d0f98cd189..8aa4e4bd5a 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -131,27 +131,42 @@ case class CometScanRule(session: SparkSession) // contribs' Catalyst wrappers (Delta's DV filter, etc.) are load-bearing and stripping // them turns into a correctness bug. Leave the plan tree as Spark wrote it. // - // Corruption guard: snapshot scan classes before each extension's pass and after; if - // a non-matching scan's class identity changed, log a warning naming the extension. + // Corruption guard: snapshot every FileSourceScanExec the extension does NOT claim + // before the pass, and verify each one is still present (by reference) afterwards. // Contribs' `preTransform` MUST only rewrite scans they recognise; this guard catches - // the common violation early. Light overhead (one collect per extension); only fires - // a warning when the contract is broken. + // the most dangerous violation (a contrib stripping or substituting an unrelated + // format's scan) regardless of whether the replacement keeps the same SparkPlan + // class. Light overhead (one collect per extension + one identity-Set check); only + // fires a warning when the contract is broken. 
val prepped = if (!CometConf.COMET_NATIVE_SCAN_ENABLED.get(conf)) { plan } else { CometExtensionRegistry.scanExtensions.foldLeft(plan) { (p, ext) => - val before = p.collect { case s: FileSourceScanExec => s } + val unclaimedBefore = p.collect { + case s: FileSourceScanExec if !ext.matchesV1(s.relation) => s + } val after = ext.preTransform(p, session) - val afterScans = after.collect { case s: FileSourceScanExec => s } - if (before.size == afterScans.size) { - before.zip(afterScans).foreach { case (b, a) => - if ((b ne a) && b.getClass == a.getClass && !ext.matchesV1(b.relation)) { + if (unclaimedBefore.nonEmpty) { + // Identity-equality check (reference compare) -- detects removal or + // substitution of a scan the extension doesn't own, including replacements + // whose SparkPlan class differs from the original. Plan-tree reordering is + // tolerated (we don't care WHERE the scan ended up, only that it still + // exists in the tree). + val survivors = scala.collection.mutable.Set.empty[FileSourceScanExec] + after.foreach { + case s: FileSourceScanExec => survivors += s + case _ => + } + unclaimedBefore.foreach { b => + if (!survivors.exists(_ eq b)) { logWarning( - s"CometScanRuleExtension '${ext.name}'.preTransform replaced a " + - s"FileSourceScanExec it does not claim (matchesV1=false). This is a " + - s"contract violation -- preTransform must only rewrite scans the " + - s"extension recognises. See CometScanRuleExtension.preTransform doc.") + s"CometScanRuleExtension '${ext.name}'.preTransform removed or " + + s"replaced a FileSourceScanExec it does not claim " + + s"(matchesV1=false on its relation, ${b.relation.fileFormat}). " + + s"This is a contract violation -- preTransform must only rewrite " + + s"scans the extension recognises. 
See " + + s"CometScanRuleExtension.preTransform doc.") } } } diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index b262d80785..cf9ba25525 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -35,8 +35,9 @@ import org.apache.spark.internal.Logging * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource on the Comet * classloader. Subsequent calls are no-ops. * - * `CometSparkSessionExtensions.apply` calls `load()` during Comet extension installation (PR1.6) - * so contrib JARs are picked up automatically when present. + * `load()` is invoked lazily from `CometScanRule._apply` and `CometExecRule._apply` the first + * time either rule runs against a Comet-enabled session. Spark sessions that never enable Comet + * pay zero ServiceLoader cost. * * Failures to instantiate individual extensions are logged but do NOT fail Comet startup -- a * misconfigured contrib JAR shouldn't take down the whole Spark session. From e4e6e6c6c8d9910bd63220fe25edfc45175b3d8e Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 13:52:43 -0400 Subject: [PATCH 13/27] feat(contrib): third-pass review fixes (R-NEW-1/2, N1-N8) Regressions - R-NEW-1: CometScanRule preTransform corruption guard switched from mutable.Set[FileSourceScanExec] (Spark case-class equality) to a Vector with `_ eq b` lookup. Two value-equal-but-reference-distinct scans (e.g., self-join after AQE dedup) no longer trigger a false-positive warning. Cost stays O(K * (P + S)). - R-NEW-2: CometExtensionRegistry.load() now runs inside `synchronized` with explicit publication order (write @volatile fields, THEN flip `loaded`). The previous AtomicBoolean-only gate let Thread B observe `loaded=true` and read Seq.empty/Map.empty while Thread A was still loading. 
AQE concurrent rule application across sub-queries now sees consistent registry state. Polish - N1: Cost comment added to the preTransform guard fold. - N2: Guard comment notes V2 BatchScanExec is out of scope by design. - N3: ContribOp dispatcher now rejects empty `kind` with a dedicated error ("the JVM-side serde produced a malformed envelope") instead of the misleading "build core with `contrib-<kind>` feature" message. - N4: Payload-size guard comment corrected -- prost has already decoded the payload by the time we get here; the guard fences the contrib's plan() body, not the original allocation. - N5: Scope limitation documented on the encryption-asymmetry test -- catches swaps involving the encryption_enabled slot only; new bools must come with their own asymmetry witness. - N6: Production canary cfg switched to `not(any(...))` form with a MAINTENANCE comment listing the contract for future contrib features. - N7: resetForTesting visibility widened from `private[comet]` to public; docstring explains that contribs are not required to package under org.apache.comet.* and must still be able to reset between tests. - N8: ContribError::Display wildcard comment clarified -- the wildcard defends downstream Display-as-source consumers; inside the defining crate the match must be exhaustive anyway. Verified - cargo check default features: green. - cargo test -p datafusion-comet --lib -- execution::planner::contrib: 5 tests pass (added 1, was 4). - cargo test -p comet-contrib-spi -p comet-contrib-example: 10 tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) --- native/contrib-spi/src/lib.rs | 11 +++- native/core/src/execution/planner.rs | 22 +++++-- native/core/src/execution/planner/contrib.rs | 20 ++++-- .../apache/comet/rules/CometScanRule.scala | 24 +++++--- .../comet/spi/CometExtensionRegistry.scala | 61 ++++++++++++------- 5 files changed, 95 insertions(+), 43 deletions(-) diff --git a/native/contrib-spi/src/lib.rs b/native/contrib-spi/src/lib.rs index 02881a263a..f92c6dde93 100644 --- a/native/contrib-spi/src/lib.rs +++ b/native/contrib-spi/src/lib.rs @@ -262,9 +262,14 @@ impl std::fmt::Display for ContribError { ContribError::WrongChildCount { expected, actual } => { write!(f, "wrong child count: expected {expected}, got {actual}") } - // Wildcard for future variants added under #[non_exhaustive]. Use the Debug - // repr so the dispatcher's `format!("contrib planner ...: {e}")` carries a - // useful message rather than swallowing the variant. + // Defense for external callers reading `Display` after a future variant is + // added under #[non_exhaustive]: their `match` is non-exhaustive even with + // a wildcard, but our `Display` impl always falls through to the Debug repr + // so the dispatcher's `format!("contrib planner ...: {e}")` still produces + // something useful. (Note: inside this crate the wildcard is unreachable + // because #[non_exhaustive] is only enforced across crate boundaries -- + // adding a variant here will require an explicit arm anyway. The wildcard + // exists to keep downstream `Display`-as-source consumers working.) 
#[allow(unreachable_patterns)] other => write!(f, "{other:?}"), } diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 6e5e1785f8..f8848042f5 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1971,6 +1971,13 @@ impl PhysicalPlanner { lookup_contrib_planner_by_kind, CorePlannerContext, }; let kind = contrib_op.kind.as_str(); + if kind.is_empty() { + return Err(GeneralError( + "ContribOp.kind is empty -- the JVM-side serde produced a malformed \ + envelope (every contrib must set a stable kind string)" + .into(), + )); + } // Look up the planner first so a bogus kind produces the "not registered" // error rather than a misleading "payload too big" one (in case the kind @@ -1983,11 +1990,16 @@ impl PhysicalPlanner { )) })?; - // Payload-size guard. A malformed Spark-side serde could produce a - // multi-GB payload that the planner would happily allocate during proto - // decode. 16 MiB is comfortably above any plausible file-scan payload - // (Delta with 100k tasks weighs in around 3-4 MiB) and well below "we - // should be worried about heap pressure". + // Payload-size guard. By the time we get here prost has already decoded + // `contrib_op.payload` into a heap-allocated Vec, so this guard does + // NOT fence the proto-decode allocation itself. What it does fence: the + // contrib's plan() body from being invoked with an absurd payload, which + // typically does its own prost decode against contrib-private types -- + // potentially several more allocations. 16 MiB is comfortably above any + // plausible file-scan payload (Delta with 100k tasks weighs in around + // 3-4 MiB) and well below "we should be worried about heap pressure". + // Moving the check pre-decode would require a streaming Operator parser; + // not worth the complexity given typical payloads are <1 MiB. 
const MAX_CONTRIB_PAYLOAD_BYTES: usize = 16 * 1024 * 1024; if contrib_op.payload.len() > MAX_CONTRIB_PAYLOAD_BYTES { return Err(GeneralError(format!( diff --git a/native/core/src/execution/planner/contrib.rs b/native/core/src/execution/planner/contrib.rs index 1a3b742611..1cf9c15179 100644 --- a/native/core/src/execution/planner/contrib.rs +++ b/native/core/src/execution/planner/contrib.rs @@ -118,10 +118,14 @@ mod tests { /// Production-build assertion: when no contrib feature is enabled, the registry /// must be empty. Catches an accidental re-introduction of a contrib into core's - /// `default = [...]` feature set. Compiled out under `--features contrib-example` - /// (the test binary always links its crate's dependencies, so this assertion would - /// be wrong under that flag). - #[cfg(not(feature = "contrib-example"))] + /// `default = [...]` feature set. Compiled out under any active contrib feature + /// (the test binary always links its crate's dependencies, so the assertion would + /// be wrong under those flags). + /// + /// MAINTENANCE: when adding a new `contrib-` feature to `native/core/Cargo.toml`, + /// extend the `not(any(...))` predicate below with the new feature name so the + /// canary still compiles under that contrib's standalone CI matrix entry. + #[cfg(not(any(feature = "contrib-example")))] #[test] fn production_build_has_no_contrib_planners_registered() { // Direct read through the SPI's public API. This test is the canary for @@ -198,6 +202,14 @@ mod tests { // If a swap accidentally routed e.g. `use_field_id` into the encryption slot, the // "default" variant below would fail (because use_field_id is true here in the // params struct, and the swapped slot would now enable encryption). + // + // SCOPE: this test catches swaps that involve the `encryption_enabled` slot. + // Swaps among the other four bools (case_sensitive / return_null_... 
/ + // use_field_id / ignore_missing_field_id) are NOT caught -- the two witnesses + // below either set all four to true (witness #1) or all four to false + // (witness #2), so a permutation among them is invisible. Adding a new bool to + // ParquetDatasourceParams / init_datasource_exec should be accompanied by a new + // asymmetry witness that exercises THAT new flag. let session_ctx = Arc::new(SessionContext::new()); let planner = PhysicalPlanner::new(Arc::clone(&session_ctx), 0); let ctx = CorePlannerContext { planner: &planner }; diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 8aa4e4bd5a..14efeddeae 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -136,8 +136,15 @@ case class CometScanRule(session: SparkSession) // Contribs' `preTransform` MUST only rewrite scans they recognise; this guard catches // the most dangerous violation (a contrib stripping or substituting an unrelated // format's scan) regardless of whether the replacement keeps the same SparkPlan - // class. Light overhead (one collect per extension + one identity-Set check); only - // fires a warning when the contract is broken. + // class. Plan-tree reordering is tolerated -- we only care that the scan still + // exists in the tree, not where. + // + // Cost: O(K * (P + S)) where K = scanExtensions.size, P = plan node count, + // S = unclaimed-scan count. For typical K=1..3 and S small, this is negligible. + // + // V2 scope: V2 BatchScanExecs are NOT inspected. preTransform is documented V1-only + // (see CometScanRuleExtension.preTransform); V2 wrapper-stripping happens per-scan + // inside `transformV2` and doesn't have the same tree-level corruption surface. 
val prepped = if (!CometConf.COMET_NATIVE_SCAN_ENABLED.get(conf)) { plan @@ -148,12 +155,13 @@ case class CometScanRule(session: SparkSession) } val after = ext.preTransform(p, session) if (unclaimedBefore.nonEmpty) { - // Identity-equality check (reference compare) -- detects removal or - // substitution of a scan the extension doesn't own, including replacements - // whose SparkPlan class differs from the original. Plan-tree reordering is - // tolerated (we don't care WHERE the scan ended up, only that it still - // exists in the tree). - val survivors = scala.collection.mutable.Set.empty[FileSourceScanExec] + // IDENTITY semantics, NOT value-equality: Spark case classes (including + // FileSourceScanExec) compare equal when their fields match, so a self-join + // with two reads against the same table after AQE deduplication can produce + // two value-equal-but-reference-distinct scans. A standard mutable.Set would + // collapse them and we'd emit a false-positive warning. Use a Vector + + // `_ eq b` scan instead -- the survivor list is small in practice. + val survivors = scala.collection.mutable.ArrayBuffer.empty[FileSourceScanExec] after.foreach { case s: FileSourceScanExec => survivors += s case _ => diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index cf9ba25525..bd214159ef 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -52,28 +52,38 @@ object CometExtensionRegistry extends Logging { * Discover contrib extensions on the classpath. Idempotent. Safe to call from multiple threads * (only the first call performs discovery). 
*/ - def load(): Unit = { - if (loaded.compareAndSet(false, true)) { - scanExts = loadOne[CometScanRuleExtension]("CometScanRuleExtension") - serdeExts = loadOne[CometOperatorSerdeExtension]("CometOperatorSerdeExtension") - if (scanExts.nonEmpty || serdeExts.nonEmpty) { - logInfo( - s"Comet contrib extensions loaded: " + - s"scan=[${scanExts.map(_.name).mkString(", ")}], " + - s"serde=[${serdeExts.map(_.name).mkString(", ")}]") - detectDuplicateSerdeClasses(serdeExts) - } else { - // Positive signal that discovery ran. Some Spark deploy modes (Ivy `--packages`, - // isolated UDF classloaders) put Comet on a classloader that the TCCL fallback - // doesn't see; absent extensions go silent without this line. - logInfo( - "Comet contrib extensions: none discovered on classpath " + - "(no META-INF/services entries for CometScanRuleExtension or " + - "CometOperatorSerdeExtension)") - } - // Build the merged exec map once at load time. CometExecRule reads it on every - // operator transform; rebuilding per-call would be wasteful. - mergedSerdesCache = serdeExts.flatMap(_.serdes).toMap + def load(): Unit = synchronized { + // `synchronized` (not just compareAndSet) so that concurrent callers wait for the + // first thread's writes to `scanExts` / `serdeExts` / `mergedSerdesCache` to publish + // before they return. The previous AtomicBoolean-only gate allowed thread B to + // observe `loaded=true` and read `Seq.empty` while thread A was still mid-loadOne. + // CometScanRule._apply and CometExecRule._apply both call this on first invocation, + // and AQE can run them concurrently across sub-queries, so the race is reachable. 
+ if (loaded.get()) return + val newScanExts = loadOne[CometScanRuleExtension]("CometScanRuleExtension") + val newSerdeExts = loadOne[CometOperatorSerdeExtension]("CometOperatorSerdeExtension") + val newMerged = newSerdeExts.flatMap(_.serdes).toMap + // Publish the @volatile fields BEFORE flipping `loaded` so other threads either see + // the empty defaults (and may re-enter -- benign, blocked by the monitor) or the + // fully-populated state (and may skip -- also benign). + scanExts = newScanExts + serdeExts = newSerdeExts + mergedSerdesCache = newMerged + loaded.set(true) + if (newScanExts.nonEmpty || newSerdeExts.nonEmpty) { + logInfo( + s"Comet contrib extensions loaded: " + + s"scan=[${newScanExts.map(_.name).mkString(", ")}], " + + s"serde=[${newSerdeExts.map(_.name).mkString(", ")}]") + detectDuplicateSerdeClasses(newSerdeExts) + } else { + // Positive signal that discovery ran. Some Spark deploy modes (Ivy `--packages`, + // isolated UDF classloaders) put Comet on a classloader that the TCCL fallback + // doesn't see; absent extensions go silent without this line. + logInfo( + "Comet contrib extensions: none discovered on classpath " + + "(no META-INF/services entries for CometScanRuleExtension or " + + "CometOperatorSerdeExtension)") } } @@ -129,8 +139,13 @@ object CometExtensionRegistry extends Logging { /** * Test-only: reset the registry to the empty state. Lets unit tests re-run discovery with a * different classpath / overridden services. Not for production use. + * + * Visibility is `public` (rather than `private[comet]`) because contribs are not required to + * be packaged under `org.apache.comet.*`; a contrib living under e.g. `io.delta.comet.contrib` + * must still be able to reset between tests. The method's name carries the "test-only" + * contract by convention. 
*/ - private[comet] def resetForTesting(): Unit = { + def resetForTesting(): Unit = { loaded.set(false) scanExts = Seq.empty serdeExts = Seq.empty From 6652963cbd6c88de9db4218c96514eff1b2a85f6 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 14:02:28 -0400 Subject: [PATCH 14/27] feat(contrib): fourth-pass polish (F1-F6) - F1: CometScanRule preTransform corruption guard swaps ArrayBuffer + `_ eq` for java.util.IdentityHashMap, making survivor lookup O(1) and the documented O(K * (P + S)) cost accurate. - F2: CometExtensionRegistry.resetForTesting() now `synchronized`. Without it a concurrent load() could observe torn state (loaded=false but the fields still populated, or vice versa), causing the next load() to short-circuit and miss re-discovery. - F3: Trimmed overstated comment in load()'s no-extensions branch. - F4: ContribOp dispatcher rejects whitespace-only `kind` (not just empty); displays the raw `kind` repr in the error message. - F5: ContribOp proto reserves tags 3-9 for additive evolution (payload_format_version, compression, contrib_version, etc.) so evolving contribs can't accidentally reuse one. - F6: Contributor guide documents the 16 MiB ContribOp.payload cap and notes contribs with a legitimate need for a higher ceiling should file an issue rather than work around it. Also adds a "MUST NOT call load() from a class's static initializer" note to the load() docstring (Scala monitors are reentrant so it wouldn't deadlock but would shadow the in-flight publication). Verified: cargo check green, 21 core planner tests pass, 10 SPI + example tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../contributor-guide/contrib-extensions.md | 11 +++++++++++ native/core/src/execution/planner.rs | 12 ++++++------ native/proto/src/proto/operator.proto | 3 +++ .../org/apache/comet/rules/CometScanRule.scala | 14 ++++++++------ .../comet/spi/CometExtensionRegistry.scala | 16 ++++++++++++---- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index 7061069f73..9330b9da4d 100644 --- a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -280,6 +280,17 @@ Lookups happen once per `ContribOp` plan call; writes happen only during library The implementation may switch to a lock-free primitive (`ArcSwap`) in a future release if profiling shows the read path matters; the public API stays unchanged either way. +## Payload size cap + +The native dispatcher enforces a hard ceiling of **16 MiB** on `ContribOp.payload`. A +malformed JVM-side serde (or one that accidentally accumulates state across plan calls) +producing a larger payload is rejected with a clear error message before the contrib's +`plan()` runs. The cap is intentionally above any plausible file-scan payload (Delta +with ~100k tasks weighs in around 3–4 MiB) and well below "heap pressure" territory; +the value is hardcoded in `native/core/src/execution/planner.rs`. If your contrib has +a legitimate need for a larger payload, file an issue with the size you need and the +use case -- the cap is a guardrail, not a feature. 
+ ## Testing `contrib/example/`'s test suite demonstrates the recommended pattern: diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index f8848042f5..b3b18e9f75 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -1971,12 +1971,12 @@ impl PhysicalPlanner { lookup_contrib_planner_by_kind, CorePlannerContext, }; let kind = contrib_op.kind.as_str(); - if kind.is_empty() { - return Err(GeneralError( - "ContribOp.kind is empty -- the JVM-side serde produced a malformed \ - envelope (every contrib must set a stable kind string)" - .into(), - )); + if kind.trim().is_empty() { + return Err(GeneralError(format!( + "ContribOp.kind={kind:?} is empty or whitespace -- the JVM-side \ + serde produced a malformed envelope (every contrib must set a \ + stable kind string)" + ))); } // Look up the planner first so a bogus kind produces the "not registered" diff --git a/native/proto/src/proto/operator.proto b/native/proto/src/proto/operator.proto index da6dad9f74..9e1f1b1767 100644 --- a/native/proto/src/proto/operator.proto +++ b/native/proto/src/proto/operator.proto @@ -76,6 +76,9 @@ message ContribOp { string kind = 1; // Contrib-private payload bytes. Format defined by the contrib's own proto schema. bytes payload = 2; + // Reserve tags for future additive evolution (e.g. payload_format_version, compression, + // contrib_version) without risking accidental tag reuse by an evolving contrib. + reserved 3 to 9; } message SparkPartitionedFile { diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 14efeddeae..52621bdc8a 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -140,7 +140,8 @@ case class CometScanRule(session: SparkSession) // exists in the tree, not where. 
// // Cost: O(K * (P + S)) where K = scanExtensions.size, P = plan node count, - // S = unclaimed-scan count. For typical K=1..3 and S small, this is negligible. + // S = scan count. IdentityHashMap gives O(1) survivor lookup; the dominant term + // is the tree traversals. For typical K=1..3 this is negligible. // // V2 scope: V2 BatchScanExecs are NOT inspected. preTransform is documented V1-only // (see CometScanRuleExtension.preTransform); V2 wrapper-stripping happens per-scan @@ -159,15 +160,16 @@ case class CometScanRule(session: SparkSession) // FileSourceScanExec) compare equal when their fields match, so a self-join // with two reads against the same table after AQE deduplication can produce // two value-equal-but-reference-distinct scans. A standard mutable.Set would - // collapse them and we'd emit a false-positive warning. Use a Vector + - // `_ eq b` scan instead -- the survivor list is small in practice. - val survivors = scala.collection.mutable.ArrayBuffer.empty[FileSourceScanExec] + // collapse them and we'd emit a false-positive warning. IdentityHashMap + // gives us O(1) lookup with reference-equality semantics. 
+ val survivors = + new java.util.IdentityHashMap[FileSourceScanExec, java.lang.Boolean]() after.foreach { - case s: FileSourceScanExec => survivors += s + case s: FileSourceScanExec => survivors.put(s, java.lang.Boolean.TRUE) case _ => } unclaimedBefore.foreach { b => - if (!survivors.exists(_ eq b)) { + if (!survivors.containsKey(b)) { logWarning( s"CometScanRuleExtension '${ext.name}'.preTransform removed or " + s"replaced a FileSourceScanExec it does not claim " + diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index bd214159ef..66a8861e59 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -59,6 +59,11 @@ object CometExtensionRegistry extends Logging { // observe `loaded=true` and read `Seq.empty` while thread A was still mid-loadOne. // CometScanRule._apply and CometExecRule._apply both call this on first invocation, // and AQE can run them concurrently across sub-queries, so the race is reachable. + // + // Contribs MUST NOT call `load()` from a `#[ctor]`-equivalent (JVM-side: a class's + // static initializer or trait's `object` init) -- Scala monitors are reentrant so + // re-entry won't deadlock, but the inner call would observe the partially-built + // state and re-trigger `loadOne`, shadowing the in-flight publication. if (loaded.get()) return val newScanExts = loadOne[CometScanRuleExtension]("CometScanRuleExtension") val newSerdeExts = loadOne[CometOperatorSerdeExtension]("CometOperatorSerdeExtension") @@ -77,9 +82,9 @@ object CometExtensionRegistry extends Logging { s"serde=[${newSerdeExts.map(_.name).mkString(", ")}]") detectDuplicateSerdeClasses(newSerdeExts) } else { - // Positive signal that discovery ran. 
Some Spark deploy modes (Ivy `--packages`, - // isolated UDF classloaders) put Comet on a classloader that the TCCL fallback - // doesn't see; absent extensions go silent without this line. + // Positive signal that discovery ran. Without this line a user with a misconfigured + // contrib JAR (missing META-INF/services, or the JAR not on any classloader Comet + // can see) gets no diagnostic and silently loses contrib functionality. logInfo( "Comet contrib extensions: none discovered on classpath " + "(no META-INF/services entries for CometScanRuleExtension or " + @@ -145,7 +150,10 @@ object CometExtensionRegistry extends Logging { * must still be able to reset between tests. The method's name carries the "test-only" * contract by convention. */ - def resetForTesting(): Unit = { + def resetForTesting(): Unit = synchronized { + // synchronized so concurrent `load()` callers don't observe torn state -- e.g. + // `loaded=false` with `scanExts` still populated, which would let a subsequent + // `load()` short-circuit on the AtomicBoolean and never re-discover. loaded.set(false) scanExts = Seq.empty serdeExts = Seq.empty From 91c40e0accaa86eb0a8d4463acc5afe0fc2d2dcc Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 14:27:32 -0400 Subject: [PATCH 15/27] docs(contrib): comprehensive contributor-guide rewrite Addresses every gap surfaced by the doc-completeness validation pass: Missing topics now covered - JVM-side proto compilation (protoc-jar-maven-plugin block + shaded protobuf-java rationale -- contribs MUST inherit the parent pom's com.google.protobuf -> ${comet.shade.packageName}.protobuf relocation). - Worked Scala snippet for building a ContribOp envelope from a serde, including the Java setContribOp(...) name (vs Rust op_struct). - CometOperatorSerde[T <: SparkPlan] trait shape: enabledConfig, requiresNativeChildren, getSupportLevel, convert, createExec. 
- Full walked-through plan() body exercising every ContribPlannerContext method (convert_spark_schema, build_physical_expr, prepare_object_store, build_parquet_datasource_exec) -- mirrors what Delta/Iceberg ports do. - ServiceLoader diagnostics: the INFO "none discovered" line, the WARN per-failed-entry line, which logger to enable for debugging. - Classloader-order story (lazy load post-`--jars` so order doesn't matter). - CometExtensionRegistry.load() MUST NOT be called from static initializers (reentrancy shadows in-flight publication). - Logging conventions (eprintln in #[ctor], log::* with target: elsewhere, do NOT re-prefix errors with the contrib's kind). - Error message convention (dispatcher already prefixes with kind). - Version pinning for out-of-tree contribs (explicit Comet patch version, not ${project.version}). - Multi-Spark-version shimming: pick a spark.version.short profile, mirror Comet's per-profile artifact ID pattern. - End-to-end Rust+Scala round-trip test pattern with concrete recipe. - Cargo feature canary maintenance note (when adding contrib-, extend the not(any(...)) cfg in production_build_has_no_contrib_planners_registered). Reorganised so a new contrib author finds things in the right order: - "Required files" + "Wiring into core" + Cargo feature gate moved BEFORE the SPI deep-dive. - Prerequisites + .gitignore + workspace-placement constraint called out upfront. Inaccuracies fixed - Operator proto field name (op_struct in Rust, setContribOp on Java Builder -- explained as a code-generator language difference). - "open for inheritance" qualifier sharpened: additive default-implemented methods are a minor bump; abstract-method additions are breaking. - out_dir = "src/generated" pattern justified as a deliberate deviation from idiomatic prost (stable include! path for editor tooling). - contrib-example,contrib-delta example reworded so it doesn't reference a feature that doesn't exist in-tree yet. - PR1's CI -> Comet's CI. 
- MAX_CONTRIB_PAYLOAD_BYTES named so readers can rg for it. Nit cleanups - _clear_for_test added to the SPI table with explicit "test escape hatch only" caveat alongside ScopedContribPlannerRegistration. - ContribError convention paragraph cross-linked from the SPI table row. The result is a 758-line single-document reference that a contrib author can follow end-to-end without reading core's source. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../contributor-guide/contrib-extensions.md | 775 ++++++++++++++---- 1 file changed, 607 insertions(+), 168 deletions(-) diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index 9330b9da4d..c5183f8730 100644 --- a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -20,36 +20,27 @@ under the License. # Authoring a Comet contrib extension A Comet *contrib* is a self-contained extension that lives alongside core but ships -independently. Contribs add support for a specific table format or operator class without -core having to know about them at build time. The first contrib in the tree is -[`contrib/example/`](https://github.com/apache/datafusion-comet/tree/main/contrib/example); -read it top-to-bottom as the worked reference, then come back here for the architectural -context. +independently. Contribs add support for a specific table format or operator class +without core having to know about them at build time. -This document covers how the SPI is shaped, which integration points are available, and -the concrete files a new contrib has to ship. - -## SPI stability - -The contrib SPI surface is currently **alpha** — minor versions may carry breaking -changes during the early-adopter period. Public types in `comet-contrib-spi` and the -Scala SPI traits are marked `#[non_exhaustive]` (or open for inheritance) so additive -changes are minor bumps. 
Removals and renames will be called out in release notes. Lock -your contrib to a specific Comet patch version until the SPI is declared stable in a -later release. +The first contrib in the tree is +[`contrib/example/`](https://github.com/apache/datafusion-comet/tree/main/contrib/example) — +read it top-to-bottom as the worked reference. This guide adds the architectural context +and walks through every integration point that the example does not exercise. ## Architecture at a glance Each contrib has two halves that ship as separate artifacts but are wired together at build time: -- **JVM half** — a separate Maven JAR (`comet-contrib--spark${spark.version.short}_${scala.binary.version}`) - containing Scala / Java extension classes. Discovered at runtime via - `java.util.ServiceLoader` from the contrib JAR's `META-INF/services/` entries. - +- **JVM half** — a separate Maven JAR + (`comet-contrib--spark${spark.version.short}_${scala.binary.version}`) containing + Scala/Java extension classes plus contrib-private generated proto classes. Discovered + at runtime via `java.util.ServiceLoader` from the contrib JAR's `META-INF/services/` + entries. - **Native half** — a Rust `rlib` crate (NOT `cdylib`) that is **linked INTO core's - `libcomet`** at build time when the matching Cargo feature on core is enabled. There is - exactly one Comet native library at runtime; the contrib's `#[ctor]` registers its + `libcomet`** at build time when the matching Cargo feature on core is enabled. There + is exactly one Comet native library at runtime; the contrib's `#[ctor]` registers its operator planners during library load. The wire format between JVM and native uses a single generic envelope on the operator @@ -57,165 +48,521 @@ proto, `ContribOp { kind, payload }`. Core's planner dispatches by `kind`; the c native crate registers planners against the same `kind` string the contrib's JVM code writes into the proto. 
-## SPI surface +## Required files (mirror `contrib/example/` exactly) -### JVM side: `org.apache.comet.spi` +``` +contrib// + pom.xml ← Maven module + src/main/scala/org/apache/comet/contrib// + .scala ← CometScanRuleExtension / CometOperatorSerdeExtension impl + src/main/resources/META-INF/services/ + org.apache.comet.spi.CometScanRuleExtension ← one line per extension class + org.apache.comet.spi.CometOperatorSerdeExtension ← (only if you implement serdes) + src/test/scala/org/apache/comet/contrib// + Suite.scala ← integration test + native/ + Cargo.toml ← rlib crate, workspace = "../../../native" + build.rs ← runs prost-build over your proto schema + src/lib.rs ← ContribOperatorPlanner impl + #[ctor] registration + src/proto/.proto ← contrib-private proto schema, your own package + src/generated/ ← (gitignored) prost-build output +``` -| Trait / Object | Purpose | -|---|---| -| `CometScanRuleExtension` | Intercept scan-tree transformation. Override `preTransform` for tree-level rewrites (V1 only — see below); `matchesV1` / `transformV1` for V1 `FileSourceScanExec`; `matchesV2` / `transformV2` for V2 `BatchScanExec`. Dispatch iterates registered extensions in order; the first one whose `match*` returns `true` AND `transform*` returns `Some` wins. `None` means "decline this instance" and dispatch continues to the next matching extension before falling back to core. | -| `CometOperatorSerdeExtension` | Contribute additional `SparkPlan` class → `CometOperatorSerde` mappings to `CometExecRule`. The merged map is computed once at registry load time. Used when the contrib has its own physical operator (e.g., a contrib-specific scan exec) that needs native serialization. Duplicate class keys across contribs are logged as a warning at load. | -| `CometExtensionRegistry` | Process-wide singleton. 
`load()` is invoked lazily from `CometScanRule._apply` / `CometExecRule.apply` the first time Comet runs against a Comet-enabled session — so Spark sessions that never enable Comet pay zero ServiceLoader cost. Subsequent calls are no-ops. Test-only `resetForTesting()` exists for unit tests that need a clean registry. | +Plus three edits to existing files (collected under "Wiring into core", below). + +### Prerequisites + +You need: + +- The same toolchain Comet's main build uses: JDK 11+ (Maven build), Rust stable, `protoc` + (pulled in automatically by `protoc-jar-maven-plugin` and `prost-build`). +- The contrib's `` decided in advance — it becomes a Cargo feature flag + (`contrib-`), an artifact ID, a JNI symbol prefix if your contrib calls into its + own Rust, and a `kind` string component for every `ContribOp`. Choose a short, stable + identifier; renames are breaking. + +### `.gitignore` + +The generated proto outputs are checked in nowhere: + +- `contrib//native/src/generated/` — Rust prost output. The example contrib's + `.gitignore` entry is the template. +- `contrib//target/` — Maven build output (inherits from the repo-root `.gitignore`). + +### Workspace placement constraint + +`contrib//native/Cargo.toml` uses `workspace = "../../../native"`. This relative +path assumes contribs live exactly at `/contrib//native`. Deeper nesting +breaks the workspace lookup; place the contrib at the documented depth. + +## Wiring into core + +Three single-line edits to existing files: + +1. **Root `pom.xml`** — add `contrib/` under the existing + `` block so `mvn install` builds the contrib JAR. +2. **`native/Cargo.toml`** — add `../contrib//native` to the workspace `members` + list (NOT `default-members` — contribs are consumed via core's feature flags). +3. 
**`native/core/Cargo.toml`** — add a `contrib-` feature gate and a matching + optional `dep:` entry, mirroring the `contrib-example` lines: + + ```toml + [dependencies] + comet-contrib- = { path = "../../contrib//native", optional = true } + + [features] + contrib- = ["dep:comet-contrib-"] + ``` -### `preTransform` is V1-only and disabled when scan is off + Do **not** add the feature to `default = [...]`. Production builds carry zero contrib + surface by design; users opt in explicitly. (CI matrix builds should add the feature.) -`CometScanRule` folds every registered extension's `preTransform` over the plan tree -once, before per-scan dispatch begins. The rewritten subtree is what `transformV1` -receives. `transformV2` does **not** receive a plan reference — V2 contribs that need -wrapper-stripping must do that work inside `transformV2` against `scanExec.scan` and -`scanExec.children` directly. +4. **`native/core/src/lib.rs`** — add the matching feature-gated `extern crate` so the + contrib's `#[ctor]` is linked in when the feature is on: -The fold is skipped entirely when `spark.comet.scan.enabled=false`. A contrib's own -Catalyst wrappers (Delta's DV filter, etc.) become load-bearing when Comet's scan is -disabled; stripping them turns into a correctness bug. + ```rust + #[cfg(feature = "contrib-")] + extern crate comet_contrib_; + ``` -`CometScanRule` also logs a warning when a `FileSourceScanExec` is replaced by an -extension whose `matchesV1` returns false against the original scan's relation — a -contrib that trips this warning is rewriting scans it doesn't recognise and may corrupt -other formats' plans. Narrow your pattern match. +## Cargo feature gate + +```bash +# Default release build: zero contrib surface. registered_contrib_kinds() is empty. 
+cargo build + +# Enable a specific contrib explicitly: +cargo build --features contrib-example +# Multiple at once: +cargo build --features 'contrib-example contrib-<name>' + +# Verify the slim build path: +cargo build --no-default-features +``` + +A core test under `#[cfg(not(any(feature = "contrib-example", ...)))]` asserts +`registered_contrib_kinds()` is empty in the slim build. When you add a new +`contrib-<name>` feature, **extend that test's `cfg` predicate** (see +`native/core/src/execution/planner/contrib.rs`'s `production_build_has_no_contrib_planners_registered`) +so the canary still compiles on your contrib's CI row. + +The JVM side is **always** conditional: the contrib JAR is its own Maven artifact, and +Spark only loads it when it's on the classpath. Even with the Cargo feature on, a user +who doesn't add the contrib JAR sees no behaviour change — the contrib's native planner +sits dormant in the registry, waiting for a JVM serde that never calls it. -### Convention: define your own SparkPlan subclass for serde dispatch +## SPI stability -`CometExecRule` dispatches by **class identity** (`op.getClass`) when matching an -operator to its serde. Contribs that need a custom executor (e.g., a contrib-specific -scan exec carrying contrib-private state) should define a dedicated subclass: +The contrib SPI is currently **alpha** — minor Comet versions may carry breaking +changes during the early-adopter period. Concretely: + +- `comet-contrib-spi` is workspace-versioned alongside core. A contrib built against + Comet `0.17.x` is **not** guaranteed to work with Comet `0.18.x` at runtime; the SPI + traits may evolve. Pin your contrib's `` and `comet-spark` dependency to a + specific Comet patch version. +- `ParquetDatasourceParams` and `ContribError` are `#[non_exhaustive]` so additive + changes (new fields / variants) are minor bumps, not breaks.
Use + `ParquetDatasourceParams::new(...)` + `with_*` setters rather than struct-literal + syntax; consumers of `ContribError` must include a wildcard match arm. +- Scala SPI traits add new methods with default implementations (default `false` / + `None`). Override only the methods you need; an additive method change is a minor + bump. Abstract-method additions are breaking and called out in release notes. +- Releases that change the SPI in a breaking way will say so explicitly. + +## SPI surface + +### JVM side: `org.apache.comet.spi` + +| Trait / Object | Purpose | +|---|---| +| `CometScanRuleExtension` | Intercept scan-tree transformation. See subsections below. | +| `CometOperatorSerdeExtension` | Contribute additional `SparkPlan` class → `CometOperatorSerde` mappings to `CometExecRule`. See subsections below. | +| `CometExtensionRegistry` | Process-wide singleton. `load()` is invoked lazily from `CometScanRule._apply` / `CometExecRule._apply` the first time Comet runs against a Comet-enabled session — sessions that never enable Comet pay zero ServiceLoader cost. Subsequent calls are no-ops. `resetForTesting()` (public) clears the registry between tests. | + +#### `CometScanRuleExtension` + +- `name: String` — human label used in logs and warnings. +- `preTransform(plan, session): SparkPlan` (default identity) — tree-level pre-pass run + once per plan before per-scan dispatch. **V1-only.** Use it to undo wrapper rewrites + applied by your format's own Catalyst strategy (Delta's `PreprocessTableWithDVs` is + the canonical case). Skipped entirely when `spark.comet.scan.enabled=false` — your + wrappers become load-bearing in that mode and stripping them would be a correctness + bug. `CometScanRule` logs a warning when an extension replaces a `FileSourceScanExec` + whose relation it does not claim; this catches accidental cross-format corruption. 
+- `matchesV1(relation): Boolean` (default `false`) / `transformV1(plan, scanExec, session): Option[SparkPlan]` + — V1 dispatch. Make `matchesV1` cheap (typically a file-format class probe). +- `matchesV2(scanExec): Boolean` (default `false`) / `transformV2(scanExec, session): Option[SparkPlan]` + — V2 dispatch. Unlike V1, `transformV2` does **not** receive a plan-tree reference; + any wrapper-stripping a V2 contrib needs must happen against `scanExec.scan` / + `scanExec.children` directly. + +Dispatch iterates registered extensions in registration order; the first one whose +`match*` returns `true` AND `transform*` returns `Some` wins. `None` from +`transform*` is treated as "decline this instance" and dispatch continues to the next +matching extension before falling back to core. + +Pass state from `preTransform` to `transformV1` via Spark's `TreeNodeTag` mechanism — +do NOT use external mutable state, which leaks across plan invocations. + +#### `CometOperatorSerdeExtension` ```scala -case class CometMyFormatScanExec(...) extends CometScanExec(..., SCAN_NATIVE_DELTA_COMPAT) +trait CometOperatorSerdeExtension { + def name: String + def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] +} ``` -and register the serde keyed on the new class: +Contribs that need a custom physical operator (e.g., a contrib-specific scan exec +carrying contrib-private state) define their own `SparkPlan` subclass and register a +serde keyed on the new class: ```scala +case class CometMyFormatScanExec(...) extends CometNativeExec { /* ... 
*/ } + class MyFormatSerdeExtension extends CometOperatorSerdeExtension { + override def name: String = "myformat" override def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] = Map(classOf[CometMyFormatScanExec] -> CometMyFormatScanSerde) } ``` -Avoid relying on the legacy `scanImpl: String` tag pattern on a generic `CometScanExec`; -that approach has no analogue in the SPI's class-based dispatch and would require core -changes to support. +The merged map across all extensions is computed once at registry load time; +`CometExecRule` consults it via `.get(op.getClass)`. Duplicate class keys across +contribs are logged as a warning at load — the convention is **one contrib defines a +class, that contrib owns its serde**. + +Avoid relying on the legacy `scanImpl: String` tag pattern on a generic `CometScanExec` +— the SPI dispatches by class, not by tag. + +##### `CometOperatorSerde[T <: SparkPlan]` contract + +The serde itself lives in `org.apache.comet.serde.CometOperatorSerde` (not in the `spi` +package). Implement four members: + +```scala +class CometMyFormatScanSerde extends CometOperatorSerde[CometMyFormatScanExec] { + override def enabledConfig: Option[ConfigEntry[Boolean]] = + Some(CometConf.COMET_MYFORMAT_ENABLED) + + override def requiresNativeChildren: Boolean = false + + override def getSupportLevel(op: CometMyFormatScanExec): SupportLevel = + Compatible(None) + + override def convert( + op: CometMyFormatScanExec, + builder: Operator.Builder, + childOp: Operator*): Option[Operator] = { + // Build your contrib-private payload message and wrap in ContribOp. + // See "Building a ContribOp envelope" below. + Some(builder + .setContribOp(ContribOp.newBuilder() + .setKind("myformat-scan") + .setPayload(myPayload.toByteString)) + .build()) + } + + override def createExec(nativeOp: Operator, op: CometMyFormatScanExec): CometNativeExec = + new CometMyFormatScanExec(nativeOp, op.output, op.child, /* ... 
*/) +} +``` + +`convert` MUST return `Some(builder.setContribOp(...).build())` for the dispatch to +reach your native planner; returning `None` falls the operator back to Spark. ### Native side: `comet-contrib-spi` crate | Item | Purpose | |---|---| -| `trait ContribOperatorPlanner` | Implemented by the contrib's native crate. The `plan(ctx, payload, children) -> Arc<dyn ExecutionPlan>` method receives a `&dyn ContribPlannerContext` (handle to core's planner services), the contrib-private payload bytes from the `ContribOp` envelope, and the already-built native children. | -| `trait ContribPlannerContext` | Implemented by core. Exposes the parquet exec builder (`build_parquet_datasource_exec`), expression planner (`build_physical_expr`), schema conversion (`convert_spark_schema`), object-store registration (`prepare_object_store`), and the `SessionContext` itself. Contribs reach into core through this trait rather than depending on `datafusion-comet` directly. | -| `struct ParquetDatasourceParams` | `#[non_exhaustive]` argument bundle for the parquet exec builder. Construct via `ParquetDatasourceParams::new(required_schema, object_store_url, file_groups)` and chain `with_*` setters. Adding fields in future is a minor SemVer bump. | +| `trait ContribOperatorPlanner` | Implemented by the contrib's native crate. `plan(ctx, payload, children) -> Arc<dyn ExecutionPlan>` receives a `&dyn ContribPlannerContext` (handle to core's planner services), the contrib-private payload bytes, and the already-built native children. | +| `trait ContribPlannerContext` | Implemented by core. Exposes the parquet exec builder, expression planner, schema conversion, object-store registration, and the `SessionContext` itself. Contribs reach into core through this trait rather than depending on `datafusion-comet` directly. | +| `struct ParquetDatasourceParams` | `#[non_exhaustive]` argument bundle for the parquet exec builder. 
Construct via `ParquetDatasourceParams::new(required_schema, object_store_url, file_groups)` and chain `with_*` setters. | | `register_contrib_planner(kind, planner)` | Process-wide registry. Called from the contrib's `#[ctor::ctor]` at library load. | | `lookup_contrib_planner_by_kind(kind)` | Used by core's planner; contribs rarely call directly. | -| `ContribError` | `#[non_exhaustive]` minimal error type. Core converts to its own `ExecutionError` at the dispatch site. Variants: `Plan(String)`, `BadPayload(String)`, `WrongChildCount { expected: String, actual: usize }`. Pattern matches MUST include a wildcard arm so future variants don't break consumers. | -| `ScopedContribPlannerRegistration` | `#[cfg(any(test, feature = "test-utils"))]` RAII guard for tests that register a planner without polluting the global registry. Drop restores the previous planner. Pair with `#[serial_test::serial]` if your test asserts on `registered_contrib_kinds()`. | +| `registered_contrib_kinds()` | Diagnostic snapshot of registered kinds. | +| `ContribError` | `#[non_exhaustive]` error type. Variants: `Plan(String)`, `BadPayload(String)`, `WrongChildCount { expected: String, actual: usize }`. Pattern matches MUST include a wildcard arm. | +| `ScopedContribPlannerRegistration` | (`#[cfg(any(test, feature = "test-utils"))]`) RAII guard that registers a planner for the lifetime of the guard and removes it on drop. Use in unit tests that exercise dispatch without polluting the global registry. | +| `_clear_for_test()` | (`#[cfg(any(test, feature = "test-utils"))]`) Wipes the registry entirely. **Test escape hatch only** — using it in parallel with other registry consumers is unsafe; prefer `ScopedContribPlannerRegistration`. | -The SPI crate is intentionally a thin leaf: it depends only on `datafusion`, -`datafusion-comet-proto`, and `object_store`. 
This is what breaks the would-be cyclic -dependency (core links contribs via Cargo feature flags; contribs need the SPI types — -both depend on a third leaf crate instead of each other). No core-typed values cross -the trait boundary. +The SPI crate depends only on `datafusion`, `datafusion-comet-proto`, and +`object_store`. Core links contribs via Cargo feature flags; contribs depend on the SPI +crate; nothing depends back on core from a contrib — the dependency graph is a DAG. -### Why `ContribOperatorPlanner` is `Send + Sync` but `ContribPlannerContext` isn't +#### Why `ContribOperatorPlanner` is `Send + Sync` but `ContribPlannerContext` isn't The planner trait is stored in an `Arc` inside a process-wide registry shared across threads, so `Send + Sync` is load-bearing. The context is short-lived: a `&dyn` reference passed for the duration of one synchronous `plan()` call, so the bound would -only restrict implementations without adding safety. Notably, core's `PhysicalPlanner` -carries JNI handles that aren't `Send`; requiring `Send` on the context would force an -awkward `Arc>` dance for no gain. +only restrict implementations without adding safety. Core's `PhysicalPlanner` carries +JNI handles that aren't `Send`; requiring it would force an `Arc>` dance +for no gain. Contribs that want to spawn async work during `plan()` must capture only the `Arc` (which **is** `Send + Sync`) before crossing a thread boundary — not the `&dyn ContribPlannerContext` itself. -### Why `payload: &[u8]` instead of `Bytes` +#### Why `payload: &[u8]` instead of `Bytes` The dispatcher already owns the decoded `ContribOp` proto; passing `&[u8]` is zero-copy and avoids forcing every contrib to depend on the `bytes` crate. `prost::Message::decode` accepts `&[u8]` directly. Contribs that want `Bytes` for downstream zero-copy work can -convert with `bytes::Bytes::copy_from_slice(payload)` — a single allocation, at most -once per plan call. 
+convert via `bytes::Bytes::copy_from_slice(payload)` — one allocation, once per plan +call. -### `ContribError::WrongChildCount` convention +#### `ContribError::WrongChildCount` convention `expected` is a free-form human description; conventionally a phrase like `"exactly 1"` -or `"0 or 1"` so the displayed error reads: +or `"0 or 1"`. The dispatcher displays: `wrong child count: expected exactly 1, got 2`. -## Required files (mirror `contrib/example/` exactly) +#### Error message convention +The dispatcher wraps every `ContribError` with `format!("contrib planner {kind:?}: {e}")`, +so contribs should NOT re-prefix their messages with their own `kind`. Write: + +```rust +ContribError::Plan(format!("file not found: {path}")) ``` -contrib// - pom.xml ← Maven module - src/main/scala/org/apache/comet/contrib// - .scala ← CometScanRuleExtension / CometOperatorSerdeExtension impl - src/main/resources/META-INF/services/ - org.apache.comet.spi.CometScanRuleExtension ← one line per extension class - org.apache.comet.spi.CometOperatorSerdeExtension ← (only if you implement serdes) - src/test/scala/org/apache/comet/contrib// - Suite.scala ← integration test - native/ - Cargo.toml ← rlib crate, workspace = "../../../native" - build.rs ← runs prost-build over your proto schema - src/lib.rs ← ContribOperatorPlanner impl + #[ctor] registration - src/proto/.proto ← contrib-private proto schema, your own package - src/generated/ ← (gitignored) prost-build output + +not: + +```rust +ContribError::Plan(format!("myformat-scan: file not found: {path}")) // double prefix ``` -### Proto layer +## Proto layer Each contrib carries its own `.proto` schema defining the message its `ContribOp.payload` -carries. The Scala side serializes that message and sets it on the operator proto's -`contrib_op` envelope; the Rust side `prost::Message::decode`s the same bytes back. -`contrib/example/`'s `ExampleConstantScan { row_count }` is the trivial reference. +carries. 
Both halves of the contrib generate code from the same `.proto` source: + +- **Rust**, in the contrib's `build.rs` via `prost-build`. +- **Java**, in the contrib's `pom.xml` via `protoc-jar-maven-plugin`. Use your own proto **package name** (e.g., `comet.contrib.`) so symbols never -collide with core or with other contribs. Add `contrib//native/src/generated/` to -the repository `.gitignore` (the build script writes generated `.rs` there each compile). +collide with core or with other contribs. Add `contrib//native/src/generated/` +to `.gitignore`. -Plus three edits to existing files: +### Proto, native side -- **Root `pom.xml`** — add `contrib/` so `mvn install` builds the - contrib. -- **`native/Cargo.toml`** — add `../contrib//native` to the workspace `members` - list (NOT `default-members` — contribs are consumed via core's feature flags). -- **`native/core/Cargo.toml`** — add a `contrib-` feature gate and a matching - optional `dep:` entry. Add the feature to `default = [...]` if you want it on by - default in release builds. +`contrib/example/native/build.rs` is the template: + +```rust +fn main() -> std::io::Result<()> { + let out = std::path::PathBuf::from("src/generated"); + std::fs::create_dir_all(&out)?; + prost_build::Config::new() + .out_dir(&out) + .compile_protos(&["src/proto/example_op.proto"], &["src/proto"])?; + Ok(()) +} +``` + +Note: writing into `src/generated/` rather than `$OUT_DIR` is a deliberate deviation +from idiomatic prost. It lets `lib.rs` do +`include!(concat!("generated/", "comet.contrib.example.rs"))` with a stable filesystem +path — convenient for editor tooling. The file is gitignored. + +The contrib's `Cargo.toml` adds `prost-build` to `[build-dependencies]` and `prost` +to `[dependencies]`. + +### Proto, JVM side + +Comet's main build shades `com.google.protobuf` under `${comet.shade.packageName}.protobuf` +(see the root `pom.xml`'s `` property). The generated +`OperatorOuterClass.ContribOp` references the shaded package. 
Your contrib's +generated Java proto MUST therefore live under the same shade prefix at runtime, or +the dispatcher will refuse `setContribOp(...)` because `ByteString` / `Message` types +won't align. + +The simplest path is to add `protoc-jar-maven-plugin` to your contrib `pom.xml`, +generate Java classes during `generate-sources`, and rely on the parent pom's shading +plugin to relocate `com.google.protobuf` consistently: + +```xml + + + + com.github.os72 + protoc-jar-maven-plugin + ${protoc-jar-maven-plugin.version} + + + generate-sources + run + + com.google.protobuf:protoc:${protobuf.version} + + native/src/proto + + + + + + + +``` + +And depend on `protobuf-java` so the generated classes compile: + +```xml + + com.google.protobuf + protobuf-java + ${protobuf.version} + provided + +``` + +`provided` scope, not `compile` — the user's classpath already has the shaded +protobuf-java via `comet-spark`. + +`contrib/example/` does not exercise this path because its Scala side never builds a +`ContribOp` (the example's tests only validate dispatch wiring, not payload generation). +The first real-format contrib in the tree will be the place this section's snippets +are first exercised against CI. + +### Building a `ContribOp` envelope + +From your `CometOperatorSerde.convert`: + +```scala +import org.apache.comet.serde.OperatorOuterClass.{ContribOp, Operator} +import comet.contrib.myformat.{MyOpProto} // your generated Java proto + +val payload: MyOpProto = MyOpProto.newBuilder() + .setSomeField(scanState.someField) + .build() + +val envelope = ContribOp.newBuilder() + .setKind("myformat-scan") + .setPayload(payload.toByteString) + .build() + +Some(builder.setContribOp(envelope).build()) +``` + +The Rust generated field on the `Operator` enum is called `op_struct` (a `oneof`); the +Java builder method is `Operator.Builder.setContribOp(ContribOp)`. 
Both correspond to +the same wire-format field — the naming difference is purely the language conventions +of the code generators. ## Wire-format flow -1. The contrib's Scala code intercepts a `FileSourceScanExec` (or `BatchScanExec`) - matching its file format. -2. It builds a contrib-private proto message (the payload format is the contrib's - choice). -3. It wraps the payload bytes in `ContribOp(kind = "-", payload = - )` and sets that on the operator proto's `op_struct` field. +1. Your Scala code intercepts a `FileSourceScanExec` (or `BatchScanExec`) matching your + format, returning a `CometMyFormatScanExec` from `transformV1`/`transformV2`. +2. `CometExecRule` later picks up the `CometMyFormatScanExec` instance, finds your serde + via the class-keyed dispatch, and calls `serde.convert(op, builder, childOp...)`. +3. Your `convert` builds a contrib-private proto message (whatever fields you need), + serializes it, wraps in `ContribOp { kind, payload }`, and stuffs it into the + operator builder via `setContribOp`. 4. The proto is shipped through JNI to native. -5. Core's native planner sees `OpStruct::ContribOp`, looks up the planner by `kind`, - calls `planner.plan(payload, children)`. -6. The contrib's native crate decodes `payload` into its own proto type and returns an - `Arc<dyn ExecutionPlan>`. +5. Core's native planner sees `OpStruct::ContribOp`, validates `kind` (non-empty, + under 16 MiB payload, registered), looks up the planner, calls + `planner.plan(ctx, payload, children)`. +6. Your native crate decodes `payload` into your own proto type and returns an + `Arc<dyn ExecutionPlan>`. Use `ctx` to reach core's parquet builder, expression + planner, etc. (see the next section). 7. Core wraps the result in a `SparkPlan` and continues planning. +## Walking a real `plan()` body + +The example contrib's planners return `EmptyExec` — none of the `ContribPlannerContext` +methods are exercised. 
A file-scan contrib's `plan()` typically threads through all of +them: + +```rust +use std::sync::Arc; +use comet_contrib_spi::{ + ContribError, ContribOperatorPlanner, ContribPlannerContext, ParquetDatasourceParams, +}; +use datafusion::physical_plan::ExecutionPlan; +use prost::Message; + +use crate::proto::MyFormatScan; + +pub struct MyFormatScanPlanner; + +impl ContribOperatorPlanner for MyFormatScanPlanner { + fn plan( + &self, + ctx: &dyn ContribPlannerContext, + payload: &[u8], + _children: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>, ContribError> { + // 1. Decode your contrib-private payload. + let scan = MyFormatScan::decode(payload) + .map_err(|e| ContribError::BadPayload(format!("decode MyFormatScan: {e}")))?; + + // 2. Translate the Spark proto schemas into Arrow schemas via core. + let required_schema = ctx.convert_spark_schema(&scan.required_schema); + let data_schema = ctx.convert_spark_schema(&scan.data_schema); + let partition_schema = ctx.convert_spark_schema(&scan.partition_schema); + + // 3. Lift Catalyst data-filter Exprs into PhysicalExprs core can execute. + let data_filters = scan + .data_filters + .iter() + .map(|e| ctx.build_physical_expr(e, required_schema.clone())) + .collect::<Result<Vec<_>, _>>()?; + + // 4. Register the object store. The returned URL is what every PartitionedFile + // in your file_groups must use; the returned Path is the canonical key + // inside that store, usually the per-file path the contrib uses to set + // `partitioned_file.object_meta.location`. + let any_file_url = scan.tasks + .first() + .map(|t| t.file_path.clone()) + .ok_or_else(|| ContribError::Plan("empty file list".into()))?; + let object_store_options = scan.object_store_options.clone(); + let (object_store_url, _path_template) = + ctx.prepare_object_store(any_file_url, &object_store_options)?; + + // 5. Build the file_groups: Vec<Vec<PartitionedFile>> with one inner Vec per + // desired DataFusion partition. + let file_groups = build_partitioned_files(&scan.tasks /* contrib's helper */)?; + + // 6. 
Hand the bundle to core's tuned ParquetSource. + let exec = ctx.build_parquet_datasource_exec( + ParquetDatasourceParams::new( + required_schema.clone(), + object_store_url, + file_groups, + ) + .with_data_schema(data_schema) + .with_partition_schema(partition_schema) + .with_data_filters(data_filters) + .with_session_timezone(&scan.session_timezone) + .with_case_sensitive(scan.case_sensitive), + )?; + + // 7. Optionally wrap the parquet exec in contrib-specific operators + // (e.g. a Delta DV filter). + Ok(exec) + } +} +``` + +The flow above mirrors what a real Delta or Iceberg port does. Pieces a contrib +typically owns inside itself, NOT exposed through `ContribPlannerContext`: + +- Reading the format's transaction log / manifest (kernel-rs for Delta, iceberg-rust + for Iceberg). +- Resolving file paths to absolute URLs on the driver. +- Computing per-file deletion-vector / equality-delete row indexes. +- Wrapping the parquet exec in a per-row-filter operator if the format needs it. + +Use `ctx` for things that already exist inside core (object-store registry, parquet +plumbing, expression planner); reimplement the format-specific parts in your contrib. + ## `#[ctor]` registration: panic safety + logging The contrib's native crate registers its planners during library init via -`#[ctor::ctor]`. Two important quirks to get right: +`#[ctor::ctor]`. Three quirks to get right: **Panics in `#[ctor]` abort the JVM process** before `JNI_OnLoad` runs, with no -diagnostic on macOS/Linux. Wrap every ctor body in `std::panic::catch_unwind` and emit -a stderr message on failure: +diagnostic on macOS/Linux. Wrap every ctor body in `std::panic::catch_unwind` and +emit a stderr message on failure: ```rust #[ctor::ctor] @@ -231,85 +578,175 @@ fn register() { **`log::*!` macros inside `#[ctor]` are no-ops.** Comet's logger is initialised later, in `Java_org_apache_comet_NativeBase_init`. Any diagnostic you need from the ctor body -must go through `eprintln!`. 
The example contrib follows both patterns. +must go through `eprintln!`. **Cross-platform caveats.** `#[ctor::ctor]` works on Linux / macOS / Windows MSVC, but the order of ctor execution across rlibs is link-order dependent and not guaranteed across compiler versions. Your contrib's ctor **MUST NOT** depend on another contrib already being registered. -## Cargo feature gate +The corresponding JVM rule: **do not call `CometExtensionRegistry.load()` from a +class's static initializer** (Scala `object` init, or a JVM-level static block). Scala +monitors are reentrant so it won't deadlock, but re-entry would observe the partially- +built state and shadow the in-flight publication. + +## Logging conventions + +- **From the contrib's Scala code**: use `org.slf4j.Logger` / Comet's `Logging` trait. + Lifetime-event logs (extension discovered, contrib registered) at INFO; per-plan + decisions at DEBUG; correctness violations at WARN. +- **From the contrib's Rust `#[ctor]`**: `eprintln!` only (logger not yet initialised). +- **From the contrib's Rust `plan()` body and runtime code**: `log::*` macros. Choose a + `target:` matching your crate name so users can filter: + `log::debug!(target: "comet::contrib::myname", "built plan with {n} files")`. +- **Error context**: pre-format error messages with enough context that the dispatcher's + `contrib planner "myname-scan": ` wrapper reads sensibly. Do not + re-prefix with your `kind`. + +## Diagnosing a misconfigured contrib + +The most common first-hour problem is "I packaged my JAR and it does nothing." Three +signals to check: + +- `CometExtensionRegistry` logs at INFO. 
When discovery runs and finds zero entries, + it emits: + ``` + Comet contrib extensions: none discovered on classpath + (no META-INF/services entries for CometScanRuleExtension or + CometOperatorSerdeExtension) + ``` + Confirm your JAR ships the `META-INF/services/...CometScanRuleExtension` file with + the correct fully-qualified extension class on its own line. +- ServiceLoader instantiation failures are logged at WARN with `Failed to load a + CometScanRuleExtension entry; skipping`. Causes: missing no-arg constructor on the + extension class, exception thrown by the constructor. +- `registered_contrib_kinds()` (Rust) returns the kinds currently registered. If your + contrib's kind is missing under a build that should include it, the Cargo feature is + off or the `extern crate` in `native/core/src/lib.rs` is missing. + +Set the logger for `org.apache.comet.spi.CometExtensionRegistry` to INFO/WARN to surface +both messages. + +### Classloader interaction + +`CometExtensionRegistry.load()` uses `Thread.currentThread().getContextClassLoader()` +first, with `getClass.getClassLoader` as fallback. Either should see Comet and the +contrib JAR in typical Spark deploy modes (`--jars`, `--packages`, application +classpath). Discovery is **lazy** — triggered the first time `CometScanRule._apply` or +`CometExecRule._apply` runs against a Comet-enabled session. By that point all +`--jars`-injected JARs are on the classpath, so order-of-arrival inside the driver +JVM is not a concern. + +## Maven JAR packaging + version pinning + +The example contrib ships a thin JAR with no shading. Real contribs SHOULD prefer thin +JARs too. If your contrib must include a third-party library that conflicts with the +user's classpath, shade the conflicting classes under your contrib's package prefix +(`org.apache.comet.contrib..shaded.*`) so classloader collisions stay local. 
+Do **not** shade `comet-spark` or its transitive dependencies — those are `provided` +scope and the user supplies them. + +`comet-spark`'s shading of `com.google.protobuf` is the one external dep that does +need attention: generated Java classes from your `.proto` reference the shaded +package, which is handled automatically when you use the parent pom's plugin +configuration (the contrib pom inherits the same `` property). + +### Version pinning + +`comet-spark` is `provided` in your contrib's pom. Pin the dependency to +the exact Comet patch version your contrib was tested against: + +```xml + + org.apache.datafusion + comet-spark-spark${spark.version.short}_${scala.binary.version} + 0.17.0 + provided + +``` -Each contrib's native rlib is wired into core via a feature flag. Build core with: +In-tree contribs use `${project.version}`; out-of-tree contribs use the explicit Comet +version they were built against. A contrib built against Comet `0.17.x` is not +guaranteed runtime-compatible with Comet `0.18.x` — the SPI is alpha. -```bash -# Default release build: zero contrib surface. registered_contrib_kinds() is empty. -cargo build +### Multi-Spark-version support -# Enable a specific contrib explicitly: -cargo build --features contrib-example -# or -cargo build --features contrib-example,contrib-delta +Comet itself ships a per-Spark-minor-version artifact via the +`spark.version.short` Maven profile (`3.4`, `3.5`, `4.0`). Your contrib follows the +same model: -# Verify the slim build path: -cargo build --no-default-features -``` +- Pick the matching Spark profile when building (`-Dspark.version.short=3.5`). +- The resulting artifact ID encodes the Spark version + (`comet-contrib--spark3.5_2.13`). +- If your contrib must support multiple Spark minor versions, publish one artifact per + profile, mirroring Comet. 
Shim code that differs across Spark versions belongs under + `src/main/scala-${shims.majorVerSrc}/` (see Comet's `common/`/`spark/` modules for + the existing pattern). -`registered_contrib_kinds()` in a default release build is empty — production -deployments only see the contribs they explicitly opted into. CI matrix should include -a `--no-default-features` row to catch any accidental contrib leakage into core. +## Testing -The JVM side is **always** conditional: the contrib JAR is its own artifact, and Spark -only picks it up when it's on the classpath. Even with the Cargo feature on, a user -who doesn't add the contrib JAR sees no behaviour change — the contrib's native planner -sits dormant in the registry, waiting for a JVM serde that never calls it. +`contrib/example/` demonstrates the JVM-side test pattern: -## Maven JAR packaging +- A unit test that calls `CometExtensionRegistry.resetForTesting()` and `load()`, + then asserts the contrib's extension is discovered via ServiceLoader. Catches + packaging mistakes (missing `META-INF/services`, wrong class name). +- Per-method unit tests for the extension's `matches*` / `transform*` logic. -The example contrib ships a thin JAR (no shading). Real contribs SHOULD prefer thin -JARs too. If your contrib must include a third-party library that conflicts with core's -classpath (e.g., a different protobuf-java version), shade the conflicting classes -under your contrib's package prefix (`org.apache.comet.contrib..shaded.*`) so -classloader collisions stay local. Do not shade `comet-spark` or its transitive -dependencies — those are `provided` scope and the user supplies them. 
+For native unit tests of a `ContribOperatorPlanner`, use `ScopedContribPlannerRegistration` +from `comet-contrib-spi` to install and tear down planners without polluting the +global registry: -## Registry implementation note +```rust +use comet_contrib_spi::ScopedContribPlannerRegistration; + +#[test] +fn my_planner_round_trip() { + let _guard = ScopedContribPlannerRegistration::new( + "myformat-scan", + Arc::new(MyFormatScanPlanner), + ); + // ... exercise dispatch ... +} +``` -The native contrib planner registry is currently a `RwLock>>`. -Lookups happen once per `ContribOp` plan call; writes happen only during library init. -The implementation may switch to a lock-free primitive (`ArcSwap`) in a future release -if profiling shows the read path matters; the public API stays unchanged either way. +Pair with `#[serial_test::serial]` if your test asserts on `registered_contrib_kinds()` +(which other tests' guards may be temporarily mutating in parallel). -## Payload size cap +### End-to-end (Rust + Scala round-trip) -The native dispatcher enforces a hard ceiling of **16 MiB** on `ContribOp.payload`. A -malformed JVM-side serde (or one that accidentally accumulates state across plan calls) -producing a larger payload is rejected with a clear error message before the contrib's -`plan()` runs. The cap is intentionally above any plausible file-scan payload (Delta -with ~100k tasks weighs in around 3–4 MiB) and well below "heap pressure" territory; -the value is hardcoded in `native/core/src/execution/planner.rs`. If your contrib has -a legitimate need for a larger payload, file an issue with the size you need and the -use case -- the cap is a guardrail, not a feature. +A full integration test wires the Spark plan through real JNI and asserts the contrib's +native planner ran: -## Testing +1. 
Build a `SparkSession` configured with `spark.sql.extensions = + org.apache.comet.CometSparkSessionExtensions` and the contrib JAR on the classpath + (sbt: `Test/unmanagedClasspath`; Maven: the contrib's own test scope already has it). +2. Submit a query that hits your format's table reader. +3. Inspect the produced physical plan for your contrib's exec class + (`plan.exists(_.isInstanceOf[CometMyFormatScanExec])`). +4. Run the plan and assert against the result (e.g., a row count that only your native + planner could produce, distinguishable from a Spark fall-back). -`contrib/example/`'s test suite demonstrates the recommended pattern: +The example contrib's test fixture doubles as smoke coverage for the SPI dispatch path +itself (kind lookup, payload decode, error wrapping) under Comet's own CI when the +`contrib-example` feature is enabled. -- A unit test that calls `CometExtensionRegistry.load()` and asserts the contrib's - extension is discovered. This catches packaging mistakes (missing `META-INF/services`, - wrong class name, etc.). -- Per-method unit tests for the extension's `matches*` and `transform*` logic. +## Payload size cap -For a contrib with a real native operator, additionally write an integration test that: +The native dispatcher enforces a hard ceiling of **16 MiB** on `ContribOp.payload` +(`MAX_CONTRIB_PAYLOAD_BYTES` in `native/core/src/execution/planner.rs`). A malformed +JVM-side serde (or one that accidentally accumulates state across plan calls) +producing a larger payload is rejected with a clear error message before the contrib's +`plan()` runs. The cap is comfortably above any plausible file-scan payload (Delta +with ~100k tasks weighs in around 3–4 MiB) and well below "heap pressure" territory. +If your contrib has a legitimate need for a higher ceiling, file an issue with the +size you need and the use case — the cap is a guardrail, not a feature. -- Builds a `ContribOp` payload Scala-side. 
-- Submits the plan through a real `SparkSession` configured with the contrib JAR on the - classpath. -- Asserts the contrib's native planner was reached (typically by checking against a - result the no-op planner would not produce). +## Registry implementation note -Core's own regression suite for the SPI dispatch path uses the example contrib as its -test fixture, so PR1's CI doubles as smoke coverage for any future contribs. +The native contrib planner registry is currently a `RwLock<HashMap<String, Arc<dyn ContribOperatorPlanner>>>`. +Lookups happen once per `ContribOp` plan call; writes happen only during library init. +The implementation may switch to a lock-free primitive (`ArcSwap`) in a future release +if profiling shows the read path matters; the public API stays unchanged either way. ## See also @@ -317,3 +754,5 @@ test fixture, so PR1's CI doubles as smoke coverage for any future contribs. the worked reference. - [`native/contrib-spi/`](https://github.com/apache/datafusion-comet/tree/main/native/contrib-spi) — the leaf SPI crate. +- [`spark/src/main/scala/org/apache/comet/spi/`](https://github.com/apache/datafusion-comet/tree/main/spark/src/main/scala/org/apache/comet/spi) — + the JVM SPI traits. From 2c46552c558fad57384f144d8f60310443d73adb Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 14:35:28 -0400 Subject: [PATCH 16/27] docs(contrib): second-pass review fixes for contributor guide Addresses the validation findings against commit 91c40e0a: Blockers - I4: JVM proto-shading recipe rewritten. The original claim that the contrib pom would inherit shading from the parent was wrong -- shading is configured in spark/pom.xml as a per-module execution, not via pluginManagement. A contrib generating its own Java proto without its own shade-plugin execution would NoSuchMethodError on setPayload() at runtime because ContribOp.Builder expects ${comet.shade.packageName}.protobuf.ByteString. 
New section gives the full pom snippet: protoc-jar-maven-plugin + maven-shade-plugin execution that relocates com.google.protobuf to the parent's shade prefix. - R3: with_session_timezone(&scan.session_timezone) didn't compile (&String doesn't impl Into). Fixed to use scan.session_timezone.as_str() with a brief inline comment explaining why. - R2/M1: build_partitioned_files was hand-waved with no shape. Added a full sketched implementation that builds PartitionedFile per task, resolves URLs to object_store::path::Path, sets object_meta.location, and notes the common real-world variations (file-range splitting, partition_values, format-specific filter wrappers). Other fixes - I6: gitignore guidance corrected -- the entry lives in the repo-root .gitignore, not in contrib/example/. Verified the actual entry exists. - I2: build.rs snippet now mirrors contrib/example/native/build.rs exactly (including the cargo:rerun-if-changed=src/proto/ line that prost-build needs to rebuild on schema changes). - I5: op_struct vs contrib_op naming clarified -- op_struct is the oneof name (Rust pattern-match handle), contrib_op is the field name on that oneof (Java setter name). They are not "the same field" with different names; they're a oneof and one of its members. - M3: ServiceLoader-diagnostics section now also covers detectDuplicateSerdeClasses (cross-contrib serde key collision) and register_contrib_planner's last-write-wins WARN on duplicate kinds. - cfg(not(any(...))) placeholder example replaced with the literal current form, plus the explicit "add feature = "contrib-" here" instruction. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../contributor-guide/contrib-extensions.md | 215 +++++++++++++----- 1 file changed, 160 insertions(+), 55 deletions(-) diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index c5183f8730..fe6a6fa755 100644 --- a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -83,11 +83,14 @@ You need: ### `.gitignore` -The generated proto outputs are checked in nowhere: +The generated proto outputs are checked in nowhere. Add a line to the repo-root +`.gitignore` mirroring the existing `contrib/example/native/src/generated` entry: -- `contrib//native/src/generated/` — Rust prost output. The example contrib's - `.gitignore` entry is the template. -- `contrib//target/` — Maven build output (inherits from the repo-root `.gitignore`). +``` +contrib//native/src/generated +``` + +`contrib//target/` is already gitignored by the repo-root pattern. ### Workspace placement constraint @@ -140,11 +143,12 @@ cargo build --features 'contrib-example contrib-' cargo build --no-default-features ``` -A core test under `#[cfg(not(any(feature = "contrib-example", ...)))]` asserts +A core test under `#[cfg(not(any(feature = "contrib-example")))]` (today's form; +the `any(...)` will list every contrib feature once more are added) asserts `registered_contrib_kinds()` is empty in the slim build. When you add a new -`contrib-` feature, **extend that test's `cfg` predicate** (see +`contrib-` feature, **extend that `cfg` predicate** (see `native/core/src/execution/planner/contrib.rs`'s `production_build_has_no_contrib_planners_registered`) -so the canary still compiles on your contrib's CI row. +to add `feature = "contrib-"` so the canary still compiles on your contrib's CI row. The JVM side is **always** conditional: the contrib JAR is its own Maven artifact, and Spark only loads it when it's on the classpath. 
Even with the Cargo feature on, a user @@ -348,12 +352,19 @@ to `.gitignore`. `contrib/example/native/build.rs` is the template: ```rust -fn main() -> std::io::Result<()> { - let out = std::path::PathBuf::from("src/generated"); - std::fs::create_dir_all(&out)?; +use std::{fs, io::Result, path::Path}; + +fn main() -> Result<()> { + // rerun-if-changed so cargo rebuilds when you edit your .proto during dev. + println!("cargo:rerun-if-changed=src/proto/"); + + let out_dir = "src/generated"; + if !Path::new(out_dir).is_dir() { + fs::create_dir(out_dir)?; + } prost_build::Config::new() - .out_dir(&out) - .compile_protos(&["src/proto/example_op.proto"], &["src/proto"])?; + .out_dir(out_dir) + .compile_protos(&["src/proto/.proto"], &["src/proto"])?; Ok(()) } ``` @@ -366,20 +377,37 @@ path — convenient for editor tooling. The file is gitignored. The contrib's `Cargo.toml` adds `prost-build` to `[build-dependencies]` and `prost` to `[dependencies]`. -### Proto, JVM side +### Proto, JVM side — handling Comet's protobuf shade + +This is the single trickiest piece of the JVM build. **Read carefully.** -Comet's main build shades `com.google.protobuf` under `${comet.shade.packageName}.protobuf` -(see the root `pom.xml`'s `` property). The generated -`OperatorOuterClass.ContribOp` references the shaded package. Your contrib's -generated Java proto MUST therefore live under the same shade prefix at runtime, or -the dispatcher will refuse `setContribOp(...)` because `ByteString` / `Message` types -won't align. +`comet-spark` shades `com.google.protobuf` under `${comet.shade.packageName}.protobuf` +(value: `org.apache.comet.shaded.protobuf`). The shading is applied in `spark/pom.xml`'s +`maven-shade-plugin` execution — it is **NOT inherited** by other modules through +`pluginManagement`. 
So when `OperatorOuterClass.ContribOp.Builder` is compiled into +the published `comet-spark.jar`, its `setPayload(ByteString)` signature references the +shaded type `org.apache.comet.shaded.protobuf.ByteString`. A contrib JAR that ships +unshaded `com.google.protobuf.ByteString` references (the default output of +`protoc-jar-maven-plugin`) will fail at runtime with `NoSuchMethodError` the first time +it calls `setPayload(myMessage.toByteString())`. -The simplest path is to add `protoc-jar-maven-plugin` to your contrib `pom.xml`, -generate Java classes during `generate-sources`, and rely on the parent pom's shading -plugin to relocate `com.google.protobuf` consistently: +The contrib pom must therefore: + +1. Generate Java proto classes via `protoc-jar-maven-plugin`. +2. Run its own `maven-shade-plugin` execution that relocates the same package the + parent declares (`${comet.shade.packageName}.protobuf`), so the contrib's generated + `ByteString` / `Message` references match the shaded comet-spark surface at runtime. ```xml + + + com.google.protobuf + protobuf-java + ${protobuf.version} + + + + @@ -399,28 +427,43 @@ plugin to relocate `com.google.protobuf` consistently: + + org.apache.maven.plugins + maven-shade-plugin + + + package + shade + + false + true + + + com.google.protobuf:protobuf-java + + + + + com.google.protobuf + ${comet.shade.packageName}.protobuf + + + + + + ``` -And depend on `protobuf-java` so the generated classes compile: - -```xml - - com.google.protobuf - protobuf-java - ${protobuf.version} - provided - -``` - -`provided` scope, not `compile` — the user's classpath already has the shaded -protobuf-java via `comet-spark`. +The relocation pattern MUST be `${comet.shade.packageName}.protobuf` (matching the +parent pom's property) — if you hardcode `org.apache.comet.shaded.protobuf` it works +today but breaks the moment Comet's build renames the shade prefix. 
-`contrib/example/` does not exercise this path because its Scala side never builds a -`ContribOp` (the example's tests only validate dispatch wiring, not payload generation). -The first real-format contrib in the tree will be the place this section's snippets -are first exercised against CI. +`contrib/example/` does NOT exercise this path because its Scala side never builds a +`ContribOp` — the example only validates dispatch wiring. The first real-format +contrib in the tree will be where this section's snippets are first exercised +end-to-end against CI. ### Building a `ContribOp` envelope @@ -442,10 +485,16 @@ val envelope = ContribOp.newBuilder() Some(builder.setContribOp(envelope).build()) ``` -The Rust generated field on the `Operator` enum is called `op_struct` (a `oneof`); the -Java builder method is `Operator.Builder.setContribOp(ContribOp)`. Both correspond to -the same wire-format field — the naming difference is purely the language conventions -of the code generators. +A note on the proto naming. `operator.proto` declares +`oneof op_struct { ... ContribOp contrib_op = 117; ... }`. So `op_struct` is the +*oneof name* and `contrib_op` is the *field name* on that oneof. The two code +generators surface this differently: + +- **Rust (prost):** pattern-matches as `match operator.op_struct { Some(OpStruct::ContribOp(c)) => ... }`. +- **Java (protoc):** uses the field-name-derived builder method `Operator.Builder.setContribOp(ContribOp)`. + +Both manipulate the same wire-format slot — the difference is purely how the code +generators expose `oneof` membership. ## Wire-format flow @@ -506,23 +555,28 @@ impl ContribOperatorPlanner for MyFormatScanPlanner { .map(|e| ctx.build_physical_expr(e, required_schema.clone())) .collect::, _>>()?; - // 4. Register the object store. 
The returned URL is what every PartitionedFile - // in your file_groups must use; the returned Path is the canonical key - // inside that store, usually the per-file path the contrib uses to set - // `partitioned_file.object_meta.location`. + // 4. Register the object store for the scheme + host the files live in. The + // returned ObjectStoreUrl is the canonical key every PartitionedFile in your + // file_groups must reference. The returned Path is only relevant if you are + // constructing PartitionedFiles whose location is rooted at the same prefix; + // most file-scan contribs build per-file Paths from the raw URL inside + // `build_partitioned_files` below and can discard this Path entirely. let any_file_url = scan.tasks .first() .map(|t| t.file_path.clone()) .ok_or_else(|| ContribError::Plan("empty file list".into()))?; let object_store_options = scan.object_store_options.clone(); - let (object_store_url, _path_template) = + let (object_store_url, _root_path) = ctx.prepare_object_store(any_file_url, &object_store_options)?; - // 5. Build the file_groups: Vec> with one inner Vec per - // desired DataFusion partition. - let file_groups = build_partitioned_files(&scan.tasks /* contrib's helper */)?; + // 5. Build the file_groups: Vec>, one inner Vec per + // desired DataFusion partition. The contrib owns this -- see the helper + // sketch below. + let file_groups = build_partitioned_files(&scan.tasks)?; - // 6. Hand the bundle to core's tuned ParquetSource. + // 6. Hand the bundle to core's tuned ParquetSource. as_str() because + // with_session_timezone takes `impl Into` and `&String` doesn't + // impl that; `&str` does. 
let exec = ctx.build_parquet_datasource_exec( ParquetDatasourceParams::new( required_schema.clone(), @@ -532,7 +586,7 @@ impl ContribOperatorPlanner for MyFormatScanPlanner { .with_data_schema(data_schema) .with_partition_schema(partition_schema) .with_data_filters(data_filters) - .with_session_timezone(&scan.session_timezone) + .with_session_timezone(scan.session_timezone.as_str()) .with_case_sensitive(scan.case_sensitive), )?; @@ -543,8 +597,51 @@ impl ContribOperatorPlanner for MyFormatScanPlanner { } ``` -The flow above mirrors what a real Delta or Iceberg port does. Pieces a contrib -typically owns inside itself, NOT exposed through `ContribPlannerContext`: +### `build_partitioned_files` — contrib-owned helper sketch + +`Vec<Vec<PartitionedFile>>` is the format `init_datasource_exec` consumes. Each inner +`Vec<PartitionedFile>` becomes one DataFusion partition; each `PartitionedFile` carries an +`ObjectMeta.location` (a path inside the registered object store) plus optional +partition-column values. Minimal one-file-per-partition implementation: + +```rust +use datafusion::datasource::listing::PartitionedFile; +use object_store::path::Path; +use url::Url; + +fn build_partitioned_files( + tasks: &[crate::proto::FileTask], +) -> Result<Vec<Vec<PartitionedFile>>, ContribError> { + let mut groups = Vec::with_capacity(tasks.len()); + for task in tasks { + let url = Url::parse(&task.file_path) + .map_err(|e| ContribError::Plan(format!("invalid file URL: {e}")))?; + // Path within the object store -- starts at the bucket root for s3://, + // at the filesystem root for file://, etc. 
+ let path = Path::from_url_path(url.path()) + .map_err(|e| ContribError::Plan(format!("path from URL: {e}")))?; + let mut pf = PartitionedFile::new(String::new(), task.file_size); + pf.object_meta.location = path; + // pf.partition_values = vec![/* ScalarValues per partition column */]; + groups.push(vec![pf]); + } + Ok(groups) +} +``` + +Real-world contribs typically: + +- Combine many small non-partitioned files into a single inner `Vec` (fewer + DataFusion partitions) and split very large files across multiple partitions with + `PartitionedFile::new_with_range`. +- Populate `partition_values` from the format's metadata so partition pruning works. +- Apply format-specific filters (e.g., Delta's pre-materialized deleted-row indexes, + Iceberg's equality deletes) as wrappers around the parquet exec, NOT as + PartitionedFile mutations. + +### Pieces a contrib owns inside itself + +Not exposed through `ContribPlannerContext`: - Reading the format's transaction log / manifest (kernel-rs for Delta, iceberg-rust for Iceberg). @@ -620,6 +717,14 @@ signals to check: - ServiceLoader instantiation failures are logged at WARN with `Failed to load a CometScanRuleExtension entry; skipping`. Causes: missing no-arg constructor on the extension class, exception thrown by the constructor. +- Duplicate-class collisions across contribs are logged at WARN with + `Multiple Comet contrib extensions claim the same exec class ...`. The merged + `CometExecRule` dispatch is last-write-wins on collision; if your contrib's serde + silently stops working when another contrib JAR is present, this is the line to + look for. +- `register_contrib_planner` is last-write-wins on duplicate `kind`. Registration + logs a WARN: `replacing existing planner for kind=...`. Two contribs that both + register `kind="delta-scan"` (the second clobbers the first) will surface here. - `registered_contrib_kinds()` (Rust) returns the kinds currently registered. 
If your contrib's kind is missing under a build that should include it, the Cargo feature is off or the `extern crate` in `native/core/src/lib.rs` is missing. From cf5253ed12cdf6c42f7f95d8deb3422a3063329d Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 22:04:23 -0400 Subject: [PATCH 17/27] refactor(contrib): bundle JVM half into comet-spark, matching native model The original SPI design had asymmetric distribution: the native rlib was compiled INTO libcomet via a Cargo feature, but the JVM half shipped as a separate Maven artifact discovered at runtime via ServiceLoader. This made the protobuf-shading recipe in the contributor guide load-bearing (~70 lines of XML to relocate `com.google.protobuf` per contrib) and it gave users a distribution model that didn't actually work -- the native side required a Comet rebuild regardless of how the JVM half shipped. The fix mirrors the native side: contribs are now source directories under contrib//, NOT Maven modules. Activating `-Pcontrib-` on spark/pom.xml folds the contrib's Scala + resources + proto into comet-spark.jar's normal compile + shade execution. `mvn install` produces a vanilla comet-spark.jar with zero contribs; `mvn install -Pcontrib-example` produces one with the example contrib's classes inside. Same shape as `cargo build --features contrib-example`. Files touched - spark/pom.xml: new contrib-example profile using build-helper-maven-plugin (source roots), maven-resources-plugin (META-INF/services), and an extra protoc-jar-maven-plugin execution (Java proto generation). The default shade execution gains a ServicesResourceTransformer so contrib service files merge cleanly. - pom.xml: contrib/example removed; contribs aren't modules. - contrib/example/pom.xml: deleted. The example is now Scala + resources + Cargo crate, no Maven pom. - spark/.../spi/CometExtensionRegistry.scala: docstring rewritten to describe the bundled model; no logic change. 
- docs/source/contributor-guide/contrib-extensions.md: rewritten "Architecture at a glance", "Required files", "Wiring into core", "Build matrix", "Proto, JVM side", and "Maven packaging" sections to reflect the new model. The protobuf-shading recipe is gone -- shading is handled by comet-spark's existing shade execution automatically. What this fixes - No Maven cycle (the previous separate-module design hit one and required a dedicated SPI module to break it; the source-injection model avoids the cycle entirely). - One artifact installed: `comet-spark-with-.jar` rather than a JAR + per-contrib JARs. - ~70 lines of protobuf-shading boilerplate removed from the contributor guide. The new "Proto, JVM side" section is ~15 lines. - Distribution model is honest: contribs are build-time options on Comet, JVM and native both. Verified: spark/pom.xml parses; `-Pcontrib-example` profile activates cleanly with no Maven reactor errors. Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/pom.xml | 126 -------- .../contributor-guide/contrib-extensions.md | 270 ++++++++++-------- pom.xml | 12 +- spark/pom.xml | 113 ++++++++ .../comet/spi/CometExtensionRegistry.scala | 38 +-- 5 files changed, 291 insertions(+), 268 deletions(-) delete mode 100644 contrib/example/pom.xml diff --git a/contrib/example/pom.xml b/contrib/example/pom.xml deleted file mode 100644 index 99b8f3f12a..0000000000 --- a/contrib/example/pom.xml +++ /dev/null @@ -1,126 +0,0 @@ - - - - - - - 4.0.0 - - org.apache.datafusion - comet-parent-spark${spark.version.short}_${scala.binary.version} - 0.17.0-SNAPSHOT - ../../pom.xml - - - - comet-contrib-example-spark${spark.version.short}_${scala.binary.version} - comet-contrib-example - - - - false - - - - - - org.apache.maven.plugins - maven-enforcer-plugin - - - - no-duplicate-declared-dependencies - - enforce - - - true - - - - - - - - - - - org.apache.datafusion - comet-spark-spark${spark.version.short}_${scala.binary.version} - ${project.version} - 
provided - - - - - - org.apache.spark - spark-sql_${scala.binary.version} - test - - - org.scala-lang - scala-library - test - - - - org.scalatest - scalatest_${scala.binary.version} - test - - - org.scalatestplus - junit-4-13_${scala.binary.version} - test - - - diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index fe6a6fa755..032a17c4a3 100644 --- a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -30,18 +30,25 @@ and walks through every integration point that the example does not exercise. ## Architecture at a glance -Each contrib has two halves that ship as separate artifacts but are wired together at -build time: - -- **JVM half** — a separate Maven JAR - (`comet-contrib--spark${spark.version.short}_${scala.binary.version}`) containing - Scala/Java extension classes plus contrib-private generated proto classes. Discovered - at runtime via `java.util.ServiceLoader` from the contrib JAR's `META-INF/services/` - entries. -- **Native half** — a Rust `rlib` crate (NOT `cdylib`) that is **linked INTO core's - `libcomet`** at build time when the matching Cargo feature on core is enabled. There - is exactly one Comet native library at runtime; the contrib's `#[ctor]` registers its - operator planners during library load. +A contrib has two halves, both **bundled into Comet's published artifacts at build +time** when their matching flags are enabled. Nothing about a contrib is independently +distributable — the contrib lives inside Comet's release. + +- **JVM half** — Scala/Java classes plus generated Java proto. Kept as a plain source + directory under `contrib/<name>/` (not a Maven module) and **compiled and shaded into + `comet-spark.jar`** via the `-Pcontrib-<name>` Maven profile on `spark/pom.xml`. With no + profile active, the contrib's classes are not in the published JAR. 
The contrib's `META-INF/services/` + entries are bundled along with the classes; ServiceLoader at runtime then discovers + them from inside `comet-spark.jar` itself. +- **Native half** — a Rust `rlib` crate (NOT `cdylib`) **linked into `libcomet`** via + the matching `--features contrib-` Cargo flag on the core crate. The contrib's + `#[ctor]` registers its operator planners during library load. + +The two halves are symmetric: contribs are build-time options on Comet, JVM and +native both. `mvn install -Pcontrib-example && cargo build --features contrib-example` +produces a Comet build that includes the example contrib in both `comet-spark.jar` and +`libcomet`; a vanilla build of either side produces an artifact with zero contrib +surface. The wire format between JVM and native uses a single generic envelope on the operator proto, `ContribOp { kind, payload }`. Core's planner dispatches by `kind`; the contrib's @@ -50,25 +57,29 @@ writes into the proto. ## Required files (mirror `contrib/example/` exactly) +A contrib is a directory of sources, **not a Maven module**. No `pom.xml`. The contrib's +Scala/Java sources are pulled into `comet-spark`'s compile by a profile on +`spark/pom.xml`; the contrib's Rust sources are pulled into `libcomet` by a Cargo +feature on `native/core`. 
The directory layout: + ``` contrib// - pom.xml ← Maven module src/main/scala/org/apache/comet/contrib// .scala ← CometScanRuleExtension / CometOperatorSerdeExtension impl src/main/resources/META-INF/services/ org.apache.comet.spi.CometScanRuleExtension ← one line per extension class org.apache.comet.spi.CometOperatorSerdeExtension ← (only if you implement serdes) src/test/scala/org/apache/comet/contrib// - Suite.scala ← integration test + Suite.scala ← integration test (runs as part of comet-spark's tests when profile active) native/ Cargo.toml ← rlib crate, workspace = "../../../native" build.rs ← runs prost-build over your proto schema src/lib.rs ← ContribOperatorPlanner impl + #[ctor] registration - src/proto/.proto ← contrib-private proto schema, your own package + src/proto/.proto ← contrib-private proto schema (also used by JVM-side protoc generation) src/generated/ ← (gitignored) prost-build output ``` -Plus three edits to existing files (collected under "Wiring into core", below). +Plus a handful of build-config edits (collected under "Wiring into core", below). ### Prerequisites @@ -100,10 +111,86 @@ breaks the workspace lookup; place the contrib at the documented depth. ## Wiring into core -Three single-line edits to existing files: +Four edits: one on the JVM side, three on the native side: + +### JVM side + +1. **`spark/pom.xml`** — add a `contrib-<name>` profile under `<profiles>`. The + `contrib-example` profile is the copy-this template. 
The profile uses + `build-helper-maven-plugin` to add the contrib's source/test directories, + `maven-resources-plugin` to merge in `META-INF/services` entries, and + `protoc-jar-maven-plugin` to generate the contrib's Java protos: + + ```xml + + contrib- + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-contrib--source + generate-sources + add-source + + ../contrib//src/main/scala + + + + add-contrib--test-source + generate-test-sources + add-test-source + + ../contrib//src/test/scala + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-contrib--resources + process-resources + copy-resources + + ${project.build.outputDirectory} + + ../contrib//src/main/resources + + + + + + + com.github.os72 + protoc-jar-maven-plugin + + + generate-contrib--proto + generate-sources + run + + com.google.protobuf:protoc:${protobuf.version} + + ../contrib//native/src/proto + + + + + + + + + ``` + + No additions to the parent `pom.xml`'s `` — contribs are not Maven modules. + +### Native side -1. **Root `pom.xml`** — add `contrib/` under the existing - `` block so `mvn install` builds the contrib JAR. 2. **`native/Cargo.toml`** — add `../contrib//native` to the workspace `members` list (NOT `default-members` — contribs are consumed via core's feature flags). 3. **`native/core/Cargo.toml`** — add a `contrib-` feature gate and a matching @@ -118,8 +205,7 @@ Three single-line edits to existing files: ``` Do **not** add the feature to `default = [...]`. Production builds carry zero contrib - surface by design; users opt in explicitly. (CI matrix builds should add the feature.) - + surface by design; users opt in explicitly. 4. **`native/core/src/lib.rs`** — add the matching feature-gated `extern crate` so the contrib's `#[ctor]` is linked in when the feature is on: @@ -128,21 +214,33 @@ Three single-line edits to existing files: extern crate comet_contrib_; ``` -## Cargo feature gate +## Build matrix ```bash -# Default release build: zero contrib surface. 
registered_contrib_kinds() is empty. +# Vanilla Comet build: zero contribs on either side. +mvn install cargo build -# Enable a specific contrib explicitly: +# Build with the example contrib bundled into both halves. +mvn install -Pcontrib-example cargo build --features contrib-example -# Multiple at once: -cargo build --features 'contrib-example contrib-' -# Verify the slim build path: +# Multiple contribs at once. +mvn install -Pcontrib-example,contrib-delta +cargo build --features 'contrib-example contrib-delta' + +# Verify the slim native build path. cargo build --no-default-features ``` +The JVM and native flags MUST agree for a contrib to work. Activating only the Maven +profile gives you a `comet-spark.jar` whose serde produces `ContribOp` envelopes the +native side can't dispatch (you'll get +`No contrib planner registered for ContribOp.kind=...`). Activating only the Cargo +feature gives you a `libcomet` ready to dispatch a contrib whose serde isn't on the +classpath, so the registered planner sits dormant. The contributor guide and release +notes call out both flags together. + A core test under `#[cfg(not(any(feature = "contrib-example")))]` (today's form; the `any(...)` will list every contrib feature once more are added) asserts `registered_contrib_kinds()` is empty in the slim build. When you add a new @@ -150,20 +248,16 @@ the `any(...)` will list every contrib feature once more are added) asserts `native/core/src/execution/planner/contrib.rs`'s `production_build_has_no_contrib_planners_registered`) to add `feature = "contrib-"` so the canary still compiles on your contrib's CI row. -The JVM side is **always** conditional: the contrib JAR is its own Maven artifact, and -Spark only loads it when it's on the classpath. Even with the Cargo feature on, a user -who doesn't add the contrib JAR sees no behaviour change — the contrib's native planner -sits dormant in the registry, waiting for a JVM serde that never calls it. 
- ## SPI stability The contrib SPI is currently **alpha** — minor Comet versions may carry breaking -changes during the early-adopter period. Concretely: +changes during the early-adopter period. Because contribs ship in-tree (as part of +Comet's release), every Comet build is internally consistent — a `0.18.x` +`comet-spark.jar` is bundled with `0.18.x` contribs. Version-skew concerns +("contrib JAR built against 0.17, Comet runtime 0.18") don't apply. + +What stability guarantees the SPI does aim for: -- `comet-contrib-spi` is workspace-versioned alongside core. A contrib built against - Comet `0.17.x` is **not** guaranteed to work with Comet `0.18.x` at runtime; the SPI - traits may evolve. Pin your contrib's `` and `comet-spark` dependency to a - specific Comet patch version. - `ParquetDatasourceParams` and `ContribError` are `#[non_exhaustive]` so additive changes (new fields / variants) are minor bumps, not breaks. Use `ParquetDatasourceParams::new(...)` + `with_*` setters rather than struct-literal @@ -377,26 +471,11 @@ path — convenient for editor tooling. The file is gitignored. The contrib's `Cargo.toml` adds `prost-build` to `[build-dependencies]` and `prost` to `[dependencies]`. -### Proto, JVM side — handling Comet's protobuf shade - -This is the single trickiest piece of the JVM build. **Read carefully.** - -`comet-spark` shades `com.google.protobuf` under `${comet.shade.packageName}.protobuf` -(value: `org.apache.comet.shaded.protobuf`). The shading is applied in `spark/pom.xml`'s -`maven-shade-plugin` execution — it is **NOT inherited** by other modules through -`pluginManagement`. So when `OperatorOuterClass.ContribOp.Builder` is compiled into -the published `comet-spark.jar`, its `setPayload(ByteString)` signature references the -shaded type `org.apache.comet.shaded.protobuf.ByteString`. 
A contrib JAR that ships -unshaded `com.google.protobuf.ByteString` references (the default output of -`protoc-jar-maven-plugin`) will fail at runtime with `NoSuchMethodError` the first time -it calls `setPayload(myMessage.toByteString())`. +### Proto, JVM side -The contrib pom must therefore: - -1. Generate Java proto classes via `protoc-jar-maven-plugin`. -2. Run its own `maven-shade-plugin` execution that relocates the same package the - parent declares (`${comet.shade.packageName}.protobuf`), so the contrib's generated - `ByteString` / `Message` references match the shaded comet-spark surface at runtime. +Add `protoc-jar-maven-plugin` to your contrib `pom.xml`, pointing at your `.proto` +schema. Generated Java classes end up under `target/generated-sources/protobuf/java/` +and get compiled into the contrib's JAR by the inherited `scala-maven-plugin`: ```xml @@ -404,7 +483,6 @@ The contrib pom must therefore: com.google.protobuf protobuf-java ${protobuf.version} - @@ -427,43 +505,15 @@ The contrib pom must therefore: - - org.apache.maven.plugins - maven-shade-plugin - - - package - shade - - false - true - - - com.google.protobuf:protobuf-java - - - - - com.google.protobuf - ${comet.shade.packageName}.protobuf - - - - - - ``` -The relocation pattern MUST be `${comet.shade.packageName}.protobuf` (matching the -parent pom's property) — if you hardcode `org.apache.comet.shaded.protobuf` it works -today but breaks the moment Comet's build renames the shade prefix. - -`contrib/example/` does NOT exercise this path because its Scala side never builds a -`ContribOp` — the example only validates dispatch wiring. The first real-format -contrib in the tree will be where this section's snippets are first exercised -end-to-end against CI. 
+**Shading is handled automatically.** When the `contrib-` profile on +`spark/pom.xml` bundles your contrib into `comet-spark.jar`, the inherited shade +execution relocates `com.google.protobuf` to `${comet.shade.packageName}.protobuf` +across both your classes and `comet-spark`'s. Don't add your own `maven-shade-plugin` +execution to the contrib pom; that would shade twice and break the runtime types. ### Building a `ContribOp` envelope @@ -742,37 +792,19 @@ classpath). Discovery is **lazy** — triggered the first time `CometScanRule._a `--jars`-injected JARs are on the classpath, so order-of-arrival inside the driver JVM is not a concern. -## Maven JAR packaging + version pinning +## Maven packaging -The example contrib ships a thin JAR with no shading. Real contribs SHOULD prefer thin -JARs too. If your contrib must include a third-party library that conflicts with the -user's classpath, shade the conflicting classes under your contrib's package prefix -(`org.apache.comet.contrib..shaded.*`) so classloader collisions stay local. -Do **not** shade `comet-spark` or its transitive dependencies — those are `provided` -scope and the user supplies them. - -`comet-spark`'s shading of `com.google.protobuf` is the one external dep that does -need attention: generated Java classes from your `.proto` reference the shaded -package, which is handled automatically when you use the parent pom's plugin -configuration (the contrib pom inherits the same `` property). - -### Version pinning - -`comet-spark` is `provided` in your contrib's pom. Pin the dependency to -the exact Comet patch version your contrib was tested against: - -```xml - - org.apache.datafusion - comet-spark-spark${spark.version.short}_${scala.binary.version} - 0.17.0 - provided - -``` +Contribs are in-tree only — they ship as part of Comet's release. 
The contrib's +Maven module produces a standalone JAR (built unconditionally so the workspace stays +consistent), but the JAR is **not deployed**: `maven.deploy.skip=true` inherits from +the parent pom. The contrib's classes reach users through `comet-spark.jar`, which +bundles them via the `contrib-` profile on `spark/pom.xml`. -In-tree contribs use `${project.version}`; out-of-tree contribs use the explicit Comet -version they were built against. A contrib built against Comet `0.17.x` is not -guaranteed runtime-compatible with Comet `0.18.x` — the SPI is alpha. +If your contrib pulls in a third-party library, declare the dep in your contrib's pom +in `compile` scope (no `provided` — the contrib's classes go through the same shade +execution as core's, and any deps the contrib pulls need to be visible to that shade). +Avoid third-party deps where you can; the more your contrib drags in, the more +likely the shade hits a relocation collision with `comet-spark`'s own includes. ### Multi-Spark-version support diff --git a/pom.xml b/pom.xml index 685e474d59..5778508553 100644 --- a/pom.xml +++ b/pom.xml @@ -39,13 +39,13 @@ under the License. spark spark-integration - contrib/example diff --git a/spark/pom.xml b/spark/pom.xml index d3c18ccf87..98e9a8c6fe 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -351,6 +351,109 @@ under the License. 
+ + + + contrib-example + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-contrib-example-source + generate-sources + add-source + + + ../contrib/example/src/main/scala + + + + + add-contrib-example-test-source + generate-test-sources + add-test-source + + + ../contrib/example/src/test/scala + + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-contrib-example-resources + process-resources + copy-resources + + ${project.build.outputDirectory} + + + ../contrib/example/src/main/resources + + + + + + + + + com.github.os72 + protoc-jar-maven-plugin + + + generate-contrib-example-proto + generate-sources + run + + com.google.protobuf:protoc:${protobuf.version} + + ../contrib/example/native/src/proto + + + + + + + + @@ -435,6 +538,16 @@ under the License. ${comet.shade.packageName}.guava.thirdparty + + + + diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index 66a8861e59..90de30c515 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -27,20 +27,23 @@ import scala.jdk.CollectionConverters._ import org.apache.spark.internal.Logging /** - * Process-wide singleton that discovers and exposes contrib extensions found on the classpath via - * `java.util.ServiceLoader`. + * Process-wide singleton that exposes the contrib extensions bundled into comet-spark.jar. * - * Discovery happens once per JVM, idempotent: the first `load()` call enumerates every - * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` and - * `META-INF/services/org.apache.comet.spi.CometOperatorSerdeExtension` resource on the Comet - * classloader. Subsequent calls are no-ops. + * Discovery uses `java.util.ServiceLoader` against `META-INF/services/` entries inside + * comet-spark.jar. 
Those entries get there at build time: each contrib (under `contrib//`) + * carries its own `META-INF/services/` files, and the `contrib-` Maven profile on + * spark/pom.xml shades the contrib's classes plus those service entries into the published + * comet-spark.jar. A vanilla `mvn install` produces a comet-spark.jar with zero contribs; a + * `mvn install -Pcontrib-example` build bundles the example contrib. The native side mirrors + * this exactly via `--features contrib-example` on the Rust core crate. * - * `load()` is invoked lazily from `CometScanRule._apply` and `CometExecRule._apply` the first - * time either rule runs against a Comet-enabled session. Spark sessions that never enable Comet - * pay zero ServiceLoader cost. + * Discovery is idempotent: the first `load()` call enumerates the service entries; subsequent + * calls are no-ops. `load()` is invoked lazily from `CometScanRule._apply` and + * `CometExecRule._apply` the first time either rule runs against a Comet-enabled session. + * Spark sessions that never enable Comet pay zero ServiceLoader cost. * - * Failures to instantiate individual extensions are logged but do NOT fail Comet startup -- a - * misconfigured contrib JAR shouldn't take down the whole Spark session. + * Failures to instantiate individual extensions are logged at WARN but do NOT fail Comet + * startup -- a misconfigured contrib shouldn't take down the whole Spark session. */ object CometExtensionRegistry extends Logging { @@ -82,13 +85,14 @@ object CometExtensionRegistry extends Logging { s"serde=[${newSerdeExts.map(_.name).mkString(", ")}]") detectDuplicateSerdeClasses(newSerdeExts) } else { - // Positive signal that discovery ran. Without this line a user with a misconfigured - // contrib JAR (missing META-INF/services, or the JAR not on any classloader Comet - // can see) gets no diagnostic and silently loses contrib functionality. + // Positive signal that discovery ran. 
Comet-spark.jar's contrib content depends on + // which `-Pcontrib-` Maven profiles were active at build time; this line is + // what tells a user whose contrib went missing whether to suspect their Comet build + // or their classpath. logInfo( - "Comet contrib extensions: none discovered on classpath " + - "(no META-INF/services entries for CometScanRuleExtension or " + - "CometOperatorSerdeExtension)") + "Comet contrib extensions: none discovered. comet-spark.jar was built " + + "without any contrib profiles enabled, or the contrib's META-INF/services " + + "entries were not bundled correctly.") } } From c7656fcc6241aa724deb5a9ff34ffe2f65c8b8e7 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Thu, 14 May 2026 22:11:51 -0400 Subject: [PATCH 18/27] refactor(contrib): deps-only pom per contrib + ArcSwap registry Q1 (external deps): the previous source-injection-only refactor lost the encapsulation needed for contribs that pull in external Maven deps like delta-spark. Reintroduce a per-contrib pom.xml but as a deps-only artifact (`pom`, no code, no JAR). The contrib pom enumerates external deps; spark/pom.xml's `contrib-` profile depends on it via `pom` to pull those deps transitively onto comet-spark's classpath. No reactor cycle: the deps pom has no `` on comet-spark; it's a leaf list of external deps. contrib/example/pom.xml is the template -- its `` block is empty (the example has no external deps) but the file demonstrates the pattern that a real Delta contrib would use to pull in delta-spark. Q2 (registry primitive): swap RwLock for ArcSwap in comet-contrib-spi's registry. Reads on the dispatch hot path drop from "acquire RwLock read guard + drop" to "atomic load + ref-count bump"; there was never any meaningful reason to make readers interact with a lock since writes happen only during library init (sequential, single-threaded #[ctor] calls). Public API unchanged; all 7 SPI tests still pass. 
ScopedContribPlannerRegistration and _clear_for_test reworked to use rcu / atomic store respectively. Audit of other concurrency / perf hot spots: no other meaningful issues found. Per-dispatch Arc::clone is already optimal (single atomic refcount bump). CometExtensionRegistry's `synchronized` load() runs once. CometExecRule's mergedSerdes lookup is O(1). The preTransform corruption guard is O(K * (P + S)) per plan with K typically 1-3 -- microseconds, real safety value, keep. Verified: cargo test -p comet-contrib-spi passes (7 tests); maven profile contrib-example activates cleanly without reactor cycle. Co-Authored-By: Claude Opus 4.7 (1M context) --- contrib/example/pom.xml | 56 ++++++++ .../contributor-guide/contrib-extensions.md | 120 +++++++----------- native/Cargo.lock | 1 + native/contrib-spi/Cargo.toml | 3 + native/contrib-spi/src/lib.rs | 92 ++++++++------ pom.xml | 19 ++- spark/pom.xml | 15 +++ 7 files changed, 184 insertions(+), 122 deletions(-) create mode 100644 contrib/example/pom.xml diff --git a/contrib/example/pom.xml b/contrib/example/pom.xml new file mode 100644 index 0000000000..6f67ac682d --- /dev/null +++ b/contrib/example/pom.xml @@ -0,0 +1,56 @@ + + + + + + 4.0.0 + + org.apache.datafusion + comet-parent-spark${spark.version.short}_${scala.binary.version} + 0.17.0-SNAPSHOT + ../../pom.xml + + + + comet-contrib-example-deps-spark${spark.version.short}_${scala.binary.version} + comet-contrib-example-deps + pom + + + + + diff --git a/docs/source/contributor-guide/contrib-extensions.md b/docs/source/contributor-guide/contrib-extensions.md index 032a17c4a3..e119840b58 100644 --- a/docs/source/contributor-guide/contrib-extensions.md +++ b/docs/source/contributor-guide/contrib-extensions.md @@ -57,13 +57,17 @@ writes into the proto. ## Required files (mirror `contrib/example/` exactly) -A contrib is a directory of sources, **not a Maven module**. No `pom.xml`. The contrib's +A contrib is a directory of sources plus a deps-only Maven pom. 
The contrib's Scala/Java sources are pulled into `comet-spark`'s compile by a profile on `spark/pom.xml`; the contrib's Rust sources are pulled into `libcomet` by a Cargo -feature on `native/core`. The directory layout: +feature on `native/core`. The `pom.xml` exists solely to enumerate external Maven +deps (e.g., `io.delta:delta-spark` for a Delta contrib); it does NOT produce code +and does NOT depend on `comet-spark` (those two together would create a Maven +reactor cycle). ``` contrib// + pom.xml ← pom; declares external Maven deps only src/main/scala/org/apache/comet/contrib// .scala ← CometScanRuleExtension / CometOperatorSerdeExtension impl src/main/resources/META-INF/services/ @@ -79,6 +83,12 @@ contrib// src/generated/ ← (gitignored) prost-build output ``` +The `pom.xml` is a `pom` with one job: list the contrib's +external Maven deps. A Delta contrib's pom would carry `` entries for +`io.delta:delta-spark`. `spark/pom.xml`'s `contrib-` profile depends on this +deps-pom via `pom`, which transitively resolves the listed deps onto +comet-spark's classpath. + Plus a handful of build-config edits (collected under "Wiring into core", below). ### Prerequisites @@ -111,83 +121,39 @@ breaks the workspace lookup; place the contrib at the documented depth. ## Wiring into core -Four edits, two per side: +Five edits, three per side (JVM) + two (native): ### JVM side -1. **`spark/pom.xml`** — add a `contrib-` profile under ``. The - `contrib-example` profile is the copy-this template. The profile uses - `build-helper-maven-plugin` to add the contrib's source/test directories, - `maven-resources-plugin` to merge in `META-INF/services` entries, and - `protoc-jar-maven-plugin` to generate the contrib's Java protos: +1. **Root `pom.xml`** — add `contrib/` so Maven always builds + the contrib's deps-pom. The pom is tiny (no code, no JAR — just `pom`). +2. **`contrib//pom.xml`** — create a `pom` file enumerating + your external Maven deps. 
Copy `contrib/example/pom.xml` as the template; the + example's `` block is empty (no external deps needed). A Delta-style + contrib would add e.g.: ```xml - - contrib- - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-contrib--source - generate-sources - add-source - - ../contrib//src/main/scala - - - - add-contrib--test-source - generate-test-sources - add-test-source - - ../contrib//src/test/scala - - - - - - org.apache.maven.plugins - maven-resources-plugin - - - copy-contrib--resources - process-resources - copy-resources - - ${project.build.outputDirectory} - - ../contrib//src/main/resources - - - - - - - com.github.os72 - protoc-jar-maven-plugin - - - generate-contrib--proto - generate-sources - run - - com.google.protobuf:protoc:${protobuf.version} - - ../contrib//native/src/proto - - - - - - - - + + + io.delta + delta-spark_${scala.binary.version} + 3.3.2 + provided + + ``` - No additions to the parent `pom.xml`'s `` — contribs are not Maven modules. + Use `provided` for deps the user supplies on their Spark classpath; + `compile` if the contrib ships them itself (shaded into comet-spark + via the inherited shade execution). + +3. **`spark/pom.xml`** — add a `contrib-` profile under ``. Copy the + `contrib-example` profile as the template. The profile (a) depends on the contrib's + deps-pom via `pom`, (b) uses `build-helper-maven-plugin` to add the + contrib's source/test directories, (c) uses `maven-resources-plugin` to merge in + `META-INF/services` entries, and (d) uses `protoc-jar-maven-plugin` to generate + the contrib's Java protos. See `contrib/example`'s entry in `spark/pom.xml` for + the verbatim block to copy. ### Native side @@ -880,10 +846,12 @@ size you need and the use case — the cap is a guardrail, not a feature. ## Registry implementation note -The native contrib planner registry is currently a `RwLock>>`. -Lookups happen once per `ContribOp` plan call; writes happen only during library init. 
-The implementation may switch to a lock-free primitive (`ArcSwap`) in a future release -if profiling shows the read path matters; the public API stays unchanged either way. +The native contrib planner registry uses `ArcSwap>>` — +lock-free for readers, RCU swap for writers. Reads on the `ContribOp` dispatch hot +path are a single atomic load plus an `Arc` ref-count bump; there is no +reader-writer contention because writes happen exclusively during library init +(sequential `#[ctor]` registrations, no concurrent writers). Contribs never call +the registry primitives directly. ## See also diff --git a/native/Cargo.lock b/native/Cargo.lock index 289d1ff095..187a75665a 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1502,6 +1502,7 @@ dependencies = [ name = "comet-contrib-spi" version = "0.17.0" dependencies = [ + "arc-swap", "datafusion", "datafusion-comet-proto", "log", diff --git a/native/contrib-spi/Cargo.toml b/native/contrib-spi/Cargo.toml index 29fde99b5c..fbead2c17e 100644 --- a/native/contrib-spi/Cargo.toml +++ b/native/contrib-spi/Cargo.toml @@ -32,6 +32,9 @@ datafusion-comet-proto = { workspace = true } # Surface the `Path` type on the SPI's prepare_object_store return value. object_store = { workspace = true } log = "0.4" +# Lock-free registry primitive. Reads (per ContribOp dispatch, hot path) are one atomic +# load + ref-count bump; writes (per contrib's #[ctor] at lib init) are an RCU swap. +arc-swap = "1" [features] # Off by default. 
When enabled, the crate exposes `ScopedContribPlannerRegistration` and diff --git a/native/contrib-spi/src/lib.rs b/native/contrib-spi/src/lib.rs index f92c6dde93..bf5348fa2c 100644 --- a/native/contrib-spi/src/lib.rs +++ b/native/contrib-spi/src/lib.rs @@ -36,9 +36,11 @@ use std::{ collections::HashMap, - sync::{Arc, OnceLock, RwLock}, + sync::{Arc, OnceLock}, }; +use arc_swap::ArcSwap; + use datafusion::{ arrow::datatypes::SchemaRef, common::ScalarValue, @@ -279,10 +281,18 @@ impl std::fmt::Display for ContribError { impl std::error::Error for ContribError {} /// Process-wide registry of contrib operator planners, keyed by `ContribOp.kind`. -fn registry() -> &'static RwLock>> { - static REGISTRY: OnceLock>>> = - OnceLock::new(); - REGISTRY.get_or_init(|| RwLock::new(HashMap::new())) +/// +/// `ArcSwap>` gives lock-free reads (one atomic load + Arc ref-count bump) +/// on the dispatch hot path. Writes happen exclusively during library init from +/// `#[ctor]`s (sequential, single-threaded) and use `rcu` to clone the map, update the copy, and atomically swap it in. +/// The init-once / read-many access pattern is exactly what `ArcSwap` is designed for; +/// the previous `RwLock` would have introduced reader-writer contention for +/// no gain since there are effectively no concurrent writes. +type RegistryMap = HashMap>; + +fn registry() -> &'static ArcSwap { + static REGISTRY: OnceLock> = OnceLock::new(); + REGISTRY.get_or_init(|| ArcSwap::from_pointee(HashMap::new())) } /// Register a contrib operator planner under the given `kind` identifier.
Last-write-wins @@ -293,33 +303,29 @@ pub fn register_contrib_planner( planner: Arc, ) { let kind = kind.into(); - let mut guard = registry() - .write() - .expect("contrib planner registry poisoned"); - if guard.contains_key(&kind) { - log::warn!( - "register_contrib_planner: replacing existing planner for kind={kind:?}; \ - second registration usually indicates a misconfigured test harness" - ); - } - guard.insert(kind, planner); + registry().rcu(|current| { + let mut new_map: RegistryMap = (**current).clone(); + if new_map.contains_key(&kind) { + log::warn!( + "register_contrib_planner: replacing existing planner for kind={kind:?}; \ + second registration usually indicates a misconfigured test harness" + ); + } + new_map.insert(kind.clone(), Arc::clone(&planner)); + new_map + }); } /// Look up the contrib planner registered for `kind`, or `None` if no contrib is loaded /// for that operator. Core's dispatcher uses this to route `OpStruct::ContribOp` payloads. pub fn lookup_contrib_planner_by_kind(kind: &str) -> Option> { - let guard = registry() - .read() - .expect("contrib planner registry poisoned"); - guard.get(kind).cloned() + registry().load().get(kind).cloned() } /// Return a snapshot of all registered contrib kinds, for diagnostics and tests. pub fn registered_contrib_kinds() -> Vec { - let guard = registry() - .read() - .expect("contrib planner registry poisoned"); - let mut kinds: Vec = guard.keys().cloned().collect(); + let snapshot = registry().load(); + let mut kinds: Vec = snapshot.keys().cloned().collect(); kinds.sort(); kinds } @@ -343,10 +349,14 @@ impl ScopedContribPlannerRegistration { /// previously-registered planner (if any) is restored on drop. 
pub fn new(kind: impl Into, planner: Arc) -> Self { let kind = kind.into(); - let mut guard = registry() - .write() - .expect("contrib planner registry poisoned"); - let previous = guard.insert(kind.clone(), planner); + // Snapshot the previous binding BEFORE the rcu so retries (under contention) don't + // observe our own write as the previous value. + let previous = registry().load().get(&kind).cloned(); + registry().rcu(|current| { + let mut new_map: RegistryMap = (**current).clone(); + new_map.insert(kind.clone(), Arc::clone(&planner)); + new_map + }); Self { kind, previous } } } @@ -354,17 +364,20 @@ impl ScopedContribPlannerRegistration { #[cfg(any(test, feature = "test-utils"))] impl Drop for ScopedContribPlannerRegistration { fn drop(&mut self) { - let mut guard = registry() - .write() - .expect("contrib planner registry poisoned"); - match self.previous.take() { - Some(prev) => { - guard.insert(self.kind.clone(), prev); - } - None => { - guard.remove(&self.kind); + let kind = std::mem::take(&mut self.kind); + let previous = self.previous.take(); + registry().rcu(|current| { + let mut new_map: RegistryMap = (**current).clone(); + match &previous { + Some(prev) => { + new_map.insert(kind.clone(), Arc::clone(prev)); + } + None => { + new_map.remove(&kind); + } } - } + new_map + }); } } @@ -373,10 +386,7 @@ impl Drop for ScopedContribPlannerRegistration { /// removes the entries every other concurrent test depends on. #[cfg(any(test, feature = "test-utils"))] pub fn _clear_for_test() { - let mut guard = registry() - .write() - .expect("contrib planner registry poisoned"); - guard.clear(); + registry().store(Arc::new(HashMap::new())); } #[cfg(test)] diff --git a/pom.xml b/pom.xml index 5778508553..74dc5720c1 100644 --- a/pom.xml +++ b/pom.xml @@ -39,13 +39,22 @@ under the License. 
spark spark-integration + contrib/example diff --git a/spark/pom.xml b/spark/pom.xml index 98e9a8c6fe..f26f2f50bc 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -371,6 +371,21 @@ under the License. --> contrib-example + + + + org.apache.datafusion + comet-contrib-example-deps-spark${spark.version.short}_${scala.binary.version} + ${project.version} + pom + + + **/native/src/generated/** + **/META-INF/services/** benchmarks/tpc/queries/** .claude/** diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index 94d7465938..9ffd94a635 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -357,9 +357,20 @@ case class CometExecRule(session: SparkSession) // that aren't in `allExecs`, so this merge never overrides a core mapping in // practice; duplicate-class detection at load() time logs a warning if it // does happen. + // Three-step dispatch: + // 1. core's built-in class-keyed map (allExecs) + // 2. contrib serde-extensions' class-keyed map (mergedSerdes) + // 3. contrib serde-extensions' predicate-based matchOperator hook + // (for marker-class patterns where one shared SparkPlan class -- + // e.g. 
CometScanExec -- is disambiguated by a runtime tag) val handler = allExecs .get(op.getClass) .orElse(CometExtensionRegistry.mergedSerdes.get(op.getClass)) + .orElse { + CometExtensionRegistry.serdeExtensions.iterator + .flatMap(_.matchOperator(op)) + .nextOption() + } .map(_.asInstanceOf[CometOperatorSerde[SparkPlan]]) handler match { case Some(handler) => diff --git a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala index 52621bdc8a..64d555a80f 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala @@ -780,33 +780,14 @@ case class CometScanRule(session: SparkSession) case _ => false } + // Delegate to the companion object's pure helper so the implementation lives in one + // place. Kept as a class-level method so existing in-class callers (transformV1Scan, + // transformV2Scan) compile unchanged. private def isSchemaSupported( scanExec: FileSourceScanExec, scanImpl: String, - r: HadoopFsRelation): Boolean = { - val fallbackReasons = new ListBuffer[String]() - val typeChecker = CometScanTypeChecker(scanImpl) - val schemaSupported = - typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons) - if (!schemaSupported) { - withInfo( - scanExec, - s"Unsupported schema ${scanExec.requiredSchema} " + - s"for $scanImpl: ${fallbackReasons.mkString(", ")}") - return false - } - val partitionSchemaSupported = - typeChecker.isSchemaSupported(r.partitionSchema, fallbackReasons) - if (!partitionSchemaSupported) { - withInfo( - scanExec, - s"Unsupported partitioning schema ${scanExec.requiredSchema} " + - s"for $scanImpl: ${fallbackReasons - .mkString(", ")}") - return false - } - true - } + r: HadoopFsRelation): Boolean = + CometScanRule.isSchemaSupported(scanExec, scanImpl, r) } case class CometScanTypeChecker(scanImpl: String) extends DataTypeSupport with CometTypeShim { @@ -846,6 +827,39 @@ 
case class CometScanTypeChecker(scanImpl: String) extends DataTypeSupport with C object CometScanRule extends Logging { + /** + * Schema-support check + fallback-reason emission, callable from contrib extensions under + * `org.apache.comet.contrib.*`. Pure function; no shared state with CometScanRule instances. + * `private[comet]` keeps it out of the public API while letting subpackages (contribs) reach + * it. + */ + private[comet] def isSchemaSupported( + scanExec: FileSourceScanExec, + scanImpl: String, + r: HadoopFsRelation): Boolean = { + val fallbackReasons = new ListBuffer[String]() + val typeChecker = CometScanTypeChecker(scanImpl) + val schemaSupported = + typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons) + if (!schemaSupported) { + org.apache.comet.CometSparkSessionExtensions.withInfo( + scanExec, + s"Unsupported schema ${scanExec.requiredSchema} " + + s"for $scanImpl: ${fallbackReasons.mkString(", ")}") + return false + } + val partitionSchemaSupported = + typeChecker.isSchemaSupported(r.partitionSchema, fallbackReasons) + if (!partitionSchemaSupported) { + org.apache.comet.CometSparkSessionExtensions.withInfo( + scanExec, + s"Unsupported partitioning schema ${scanExec.requiredSchema} " + + s"for $scanImpl: ${fallbackReasons.mkString(", ")}") + return false + } + true + } + /** * Tag set on a scan (`FileSourceScanExec` or `BatchScanExec`) that should be left as a plain * Spark scan rather than converted to a Comet scan. Written by diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index 90de30c515..10e76bf36a 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -33,17 +33,17 @@ import org.apache.spark.internal.Logging * comet-spark.jar. 
Those entries get there at build time: each contrib (under `contrib//`) * carries its own `META-INF/services/` files, and the `contrib-` Maven profile on * spark/pom.xml shades the contrib's classes plus those service entries into the published - * comet-spark.jar. A vanilla `mvn install` produces a comet-spark.jar with zero contribs; a - * `mvn install -Pcontrib-example` build bundles the example contrib. The native side mirrors - * this exactly via `--features contrib-example` on the Rust core crate. + * comet-spark.jar. A vanilla `mvn install` produces a comet-spark.jar with zero contribs; a `mvn + * install -Pcontrib-example` build bundles the example contrib. The native side mirrors this + * exactly via `--features contrib-example` on the Rust core crate. * * Discovery is idempotent: the first `load()` call enumerates the service entries; subsequent * calls are no-ops. `load()` is invoked lazily from `CometScanRule._apply` and - * `CometExecRule._apply` the first time either rule runs against a Comet-enabled session. - * Spark sessions that never enable Comet pay zero ServiceLoader cost. + * `CometExecRule._apply` the first time either rule runs against a Comet-enabled session. Spark + * sessions that never enable Comet pay zero ServiceLoader cost. * - * Failures to instantiate individual extensions are logged at WARN but do NOT fail Comet - * startup -- a misconfigured contrib shouldn't take down the whole Spark session. + * Failures to instantiate individual extensions are logged at WARN but do NOT fail Comet startup + * -- a misconfigured contrib shouldn't take down the whole Spark session. 
*/ object CometExtensionRegistry extends Logging { @@ -71,12 +71,14 @@ object CometExtensionRegistry extends Logging { val newScanExts = loadOne[CometScanRuleExtension]("CometScanRuleExtension") val newSerdeExts = loadOne[CometOperatorSerdeExtension]("CometOperatorSerdeExtension") val newMerged = newSerdeExts.flatMap(_.serdes).toMap + val newNativeParquetTags = newSerdeExts.flatMap(_.nativeParquetScanImpls).toSet // Publish the @volatile fields BEFORE flipping `loaded` so other threads either see // the empty defaults (and may re-enter -- benign, blocked by the monitor) or the // fully-populated state (and may skip -- also benign). scanExts = newScanExts serdeExts = newSerdeExts mergedSerdesCache = newMerged + nativeParquetScanImplsCache = newNativeParquetTags loaded.set(true) if (newScanExts.nonEmpty || newSerdeExts.nonEmpty) { logInfo( @@ -107,12 +109,23 @@ object CometExtensionRegistry extends Logging { * the contrib uses for class-keyed dispatch in `CometExecRule`. Computed once at `load()` time; * an empty map until `load()` has run. */ - def mergedSerdes: Map[Class[_ <: org.apache.spark.sql.execution.SparkPlan], + def mergedSerdes: Map[ + Class[_ <: org.apache.spark.sql.execution.SparkPlan], org.apache.comet.serde.CometOperatorSerde[_]] = mergedSerdesCache - @volatile private var mergedSerdesCache - : Map[Class[_ <: org.apache.spark.sql.execution.SparkPlan], - org.apache.comet.serde.CometOperatorSerde[_]] = Map.empty + @volatile private var mergedSerdesCache: Map[ + Class[_ <: org.apache.spark.sql.execution.SparkPlan], + org.apache.comet.serde.CometOperatorSerde[_]] = Map.empty + + /** + * Union of every registered extension's `nativeParquetScanImpls`. Consumed by + * `CometScanExec.supportedDataFilters` to decide whether the marker scan's filter set should + * get the same native-parquet exclusions as `SCAN_NATIVE_DATAFUSION`. Computed once at `load()` + * time; empty until `load()` has run. 
+ */ + def nativeParquetScanImpls: Set[String] = nativeParquetScanImplsCache + + @volatile private var nativeParquetScanImplsCache: Set[String] = Set.empty /** * Log a warning when two registered contribs claim the same `Class[_ <: SparkPlan]` for serde @@ -126,7 +139,9 @@ object CometExtensionRegistry extends Logging { */ private def detectDuplicateSerdeClasses(exts: Seq[CometOperatorSerdeExtension]): Unit = { val perClassOwners = scala.collection.mutable.Map - .empty[Class[_ <: org.apache.spark.sql.execution.SparkPlan], scala.collection.mutable.ArrayBuffer[String]] + .empty[ + Class[_ <: org.apache.spark.sql.execution.SparkPlan], + scala.collection.mutable.ArrayBuffer[String]] exts.foreach { ext => ext.serdes.keys.foreach { cls => perClassOwners @@ -149,10 +164,10 @@ object CometExtensionRegistry extends Logging { * Test-only: reset the registry to the empty state. Lets unit tests re-run discovery with a * different classpath / overridden services. Not for production use. * - * Visibility is `public` (rather than `private[comet]`) because contribs are not required to - * be packaged under `org.apache.comet.*`; a contrib living under e.g. `io.delta.comet.contrib` - * must still be able to reset between tests. The method's name carries the "test-only" - * contract by convention. + * Visibility is `public` (rather than `private[comet]`) because contribs are not required to be + * packaged under `org.apache.comet.*`; a contrib living under e.g. `io.delta.comet.contrib` + * must still be able to reset between tests. The method's name carries the "test-only" contract + * by convention. */ def resetForTesting(): Unit = synchronized { // synchronized so concurrent `load()` callers don't observe torn state -- e.g. 
@@ -162,6 +177,7 @@ object CometExtensionRegistry extends Logging { scanExts = Seq.empty serdeExts = Seq.empty mergedSerdesCache = Map.empty + nativeParquetScanImplsCache = Set.empty } private def loadOne[T](label: String)(implicit ct: scala.reflect.ClassTag[T]): Seq[T] = { diff --git a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala index 9b180523ef..d66ef48d06 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala @@ -49,5 +49,44 @@ trait CometOperatorSerdeExtension { * Convention: each contrib's mapping should reference only classes the contrib itself defines, * so two contribs never claim ownership of the same operator class. */ - def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] + def serdes: Map[Class[_ <: SparkPlan], CometOperatorSerde[_]] = Map.empty + + /** + * Predicate-based dispatch hook for contribs whose serde key cannot be expressed as a unique + * `SparkPlan` class. The canonical case is the `CometScanExec` marker-with-`scanImpl`-tag + * pattern: a contrib's `CometScanRuleExtension.transformV1` returns `CometScanExec(scanExec, + * session, "my-contrib-tag")`, but `CometScanExec` is a case class shared with core, so a + * class-keyed map can't disambiguate by the tag. The contrib overrides this method to inspect + * the plan and return its serde: + * + * {{{ + * private val MyScanImpl = "native_myformat_compat" // contrib-local constant + * + * override def matchOperator(op: SparkPlan): Option[CometOperatorSerde[_]] = op match { + * case s: CometScanExec if s.scanImpl == MyScanImpl => Some(CometMyFormatScan) + * case _ => None + * } + * }}} + * + * `CometExecRule` consults `matchOperator` only after the class-keyed `serdes` map misses, so + * contribs with a unique exec class never need to implement this. 
Multiple registered + * extensions' `matchOperator` returns are tried in registration order; the first `Some` wins. + */ + def matchOperator(op: SparkPlan): Option[CometOperatorSerde[_]] = None + + /** + * Declares which `scanImpl` string tags this contrib produces from + * `CometScanRuleExtension.transformV1` when using the `CometScanExec(marker, scanImpl=X)` + * pattern. Tags listed here get `CometScanExec.supportedDataFilters`'s native-parquet filter + * exclusions (drop dynamic pruning + IsNull/IsNotNull on ArrayType columns), the same treatment + * `SCAN_NATIVE_DATAFUSION` receives. + * + * Override only if your contrib uses the marker-class pattern AND your native side goes through + * Comet's tuned `ParquetSource`. Contribs that define their own `SparkPlan` subclass (rather + * than reusing `CometScanExec`) don't need this; they control filter selection themselves. + * + * Example: a Delta contrib that uses `CometScanExec(..., scanImpl="native_delta_compat")` would + * override this to `Set("native_delta_compat")`. + */ + def nativeParquetScanImpls: Set[String] = Set.empty } diff --git a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala index db57d17eb2..9c273bf47f 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometScanRuleExtension.scala @@ -31,12 +31,12 @@ import org.apache.spark.sql.execution.datasources.v2.BatchScanExec * * `CometScanRule` discovers implementations via `CometExtensionRegistry.scanExtensions` * (ServiceLoader-backed) and offers each candidate scan to every registered extension in - * registration order. The first extension whose [[matchesV1]] (or [[matchesV2]]) returns true - * AND whose [[transformV1]] (or [[transformV2]]) returns `Some(_)` wins -- its returned plan - * replaces the scan subtree. 
An extension whose `matches` is true but whose `transform` returns - * `None` is treated as "declined this instance"; dispatch continues to the next matching - * extension. After every matching extension has declined, core's built-in file-format dispatch - * handles the scan as before. + * registration order. The first extension whose [[matchesV1]] (or [[matchesV2]]) returns true AND + * whose [[transformV1]] (or [[transformV2]]) returns `Some(_)` wins -- its returned plan replaces + * the scan subtree. An extension whose `matches` is true but whose `transform` returns `None` is + * treated as "declined this instance"; dispatch continues to the next matching extension. After + * every matching extension has declined, core's built-in file-format dispatch handles the scan as + * before. * * Contribs are discovered via the standard Java ServiceLoader. Each contrib JAR ships a * `META-INF/services/org.apache.comet.spi.CometScanRuleExtension` resource listing its extension @@ -56,16 +56,16 @@ trait CometScanRuleExtension { * Tree-level pre-pass run once per plan before per-scan dispatch begins. Default: identity. * * Use this to undo wrapper rewrites that a format's own Catalyst strategy applied. The - * canonical example is Delta's `PreprocessTableWithDVs` strategy, which wraps every - * DV-bearing Delta scan in a `Project(Filter(...))` subtree referencing a synthetic - * `__delta_internal_is_row_deleted` column produced by Delta's own reader. Comet reads via - * its own parquet path; without unwrapping that subtree, the synthetic column never gets - * produced and the downstream `Filter` silently drops every row. The Delta contrib's - * `preTransform` strips the wrapper so the clean scan reaches per-scan dispatch. + * canonical example is Delta's `PreprocessTableWithDVs` strategy, which wraps every DV-bearing + * Delta scan in a `Project(Filter(...))` subtree referencing a synthetic + * `__delta_internal_is_row_deleted` column produced by Delta's own reader. 
Comet reads via its + * own parquet path; without unwrapping that subtree, the synthetic column never gets produced + * and the downstream `Filter` silently drops every row. The Delta contrib's `preTransform` + * strips the wrapper so the clean scan reaches per-scan dispatch. * - * '''V1-only.''' `preTransform` runs once for the whole plan and the rewritten tree is - * what later `transformV1` calls see via their `plan` argument. `transformV2` does NOT - * receive a plan-tree reference -- only the matched `BatchScanExec`. V2 contribs that need + * '''V1-only.''' `preTransform` runs once for the whole plan and the rewritten tree is what + * later `transformV1` calls see via their `plan` argument. `transformV2` does NOT receive a + * plan-tree reference -- only the matched `BatchScanExec`. V2 contribs that need * wrapper-stripping must do that work inside `transformV2` against `scanExec.scan` / * `scanExec.children` directly. * @@ -73,18 +73,18 @@ trait CometScanRuleExtension { * fold when `spark.comet.scan.enabled=false`. A contrib's own wrappers (Delta's DV filter, * etc.) are load-bearing in that case; stripping them turns into a correctness bug. * - * '''MUST NOT modify scans the extension does not recognise.''' Multiple registered - * extensions are folded over the plan in registration order; an extension that rewrites - * scans outside its format's domain will silently corrupt other formats' plans. - * `CometScanRule` logs a warning when a `FileSourceScanExec` is replaced by an extension - * whose `matchesV1` returns false against the original scan's relation -- contribs that - * trip this warning should narrow their pattern match. + * '''MUST NOT modify scans the extension does not recognise.''' Multiple registered extensions + * are folded over the plan in registration order; an extension that rewrites scans outside its + * format's domain will silently corrupt other formats' plans. 
`CometScanRule` logs a warning + * when a `FileSourceScanExec` is replaced by an extension whose `matchesV1` returns false + * against the original scan's relation -- contribs that trip this warning should narrow their + * pattern match. * - * '''State sharing.''' Shared state between this pre-pass and later `transformV1` calls - * is the contrib's problem. The recommended pattern is to attach a Spark `TreeNodeTag` - * to nodes during `preTransform` and read it during `transformV1`. Spark's tag mechanism - * is tree-immutable-safe and survives plan transformations -- preferred over external - * mutable state which leaks across plans. + * '''State sharing.''' Shared state between this pre-pass and later `transformV1` calls is the + * contrib's problem. The recommended pattern is to attach a Spark `TreeNodeTag` to nodes during + * `preTransform` and read it during `transformV1`. Spark's tag mechanism is tree-immutable-safe + * and survives plan transformations -- preferred over external mutable state which leaks across + * plans. */ def preTransform(plan: SparkPlan, session: SparkSession): SparkPlan = plan @@ -101,10 +101,10 @@ trait CometScanRuleExtension { * Transform the matched V1 scan. Called only when `matchesV1` returned true. * * Returning `None` means "I matched the scan shape but ultimately can't accelerate this - * specific instance" -- `CometScanRule` then continues to the NEXT registered extension - * whose `matchesV1` is true, falling back to core's built-in file-format dispatch only - * after every matching extension has declined. Returning `Some(plan)` ends dispatch and - * replaces the scan subtree with `plan`. + * specific instance" -- `CometScanRule` then continues to the NEXT registered extension whose + * `matchesV1` is true, falling back to core's built-in file-format dispatch only after every + * matching extension has declined. Returning `Some(plan)` ends dispatch and replaces the scan + * subtree with `plan`. 
*/ def transformV1( plan: SparkPlan, @@ -122,10 +122,9 @@ trait CometScanRuleExtension { * Transform the matched V2 scan. Called only when `matchesV2` returned true. * * Same semantics as `transformV1`: `None` falls through to the next matching extension; - * `Some(plan)` ends dispatch. Note that unlike `transformV1`, this method does NOT - * receive a plan-tree reference -- `preTransform` rewrites are not visible here. V2 - * contribs that need wrapper-stripping must operate on `scanExec.scan` / - * `scanExec.children` directly. + * `Some(plan)` ends dispatch. Note that unlike `transformV1`, this method does NOT receive a + * plan-tree reference -- `preTransform` rewrites are not visible here. V2 contribs that need + * wrapper-stripping must operate on `scanExec.scan` / `scanExec.children` directly. */ def transformV2(scanExec: BatchScanExec, session: SparkSession): Option[SparkPlan] = None } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala index 652fdfc96d..7e9b2bfa80 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala @@ -159,7 +159,13 @@ case class CometScanExec( * on array columns (see [[isNullCheckOnArrayColumn]]). */ lazy val supportedDataFilters: Seq[Expression] = { - if (scanImpl == CometConf.SCAN_NATIVE_DATAFUSION) { + // Contribs that use the CometScanExec marker pattern with their own scanImpl + // string can declare that their scan goes through Comet's tuned ParquetSource + // (and therefore wants DataFusion-style filter exclusions) by registering the + // tag via `CometOperatorSerdeExtension.nativeParquetScanImpls`. Core doesn't + // need to know any contrib's marker name; the registry is the source of truth. 
+ if (scanImpl == CometConf.SCAN_NATIVE_DATAFUSION || + CometScanExec.contribNativeParquetScanImpls.contains(scanImpl)) { dataFilters .filterNot(isDynamicPruningFilter) .filterNot(isNullCheckOnArrayColumn) @@ -534,6 +540,17 @@ case class CometScanExec( object CometScanExec { + /** + * Set of contrib-registered scanImpl tags whose CometScanExec should use Comet's native-parquet + * filter exclusion semantics (drop dynamic pruning + IsNull/IsNotNull on ArrayType columns). + * Populated lazily from + * `CometExtensionRegistry.serdeExtensions.flatMap(_.nativeParquetScanImpls)`. Each access + * re-reads the volatile field on `CometExtensionRegistry`; the cost is one HashSet lookup per + * CometScanExec construction, which is dwarfed by Spark's own per-plan work. + */ + private[comet] def contribNativeParquetScanImpls: Set[String] = + org.apache.comet.spi.CometExtensionRegistry.nativeParquetScanImpls + def apply( scanExec: FileSourceScanExec, session: SparkSession, From e417211447a396f9a1d9b7ca76d89a48262dc853 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Fri, 15 May 2026 14:25:02 -0400 Subject: [PATCH 22/27] feat(contrib): partition-metadata SPI hook + matchOperator marker dispatch Two related additions to the contrib SPI surface, both driven by gaps the Delta regression exposed: 1. Generic per-partition metadata hook on `CometExecRDD` so contribs can populate executor-side thread-locals (e.g. `InputFileBlockHolder` for `input_file_name()` / `_metadata.file_path`) from their serialized per-partition payloads BEFORE the native iterator starts producing rows. Without this, Delta's UPDATE/DELETE/MERGE/CDC commands resolve `_metadata.file_path` to empty and throw `DELTA_FILE_TO_OVERWRITE_NOT_FOUND` for every touched file. Three pieces: - `CometExecRDD.PartitionMetadataHandler` type alias + `registerPartitionMetadataHandler` SPI. 
The signature takes the `Map[String, Array[Byte]]` data shape (NOT the spark-internal `CometExecPartition`), so contribs don't have to live under `org.apache.spark.*` to use it. - `CometExecRDD.compute()` invokes handlers after plan-data injection, before instantiating the native iterator. - `clearPartitionMetadataHandlers()` for test isolation; called from `CometExtensionRegistry.resetForTesting`. Lifecycle hook: `CometOperatorSerdeExtension.init(): Unit` (default no-op), called once per JVM by `CometExtensionRegistry.load` after discovery. Contribs override to register their handler. Failures in one contrib's `init` are caught and logged so they don't take down sibling contribs. 2. `matchOperator` dispatch now fires for `CometScanExec` markers tagged with a registered contrib's `scanImpl` (anything in `nativeParquetScanImpls`). Before this, the generic `case op if isCometScan(op)` branch matched first and routed the marker through `CometScanWrapper`, so the contrib's `serialize` (and any format-specific concerns inside it -- Delta column-mapping physical-name substitution, etc.) was never reached. The dispatch order is now: `SCAN_NATIVE_DATAFUSION` -> contrib-marker via `matchOperator` -> generic `isCometScan` catch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/comet/rules/CometExecRule.scala | 21 +++++++ .../comet/spi/CometExtensionRegistry.scala | 16 +++++ .../spi/CometOperatorSerdeExtension.scala | 13 ++++ .../apache/spark/sql/comet/CometExecRDD.scala | 60 +++++++++++++++++++ 4 files changed, 110 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index 9ffd94a635..16ddb5a028 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -271,6 +271,27 @@ case class CometExecRule(session: SparkSession) case scan: CometBatchScanExec if scan.wrapped.scan.isInstanceOf[CSVScan] => convertToComet(scan, CometCsvNativeScanExec).getOrElse(scan) + // Contrib marker dispatch: a `CometScanExec` tagged with a contrib's `scanImpl` (i.e. + // listed in `nativeParquetScanImpls`) goes through the contrib's `matchOperator`-keyed + // serde rather than the generic `CometScanWrapper` below. Without this, marker scans + // would only get JVM-side parquet bytes-reuse, never reaching the contrib's + // `serialize` and therefore missing format-specific concerns like Delta column + // mapping. Dispatch order: explicit `scanImpl == SCAN_NATIVE_DATAFUSION` (line above), + // then this contrib-marker case, then the generic `isCometScan` catch. 
+ case scan: CometScanExec + if CometExtensionRegistry.nativeParquetScanImpls.contains(scan.scanImpl) => + val handler = CometExtensionRegistry.serdeExtensions.iterator + .flatMap(_.matchOperator(scan)) + .nextOption() + .map(_.asInstanceOf[CometOperatorSerde[SparkPlan]]) + handler match { + case Some(h) => convertToComet(scan, h).getOrElse(scan) + // Fall back to the wrapper if no contrib claims the marker -- preserves the + // current behaviour when a build bundles `nativeParquetScanImpls` but no matching + // matchOperator (shouldn't happen in practice but is the safe default). + case None => convertToComet(scan, CometScanWrapper).getOrElse(scan) + } + // Comet JVM + native scan for V1 and V2 case op if isCometScan(op) => convertToComet(op, CometScanWrapper).getOrElse(op) diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index 10e76bf36a..aec3796bf6 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -80,6 +80,18 @@ object CometExtensionRegistry extends Logging { mergedSerdesCache = newMerged nativeParquetScanImplsCache = newNativeParquetTags loaded.set(true) + // Call `init()` AFTER publishing the volatile fields and flipping `loaded`. This lets + // an extension's `init` synchronously call back into the registry (e.g. to read its + // sibling extensions) without observing a half-built state, and it lets `init` register + // executor-side callbacks on `CometExecRDD` without racing the first compute call. + // Failures are isolated per extension so one broken contrib doesn't take down the others. 
+ newSerdeExts.foreach { ext => + try ext.init() + catch { + case scala.util.control.NonFatal(e) => + logWarning(s"CometOperatorSerdeExtension '${ext.name}' init failed; continuing", e) + } + } if (newScanExts.nonEmpty || newSerdeExts.nonEmpty) { logInfo( s"Comet contrib extensions loaded: " + @@ -178,6 +190,10 @@ object CometExtensionRegistry extends Logging { serdeExts = Seq.empty mergedSerdesCache = Map.empty nativeParquetScanImplsCache = Set.empty + // Also clear any executor-side callbacks registered via the SPI's `init` hook so the + // next `load()` re-registers from scratch. Without this the test that exercises + // `resetForTesting` + `load` would accumulate handlers across reset boundaries. + org.apache.spark.sql.comet.CometExecRDD.clearPartitionMetadataHandlers() } private def loadOne[T](label: String)(implicit ct: scala.reflect.ClassTag[T]): Seq[T] = { diff --git a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala index d66ef48d06..345271f9a4 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometOperatorSerdeExtension.scala @@ -89,4 +89,17 @@ trait CometOperatorSerdeExtension { * override this to `Set("native_delta_compat")`. */ def nativeParquetScanImpls: Set[String] = Set.empty + + /** + * One-shot initialization hook invoked exactly once per JVM by `CometExtensionRegistry.load` + * after this extension has been instantiated. Use to register executor-side callbacks that + * can't be expressed declaratively in the `serdes` map -- e.g. a per-partition metadata + * handler on `CometExecRDD.registerPartitionMetadataHandler` for populating Spark + * thread-locals from a contrib's serialized per-partition payload. + * + * Default no-op so existing extensions don't have to opt in. Implementations MUST be safe + * to call once per JVM (e.g. 
don't accumulate state across query executions). Failures are + * logged and isolated: a broken `init` on one contrib doesn't take down the others. + */ + def init(): Unit = () } diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala index 47eda98a11..072ff6d2bf 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometExecRDD.scala @@ -111,6 +111,16 @@ private[spark] class CometExecRDD( serializedPlan } + // Invoke registered per-partition metadata handlers. This is the SPI hook contribs + // use to populate executor thread-locals (e.g. `InputFileBlockHolder`) from their + // serialized per-partition payloads before the native iterator runs. The Delta + // contrib uses it so `input_file_name()` and Delta's `_metadata.file_path` resolve + // correctly; without this, UPDATE/DELETE/MERGE/CDC paths see empty file_path and + // throw `DELTA_FILE_TO_OVERWRITE_NOT_FOUND`. Handlers are called for every + // partition with non-empty plan data and are expected to no-op when the partition + // doesn't carry their proto. The registered handlers list is a `@volatile` read. + CometExecRDD.runPartitionMetadataHandlers(partition.planDataByKey, context) + // Create shuffle block iterators for inputs that are CometShuffledBatchRDD val shuffleBlockIters = shuffleScanIndices.flatMap { idx => inputRDDs(idx) match { @@ -163,6 +173,56 @@ private[spark] class CometExecRDD( object CometExecRDD { + /** + * SPI hook signature: a callback contribs register to inspect a partition's per-partition + * planning data BEFORE the native iterator starts producing rows on this task. Receives the + * `Map[String, Array[Byte]]` of serialized per-partition payloads keyed by `sourceKey` (the + * same shape contribs serialize into `perPartitionByKey` at planning time). 
Plus the active + * `TaskContext` so handlers can register completion listeners. + * + * Canonical use: the Delta contrib reads its `DeltaScan` payload, extracts the AddFile path, + * and calls `InputFileBlockHolder.set` so `input_file_name()` and Delta's `_metadata.file_path` + * resolve to the file being read (otherwise UPDATE/DELETE/MERGE/CDC throw + * `DELTA_FILE_TO_OVERWRITE_NOT_FOUND`). + * + * The signature is deliberately the data shape, NOT a Spark-internal partition type, so contribs + * don't have to live under `org.apache.spark.*` to see it. Handlers MUST: + * - be stateless and free of contrib-specific assumptions on partitions that don't carry + * their proto (no-op silently when their expected key/payload shape isn't present), + * - register a task-completion listener for any thread-local they set, so the value is + * cleared at the end of the task, and + * - tolerate parse failures defensively -- another contrib may own this key. + */ + type PartitionMetadataHandler = (Map[String, Array[Byte]], TaskContext) => Unit + + @volatile private var partitionMetadataHandlers: Vector[PartitionMetadataHandler] = Vector.empty + + /** + * Register a per-partition metadata handler. Called once per contrib at extension-load + * time (from `CometOperatorSerdeExtension.init`). Registration is idempotent on the + * same function reference but does not de-duplicate equivalent lambdas; contribs are + * expected to register exactly once. + */ + def registerPartitionMetadataHandler(h: PartitionMetadataHandler): Unit = synchronized { + if (!partitionMetadataHandlers.contains(h)) { + partitionMetadataHandlers = partitionMetadataHandlers :+ h + } + } + + /** + * Test-only / contrib reset. Visibility is `public` to mirror `resetForTesting` on the registry. 
+ */ + def clearPartitionMetadataHandlers(): Unit = synchronized { + partitionMetadataHandlers = Vector.empty + } + + private[comet] def runPartitionMetadataHandlers( + planDataByKey: Map[String, Array[Byte]], + context: TaskContext): Unit = { + val hs = partitionMetadataHandlers + if (hs.nonEmpty) hs.foreach(_(planDataByKey, context)) + } + /** * Creates an RDD for native execution with optional per-partition planning data. */ From 35f1b3bb37fde4aa68c633600cd4b2290fecfc14 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Fri, 15 May 2026 15:34:59 -0400 Subject: [PATCH 23/27] revert: matchOperator dispatch for contrib markers Revert the `case scan: CometScanExec if nativeParquetScanImpls.contains(...)` branch added in e4172114. Unconditionally routing the contrib marker through the contrib's `matchOperator` -> serde regressed ~525 previously-passing Delta tests: the full conversion path (`CometDeltaNativeScan.serialize` -> `CometDeltaNativeScanExec`) returns 0 rows for the streaming-source `PreparedDeltaFileIndex` shape that the existing JVM-side `CometScanWrapper` path handles correctly. The motivation for the dispatch change (CM-name post-rename returning wrong values) still stands but needs a different fix: the marker-passthrough path has to apply Delta's logical->physical name substitution itself rather than forcing every Delta scan through the heavier kernel path. Left as a TODO. The partition-metadata SPI hook (also added in e4172114) is unaffected by this revert and remains in place -- it's what addresses the broader UPDATE/DELETE/MERGE/CDC failures via `InputFileBlockHolder` population. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/comet/rules/CometExecRule.scala | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index 16ddb5a028..9ffd94a635 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -271,27 +271,6 @@ case class CometExecRule(session: SparkSession) case scan: CometBatchScanExec if scan.wrapped.scan.isInstanceOf[CSVScan] => convertToComet(scan, CometCsvNativeScanExec).getOrElse(scan) - // Contrib marker dispatch: a `CometScanExec` tagged with a contrib's `scanImpl` (i.e. - // listed in `nativeParquetScanImpls`) goes through the contrib's `matchOperator`-keyed - // serde rather than the generic `CometScanWrapper` below. Without this, marker scans - // would only get JVM-side parquet bytes-reuse, never reaching the contrib's - // `serialize` and therefore missing format-specific concerns like Delta column - // mapping. Dispatch order: explicit `scanImpl == SCAN_NATIVE_DATAFUSION` (line above), - // then this contrib-marker case, then the generic `isCometScan` catch. - case scan: CometScanExec - if CometExtensionRegistry.nativeParquetScanImpls.contains(scan.scanImpl) => - val handler = CometExtensionRegistry.serdeExtensions.iterator - .flatMap(_.matchOperator(scan)) - .nextOption() - .map(_.asInstanceOf[CometOperatorSerde[SparkPlan]]) - handler match { - case Some(h) => convertToComet(scan, h).getOrElse(scan) - // Fall back to the wrapper if no contrib claims the marker -- preserves the - // current behaviour when a build bundles `nativeParquetScanImpls` but no matching - // matchOperator (shouldn't happen in practice but is the safe default). 
- case None => convertToComet(scan, CometScanWrapper).getOrElse(scan) - } - // Comet JVM + native scan for V1 and V2 case op if isCometScan(op) => convertToComet(op, CometScanWrapper).getOrElse(op) From 4ee71022f98a77abec314e004c4d9499425c9e2e Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Fri, 15 May 2026 16:29:05 -0400 Subject: [PATCH 24/27] feat(contrib): restore matchOperator dispatch for contrib markers Re-instate the marker-dispatch case reverted in 35f1b3bb, with a tighter charter: When a `CometScanExec` is tagged with a contrib's `scanImpl` (any string listed in the contrib's `nativeParquetScanImpls`), route it through that contrib's `matchOperator` serde rather than the generic `CometScanWrapper`. The contrib chooses per-scan whether to claim it (returning `Some`) or defer to the wrapper (returning `None`). If `matchOperator` returns `None`, OR if the chosen serde's `convertToComet` returns `None`, the marker falls back to the generic wrapper -- so the contrib retains full control over when the heavier native conversion fires. The previous revert was driven by a downstream bug: when this dispatch unconditionally claimed every marker, the Delta contrib's `CometDeltaNativeScan.serialize` returned 0 rows for simple streaming-source reads, regressing ~525 tests. With the new design that bug is gated behind the contrib's `matchOperator` -- contribs that aren't ready for the full conversion just return `None` until their `serialize` handles every case. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/comet/rules/CometExecRule.scala | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index 9ffd94a635..1a4a916f8f 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -271,6 +271,27 @@ case class CometExecRule(session: SparkSession) case scan: CometBatchScanExec if scan.wrapped.scan.isInstanceOf[CSVScan] => convertToComet(scan, CometCsvNativeScanExec).getOrElse(scan) + // Contrib marker dispatch: a `CometScanExec` tagged with a contrib's `scanImpl` + // (i.e. listed in `nativeParquetScanImpls`) is routed through that contrib's + // `matchOperator` serde rather than the generic `CometScanWrapper` below. Without + // this, the marker would only get JVM-side parquet bytes-reuse, never reaching the + // contrib's `serialize` and therefore missing format-specific concerns -- e.g. + // Delta column-mapping physical-name substitution and `InputFileBlockHolder` + // population for `input_file_name()` / `_metadata.file_path`. If the contrib's + // `matchOperator` chooses to return None (e.g. the contrib only wants the full + // native conversion for certain scans), the marker falls back to the generic + // wrapper path; if the chosen serde's conversion returns None, the scan is left as-is.
+ case scan: CometScanExec + if CometExtensionRegistry.nativeParquetScanImpls.contains(scan.scanImpl) => + val handler = CometExtensionRegistry.serdeExtensions.iterator + .flatMap(_.matchOperator(scan)) + .nextOption() + .map(_.asInstanceOf[CometOperatorSerde[SparkPlan]]) + handler match { + case Some(h) => convertToComet(scan, h).getOrElse(scan) + case None => convertToComet(scan, CometScanWrapper).getOrElse(scan) + } + // Comet JVM + native scan for V1 and V2 case op if isCometScan(op) => convertToComet(op, CometScanWrapper).getOrElse(op) From 04b48c278c6e30f304fa7e6b9279f231755f8a5a Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Fri, 15 May 2026 21:24:24 -0400 Subject: [PATCH 25/27] feat(contrib): make PlanDataInjector contrib-registrable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `PlanDataInjector` previously had a hardcoded list of built-in injectors with a `// Future: DeltaPlanDataInjector, HudiPlanDataInjector, etc.` comment. That meant a Delta-contrib (PR2) could not actually plug its per-partition proto-injection logic into the execution path: tasks serialized via `perPartitionByKey` never got merged back into the operator tree at `CometExecRDD.compute` time, so the native side decoded `DeltaScan` with an empty `tasks` list and returned `EmptyExec` (0 rows) for any non-empty Delta scan that took the native conversion path. Promote the injector list to a built-in seq + a registerable contrib seq, and expose `registerInjector` / `clearContribInjectors` on the singleton. Same pattern as the `CometExecRDD.PartitionMetadataHandler` SPI added in e4172114: contribs register their injector from `CometOperatorSerdeExtension.init`, `CometExtensionRegistry.resetForTesting` clears the registry alongside other contrib state for test isolation. 
Visibility of the `PlanDataInjector` object had to be widened from `private[comet]` to package-default (public) so the registry-reset call site in `org.apache.comet.spi.CometExtensionRegistry` can reach the `clearContribInjectors` method. The trait stays `private[comet]` (and so does the rest of the implementation) — contribs in `org.apache.comet.contrib.*` can still see it via the subpackage rule. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../comet/spi/CometExtensionRegistry.scala | 1 + .../apache/spark/sql/comet/operators.scala | 33 +++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala index aec3796bf6..e8ab6fc2f6 100644 --- a/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala +++ b/spark/src/main/scala/org/apache/comet/spi/CometExtensionRegistry.scala @@ -194,6 +194,7 @@ object CometExtensionRegistry extends Logging { // next `load()` re-registers from scratch. Without this the test that exercises // `resetForTesting` + `load` would accumulate handlers across reset boundaries. org.apache.spark.sql.comet.CometExecRDD.clearPartitionMetadataHandlers() + org.apache.spark.sql.comet.PlanDataInjector.clearContribInjectors() } private def loadOne[T](label: String)(implicit ct: scala.reflect.ClassTag[T]): Seq[T] = { diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index f315aae6e2..b9024922bb 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -79,14 +79,35 @@ private[comet] trait PlanDataInjector { /** * Registry and utilities for injecting per-partition planning data into operator trees. 
*/ -private[comet] object PlanDataInjector { +object PlanDataInjector { - // Registry of injectors for different operator types - private val injectors: Seq[PlanDataInjector] = Seq( + // Built-in injectors for core operator types. Contribs add to this via + // `registerInjector` from their `CometOperatorSerdeExtension.init` -- the + // generic SPI route -- so core stays format-agnostic. + private val builtinInjectors: Seq[PlanDataInjector] = Seq( IcebergPlanDataInjector, - NativeScanPlanDataInjector - // Future: DeltaPlanDataInjector, HudiPlanDataInjector, etc. - ) + NativeScanPlanDataInjector) + + @volatile private var contribInjectors: Vector[PlanDataInjector] = Vector.empty + + /** + * SPI: register a contrib-side `PlanDataInjector`. Called once per contrib at + * extension-load time (from `CometOperatorSerdeExtension.init`). Registration is + * idempotent on the same instance but not de-duplicated across structurally-equal + * implementations -- contribs are expected to register exactly once. + */ + def registerInjector(injector: PlanDataInjector): Unit = synchronized { + if (!contribInjectors.contains(injector)) { + contribInjectors = contribInjectors :+ injector + } + } + + /** Test-only reset, mirroring `CometExtensionRegistry.resetForTesting`. */ + def clearContribInjectors(): Unit = synchronized { + contribInjectors = Vector.empty + } + + private def injectors: Seq[PlanDataInjector] = builtinInjectors ++ contribInjectors /** * Injects planning data into an Operator tree by finding nodes that need injection and applying From f08ac86a217def0557595e82b8d0f2c404935c53 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Fri, 15 May 2026 21:48:59 -0400 Subject: [PATCH 26/27] fix(contrib): treat any leaf CometNativeExec as a foreachUntilCometInput boundary `foreachUntilCometInput` enumerates known Comet input-class types (`CometNativeScanExec`, `CometScanExec`, etc.) and recurses past everything else. When a contrib-defined leaf native exec (e.g. 
the Delta contrib's `CometDeltaNativeScanExec`) appeared in the plan, it matched the generic `case _: CometPlan` arm and recursed into its empty children list without ever invoking `func`. The caller then saw an empty `sparkPlans` buffer and crashed on `firstNonBroadcastPlan.get` -- `None.get` at operators.scala:534. Add a case before the `_: CometPlan` recurse arm: any `CometNativeExec` with zero children is a Comet input. The explicit list above still wins for the known types (preserves existing behaviour exactly), and contribs' leaf scans now participate without needing core-side class enumeration. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../main/scala/org/apache/spark/sql/comet/operators.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index b9024922bb..f4f49a5d51 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -638,6 +638,13 @@ abstract class CometNativeExec extends CometExec { _: CometBroadcastExchangeExec | _: BroadcastQueryStageExec | _: CometSparkToColumnarExec | _: CometLocalTableScanExec => func(plan) + // Any other leaf `CometNativeExec` (e.g. a contrib-defined leaf scan such as the Delta + // contrib's `CometDeltaNativeScanExec`) is a Comet input boundary -- recursing into its + // (non-existent) children would otherwise leave it invisible to the caller, which then + // misinterprets a leaf-only plan as having no inputs at all and crashes on + // `firstNonBroadcastPlan.get`. Treat it the same as the explicit list above. + case p: CometNativeExec if p.children.isEmpty => + func(plan) case _: CometPlan => // Other Comet operators, continue to traverse the tree. 
plan.children.foreach(foreachUntilCometInput(_)(func)) From 272ada160e093239c409c4767900edfd2c27f37c Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Sat, 16 May 2026 09:00:48 -0400 Subject: [PATCH 27/27] feat(contrib): PlanDataSource trait for contrib leaf scans `findAllPlanData` previously enumerated specific core scan classes (`CometNativeScanExec`, `CometIcebergNativeScanExec`) and fell through the catch-all for everything else. A contrib leaf scan (e.g. the Delta contrib's `CometDeltaNativeScanExec`) would land in the catch-all, recurse into its empty children list, and contribute nothing to the `(commonByKey, perPartitionByKey)` maps that `CometExecRDD.compute` hands to `PlanDataInjector.injectPlanData`. Result: the contrib's injector was registered but never called, the proto's per-partition `tasks` stayed empty, and the native side returned `EmptyExec` -> 0 rows for every contrib scan that lived under a non-leaf Comet operator. Introduce a `PlanDataSource` trait (sourceKey, commonBytes, perPartitionBytes, optional subquery-resolution hook). `findAllPlanData` matches the trait first so contribs can extend it without core knowing the concrete class. Core's existing scans (`CometNativeScanExec`, `CometIcebergNativeScanExec`) keep their explicit arms for now; mixing them into the trait is a cleanup that can land separately without changing behaviour. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../apache/spark/sql/comet/operators.scala | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index f4f49a5d51..bf0b414e86 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -59,6 +59,34 @@ import org.apache.comet.serde.OperatorOuterClass.{AggregateMode => CometAggregat import org.apache.comet.serde.QueryPlanSerde.{aggExprToProto, exprToProto, isStringCollationType, supportedSortType} import org.apache.comet.serde.operator.CometSink +/** + * Generic source of per-partition planning data for a Comet native exec. Implementations expose + * the trio of inputs that `CometExecRDD.compute` needs to feed `PlanDataInjector.injectPlanData`: + * + * - `planDataSourceKey`: stable identifier the matching `PlanDataInjector.getKey` reproduces + * by hashing the operator's common payload. Must agree between driver-side (RDD construction) + * and executor-side (injector lookup) views of the SAME operator's proto. + * - `planDataCommonBytes`: serialized common block (schemas, table root, filters, ...) the + * contrib's `serialize` produced once per scan. + * - `planDataPerPartitionBytes`: array of serialized per-partition payloads, one entry per + * partition, carrying that partition's task list / file list / ranges. + * + * `findAllPlanData` checks for this trait BEFORE the hardcoded `CometNativeScanExec` / + * `CometIcebergNativeScanExec` arms so contribs can plug in without core-side enumeration. + * Core's own scans do not implement the trait yet -- they keep those explicit arms, and moving + * them onto the trait is a follow-up cleanup with no behavioural change.
+ * + * Implementations whose driver-side `commonData` / `perPartitionData` require Spark's standard + * `prepare -> waitForSubqueries` lifecycle (typically because DPP `InSubqueryExec` values land + * in the per-partition payload) override `ensureSubqueriesResolvedIfApplicable` to trigger it. + */ +trait PlanDataSource { self: SparkPlan => + def planDataSourceKey: String + def planDataCommonBytes: Array[Byte] + def planDataPerPartitionBytes: Array[Array[Byte]] + def ensureSubqueriesResolvedIfApplicable(): Unit = () +} + /** * Trait for injecting per-partition planning data into operator nodes. * @@ -692,6 +720,16 @@ abstract class CometNativeExec extends CometExec { private def findAllPlanData( plan: SparkPlan): (Map[String, Array[Byte]], Map[String, Array[Array[Byte]]]) = { plan match { + // Contribs (e.g. the Delta contrib's `CometDeltaNativeScanExec`) implement + // `PlanDataSource` to expose their per-partition payload and matching + // `planDataSourceKey`. Checked before the explicit core cases below so subclasses can + // override the trait without colliding with the hardcoded matches. + case src: PlanDataSource => + src.ensureSubqueriesResolvedIfApplicable() + ( + Map(src.planDataSourceKey -> src.planDataCommonBytes), + Map(src.planDataSourceKey -> src.planDataPerPartitionBytes)) + case iceberg: CometIcebergNativeScanExec => // Trigger Spark's standard prepare -> waitForSubqueries lifecycle so DPP // InSubqueryExec values are resolved before commonData is read. Without this,