From 28fb854f0d810dc9413f0b51c51bbd6a13b513bb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 23 May 2026 09:11:07 -0600 Subject: [PATCH 01/20] fix(codegen): use isNullAt on CometPlainVector-wrapped columns in null short-circuit CometBatchKernelCodegen.defaultBody emitted this.col$ord.isNull(i) for every NullIntolerant input, but primitive Arrow vectors (timestamp / int / float / date / bool / ...) are wrapped in CometPlainVector at input-cast time and expose isNullAt rather than the raw Arrow isNull. The short-circuit therefore failed to compile for any primitive-typed column with a Janino "method isNull not declared" error. Share the existing nullCheckMethod helper between emitTypedGetters and defaultBody so both sites pick the right method name per column. Add a source test that pins the chosen method for TimeStampMicroTZVector inputs. --- .../codegen/CometBatchKernelCodegen.scala | 13 +++++++++-- .../CometBatchKernelCodegenInput.scala | 4 +++- .../comet/CometCodegenSourceSuite.scala | 23 ++++++++++++++++++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala index 2795911da3..042fd9ced3 100644 --- a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala +++ b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala @@ -261,7 +261,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim { val subExprsCode = ctx.subexprFunctionsCode val (cls, setup, snippet) = CometBatchKernelCodegenOutput.emitOutputWriter(boundExpr.dataType, ev.value, ctx) - (cls, setup, defaultBody(boundExpr, ev, snippet, subExprsCode)) + (cls, setup, defaultBody(boundExpr, inputSchema, ev, snippet, subExprsCode)) } val typedFieldDecls = CometBatchKernelCodegenInput.emitInputFieldDecls(inputSchema) @@ -343,6 +343,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim { */ private def defaultBody( boundExpr: Expression, + inputSchema: Seq[ArrowColumnSpec], ev: ExprCode, writeSnippet: String, subExprsCode: String): String = { @@ -353,9 +354,17 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim { // make this incorrect (`coalesce(null, x)` is `x`); `allNullIntolerant` rejects those. val inputOrdinals = boundExpr.collect { case b: BoundReference => b.ordinal }.distinct + // Primitive Arrow vectors are wrapped in `CometPlainVector` at input-cast time, which + // exposes `isNullAt(int)` rather than the raw Arrow `isNull(int)`. Pick the right method + // per ordinal so the short-circuit compiles for timestamp / int / float columns too, + // not just VarChar / Decimal vectors that stay as raw Arrow types. + def nullCheckCall(ord: Int): String = { + val method = CometBatchKernelCodegenInput.nullCheckMethod(inputSchema(ord)) + s"this.col$ord.$method(i)" + } val nullCheck = if (inputOrdinals.isEmpty) "false" - else inputOrdinals.map(ord => s"this.col$ord.isNull(i)").mkString(" || ") + else inputOrdinals.map(nullCheckCall).mkString(" || ") s""" |if ($nullCheck) { | output.setNull(i); diff --git a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala index 79a2af6837..74e4881de0 100644 --- a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala +++ b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala @@ -404,8 +404,10 @@ private[codegen] object CometBatchKernelCodegenInput { /** * Java method name for the per-column null check. Primitive scalars wrapped in * [[CometPlainVector]] expose `isNullAt`; Arrow typed fields expose `isNull`. Same semantics. + * Used both by `emitTypedGetters` (for the kernel's `isNullAt` switch) and by + * `CometBatchKernelCodegen.defaultBody` (for the `NullIntolerant` short-circuit). */ - private def nullCheckMethod(spec: ArrowColumnSpec): String = spec match { + def nullCheckMethod(spec: ArrowColumnSpec): String = spec match { case sc: ScalarColumnSpec if wrapsInCometPlainVector(sc.vectorClass) => "isNullAt" case _ => "isNull" } diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala index 27a5830c6d..274b70bce1 100644 --- a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala @@ -22,9 +22,10 @@ package org.apache.comet import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Add, BoundReference, Coalesce, Concat, CreateArray, CreateMap, ElementAt, Expression, GetStructField, LeafExpression, Length, Literal, Nondeterministic, Rand, Size, Unevaluable, Upper} +import org.apache.spark.sql.catalyst.expressions.{Add, BoundReference, Coalesce, Concat, CreateArray, CreateMap, DateFormatClass, ElementAt, Expression, GetStructField, LeafExpression, Length, Literal, Nondeterministic, Rand, Size, Unevaluable, Upper} import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String import org.apache.comet.codegen.CometBatchKernelCodegen import org.apache.comet.codegen.CometBatchKernelCodegen.{ArrayColumnSpec, ArrowColumnSpec, MapColumnSpec, ScalarColumnSpec, StructColumnSpec, StructFieldSpec} @@ -61,6 +62,26 @@ class CometCodegenSourceSuite extends AnyFunSuite { specs: ArrowColumnSpec*): String = CometBatchKernelCodegen.generateSource(expr, specs.toIndexedSeq).body + test("NullIntolerant short-circuit uses isNullAt for CometPlainVector-wrapped columns") { + // Primitive Arrow vectors (timestamp / int / float / ...) are wrapped in `CometPlainVector` + // at input-cast time. The short-circuit must call `isNullAt(i)`, not `isNull(i)`, otherwise + // Janino fails to compile the kernel with "method isNull not declared". Verified end-to-end + // by `CometTemporalExpressionSuite` date_format tests over `TimeStampMicroTZVector` inputs. + val tsVec = CometBatchKernelCodegen.vectorClassBySimpleName("TimeStampMicroTZVector") + val spec = ArrowColumnSpec(tsVec, nullable = true) + val expr = DateFormatClass( + BoundReference(0, TimestampType, nullable = true), + Literal(UTF8String.fromString("yyyy-MM-dd EEEE"), StringType), + Some("UTC")) + val src = CometBatchKernelCodegen.generateSource(expr, IndexedSeq(spec)).body + assert( + src.contains("if (this.col0.isNullAt(i))"), + s"expected short-circuit to use isNullAt for CometPlainVector-wrapped col0; got:\n$src") + assert( + !src.contains("if (this.col0.isNull(i))"), + s"expected no raw Arrow isNull on the CometPlainVector-wrapped col0; got:\n$src") + } + test("non-nullable column emits literal-false isNullAt case") { val expr = Length(BoundReference(0, StringType, nullable = false)) val src = gen(expr, nonNullableString) From d344ec0e6cc3e3feee01e8c90a74f5a6d42a4e46 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 23 May 2026 09:11:31 -0600 Subject: [PATCH 02/20] feat: route date_format JVM fallback through codegen dispatcher CometDateFormat keeps the native to_char path for UTC sessions with a format literal in the strftime-mappable whitelist, and now routes every other case through the Arrow-direct codegen dispatcher (CometScalaUDFCodegen) so that non-UTC sessions, non-literal formats, and formats outside the whitelist stay inside the Comet pipeline running Spark's own DateFormatClass.doGenCode. Refactor: extract the closure-serialize + JvmScalarUdf-proto emission from CometScalaUDF.convert into a reusable CometScalaUDF.emitJvmCodegenDispatch helper. Any serde that wants to fall back to a Spark built-in expression through the dispatcher can call it. Gated by COMET_SCALA_UDF_CODEGEN_ENABLED so the default remains a clean Spark fallback for those cases until the dispatcher graduates from experimental. Reasoning notes: - DateFormatClass already has a proper doGenCode (not CodegenFallback), NullIntolerant, and ResolveTimeZone stamps the timeZoneId on it during analysis. Closure-serializing the bound tree therefore reproduces Spark-identical behavior for every timezone. - The kernel cache key already encodes the literal format and timezone via the serialized expression bytes, so (format, tz) combinations get distinct cached kernels just like a bespoke (format, tz) -> formatter cache would. Saves an entire DateFormatUDF.scala class. Tests: - date_format - timestamp_ntz input: now runs checkSparkAnswerAndOperator for every timezone under the codegen flag instead of falling back for non-UTC. - Split each previous "falls back to Spark" Scala test into two: one asserting the codegen-on path stays in Comet, one asserting the codegen-off path falls back with the dispatcher flag as the reason. - date_format.sql now pins a non-UTC session timezone and enables the codegen flag at file scope; all queries are plain query and assert in-Comet execution. --- .../apache/comet/serde/CometScalaUDF.scala | 28 ++++- .../org/apache/comet/serde/datetime.scala | 103 ++++++++---------- .../expressions/datetime/date_format.sql | 14 ++- .../comet/CometTemporalExpressionSuite.scala | 76 ++++++++----- 4 files changed, 132 insertions(+), 89 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala index bf636f7221..010e3dd402 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala @@ -20,7 +20,7 @@ package org.apache.comet.serde import org.apache.spark.SparkEnv -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSeq, BindReferences, Literal, ScalaUDF} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSeq, BindReferences, Expression, Literal, ScalaUDF} import org.apache.spark.sql.types.BinaryType import org.apache.comet.CometConf @@ -45,15 +45,35 @@ import org.apache.comet.udf.codegen.CometScalaUDFCodegen * * Gated by [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]]. When disabled, plans containing a * `ScalaUDF` fall back to Spark for the enclosing operator. + * + * [[emitJvmCodegenDispatch]] exposes the same closure-serialize + dispatcher-proto path to other + * serdes that want to keep a built-in Spark expression inside the Comet pipeline when no native + * lowering is viable. See [[CometDateFormat]] for an example. */ object CometScalaUDF extends CometExpressionSerde[ScalaUDF] { - override def convert(expr: ScalaUDF, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { + override def convert(expr: ScalaUDF, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = + emitJvmCodegenDispatch(expr, inputs, binding) + + /** + * Bind `expr`, closure-serialize it, and emit a `JvmScalarUdf` proto routed through + * [[CometScalaUDFCodegen]] so that native execution evaluates the expression inside the + * Arrow-direct codegen dispatcher. The dispatcher will Janino-compile `expr.doGenCode` into a + * batch kernel on first invocation per task. + * + * Returns `None` (with `withInfo` tagging the reason) when the dispatcher is disabled via + * [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]] or when [[CometBatchKernelCodegen.canHandle]] + * refuses the expression tree. Callers should treat `None` as a clean Spark-fallback signal. + */ + def emitJvmCodegenDispatch( + expr: Expression, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = { if (!CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.get()) { withInfo( expr, - s"${CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key}=false; ScalaUDF has no native path " + - "so the plan falls back to Spark") + s"${CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key}=false; expression has no native " + + "path so the plan falls back to Spark") return None } diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index b57b1e4e56..24ca862fb9 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String +import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.CometGetDateField.CometGetDateField @@ -593,17 +594,23 @@ object CometTruncTimestamp extends CometExpressionSerde[TruncTimestamp] { } /** - * Converts Spark DateFormatClass expression to DataFusion's to_char function. + * Converts Spark `DateFormatClass` to DataFusion's `to_char` when format and timezone are + * mappable, otherwise routes the expression through the Arrow-direct codegen dispatcher so that + * Spark's own `DateFormatClass.doGenCode` runs inside the Comet pipeline. * - * Spark uses Java SimpleDateFormat patterns while DataFusion uses strftime patterns. This - * implementation supports a whitelist of common format strings that can be reliably mapped - * between the two systems. + * Routing: + * - format is a literal in `supportedFormats` AND timezone is UTC -> native `to_char` + * - format is a literal in `supportedFormats` AND timezone is non-UTC, with the per-expression + * `allowIncompatible` flag set -> native `to_char` (results may differ from Spark) + * - all other cases -> JVM codegen dispatcher ([[CometScalaUDF.emitJvmCodegenDispatch]]), gated + * by [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]]. When that flag is disabled the operator + * falls back to Spark. */ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { /** * Mapping from Spark SimpleDateFormat patterns to strftime patterns. Only formats in this map - * are supported. + * are supported by the native path. */ val supportedFormats: Map[String, String] = Map( // Full date formats @@ -637,66 +644,50 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { // ISO formats "yyyy-MM-dd'T'HH:mm:ss" -> "%Y-%m-%dT%H:%M:%S") - override def getIncompatibleReasons(): Seq[String] = Seq( - "Non-UTC timezones may produce different results than Spark") + // Compatibility is decided inside `convert`: the native path covers a subset, and the codegen + // dispatcher covers everything else when enabled. Plan-time tagging happens via `withInfo` on + // the path that returns None. + override def getSupportLevel(expr: DateFormatClass): SupportLevel = Compatible() - override def getUnsupportedReasons(): Seq[String] = Seq( - "Only the following formats are supported:" + - supportedFormats.keys.toSeq.sorted - .map(k => s"`$k`") - .mkString("\n - ", "\n - ", "")) - - override def getSupportLevel(expr: DateFormatClass): SupportLevel = { - // Check timezone - only UTC is fully compatible - val timezone = expr.timeZoneId.getOrElse("UTC") - val isUtc = timezone == "UTC" || timezone == "Etc/UTC" - - expr.right match { - case Literal(fmt: UTF8String, _) => - val format = fmt.toString - if (supportedFormats.contains(format)) { - if (isUtc) { - Compatible() - } else { - Incompatible(Some(s"Non-UTC timezone '$timezone' may produce different results")) - } - } else { - Unsupported( - Some( - s"Format '$format' is not supported. Supported formats: " + - supportedFormats.keys.mkString(", "))) - } - case _ => - Unsupported(Some("Only literal format strings are supported")) - } - } + override def getCompatibleNotes(): Seq[String] = Seq( + "Format strings in a curated allow-list run natively via DataFusion's `to_char` for UTC " + + "sessions. Other format strings (including non-literal formats), as well as non-UTC " + + "sessions, route through Spark's own `DateFormatClass.doGenCode` via the Arrow-direct " + + "codegen dispatcher when `spark.comet.exec.scalaUDF.codegen.enabled=true`. When the " + + "codegen dispatcher is disabled (default) the operator falls back to Spark in those " + + "cases.") override def convert( expr: DateFormatClass, inputs: Seq[Attribute], binding: Boolean): Option[ExprOuterClass.Expr] = { - // Get the format string - must be a literal for us to map it - val strftimeFormat = expr.right match { - case Literal(fmt: UTF8String, _) => - supportedFormats.get(fmt.toString) + val timezone = expr.timeZoneId.getOrElse("UTC") + val isUtc = timezone == "UTC" || timezone == "Etc/UTC" + + val nativeFormat: Option[String] = expr.right match { + case Literal(fmt: UTF8String, _) => supportedFormats.get(fmt.toString) case _ => None } - strftimeFormat match { - case Some(format) => - val childExpr = exprToProtoInternal(expr.left, inputs, binding) - val formatExpr = exprToProtoInternal(Literal(format), inputs, binding) - - val optExpr = scalarFunctionExprToProtoWithReturnType( - "to_char", - StringType, - false, - childExpr, - formatExpr) - optExprWithInfo(optExpr, expr, expr.left, expr.right) - case None => - withInfo(expr, expr.left, expr.right) - None + val canUseNative = nativeFormat.isDefined && { + isUtc || CometConf.isExprAllowIncompat(getExprConfigName(expr)) + } + + if (canUseNative) { + val childExpr = exprToProtoInternal(expr.left, inputs, binding) + val formatExpr = exprToProtoInternal(Literal(nativeFormat.get), inputs, binding) + val optExpr = scalarFunctionExprToProtoWithReturnType( + "to_char", + StringType, + false, + childExpr, + formatExpr) + optExprWithInfo(optExpr, expr, expr.left, expr.right) + } else { + // Hand the full `DateFormatClass` (with `timeZoneId` already stamped by `ResolveTimeZone`) + // to the codegen dispatcher. It closure-serializes the bound tree, so non-UTC timezones + // and non-whitelisted / non-literal format strings produce Spark-identical results. + CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding) } } } diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_format.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_format.sql index 09333f44d3..dec690cb6a 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/date_format.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_format.sql @@ -15,21 +15,27 @@ -- specific language governing permissions and limitations -- under the License. +-- Pin the session timezone so the test exercises the non-UTC path regardless of the JVM +-- default. Enable the codegen dispatcher so non-UTC and non-whitelisted formats stay inside +-- Comet via Spark's own DateFormatClass.doGenCode instead of falling back to Spark. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + statement CREATE TABLE test_date_format(ts timestamp) USING parquet statement INSERT INTO test_date_format VALUES (timestamp('2024-06-15 10:30:45')), (timestamp('1970-01-01 00:00:00')), (NULL) -query expect_fallback(Non-UTC timezone) +query SELECT date_format(ts, 'yyyy-MM-dd') FROM test_date_format -query expect_fallback(Non-UTC timezone) +query SELECT date_format(ts, 'HH:mm:ss') FROM test_date_format -query expect_fallback(Non-UTC timezone) +query SELECT date_format(ts, 'yyyy-MM-dd HH:mm:ss') FROM test_date_format -- literal arguments -query expect_fallback(Non-UTC timezone) +query SELECT date_format(timestamp('2024-06-15 10:30:45'), 'yyyy-MM-dd') diff --git a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala index a8147089d9..20ad90a91c 100644 --- a/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometTemporalExpressionSuite.scala @@ -214,26 +214,21 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH } test("date_format - timestamp_ntz input") { - // TimestampNTZ is timezone-independent, so date_format should produce the same - // formatted string regardless of session timezone. Comet currently only runs this - // natively for UTC; for non-UTC it falls back to Spark. We verify correctness - // (matching Spark's output) in all cases. + // TimestampNTZ is timezone-independent, so date_format must produce the same string + // regardless of session timezone. With the codegen dispatcher enabled, non-UTC sessions + // stay in Comet by running Spark's own `DateFormatClass.doGenCode` via the dispatcher. val r = new Random(42) val ntzSchema = StructType(Seq(StructField("ts_ntz", DataTypes.TimestampNTZType, true))) val ntzDF = FuzzDataGenerator.generateDataFrame(r, spark, ntzSchema, 100, DataGenOptions()) ntzDF.createOrReplaceTempView("ntz_tbl") val supportedFormats = CometDateFormat.supportedFormats.keys.toSeq.filterNot(_.contains("'")) - for (tz <- crossTimezones) { - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) { - for (format <- supportedFormats) { - if (tz == "UTC") { + withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "true") { + for (tz <- crossTimezones) { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) { + for (format <- supportedFormats) { checkSparkAnswerAndOperator( s"SELECT ts_ntz, date_format(ts_ntz, '$format') from ntz_tbl order by ts_ntz") - } else { - // Non-UTC falls back to Spark but should still produce correct results - checkSparkAnswer( - s"SELECT ts_ntz, date_format(ts_ntz, '$format') from ntz_tbl order by ts_ntz") } } } @@ -476,34 +471,62 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH } } - test("date_format unsupported format falls back to Spark") { + test("date_format unsupported format routes via codegen dispatcher") { createTimestampTestData.createOrReplaceTempView("tbl") - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { - // Unsupported format string + withSQLConf( + SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC", + CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "true") { + checkSparkAnswerAndOperator( + "SELECT c0, date_format(c0, 'yyyy-MM-dd EEEE') from tbl order by c0") + } + } + + test("date_format unsupported format falls back when codegen dispatcher disabled") { + createTimestampTestData.createOrReplaceTempView("tbl") + + withSQLConf( + SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC", + CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") { checkSparkAnswerAndFallbackReason( "SELECT c0, date_format(c0, 'yyyy-MM-dd EEEE') from tbl order by c0", - "Format 'yyyy-MM-dd EEEE' is not supported") + CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key) } } - test("date_format with non-UTC timezone falls back to Spark") { + test("date_format with non-UTC timezone routes via codegen dispatcher") { createTimestampTestData.createOrReplaceTempView("tbl") val nonUtcTimezones = Seq("America/New_York", "America/Los_Angeles", "Europe/London", "Asia/Tokyo") for (tz <- nonUtcTimezones) { - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz) { - // Non-UTC timezones should fall back to Spark as Incompatible + withSQLConf( + SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz, + CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "true") { + checkSparkAnswerAndOperator( + "SELECT c0, date_format(c0, 'yyyy-MM-dd HH:mm:ss') from tbl order by c0") + } + } + } + + test("date_format with non-UTC timezone falls back when codegen dispatcher disabled") { + createTimestampTestData.createOrReplaceTempView("tbl") + + val nonUtcTimezones = Seq("America/New_York", "Europe/London") + + for (tz <- nonUtcTimezones) { + withSQLConf( + SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz, + CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") { checkSparkAnswerAndFallbackReason( "SELECT c0, date_format(c0, 'yyyy-MM-dd HH:mm:ss') from tbl order by c0", - s"Non-UTC timezone '$tz' may produce different results") + CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key) } } } - test("date_format with non-UTC timezone works when allowIncompatible is enabled") { + test("date_format with non-UTC timezone takes native path when allowIncompatible is enabled") { createTimestampTestData.createOrReplaceTempView("tbl") val nonUtcTimezones = Seq("America/New_York", "Europe/London", "Asia/Tokyo") @@ -511,10 +534,13 @@ class CometTemporalExpressionSuite extends CometTestBase with AdaptiveSparkPlanH for (tz <- nonUtcTimezones) { withSQLConf( SQLConf.SESSION_LOCAL_TIMEZONE.key -> tz, - "spark.comet.expr.DateFormatClass.allowIncompatible" -> "true") { - // With allowIncompatible enabled, Comet will execute the expression - // Results may differ from Spark but should not throw errors - checkSparkAnswer("SELECT c0, date_format(c0, 'yyyy-MM-dd') from tbl order by c0") + "spark.comet.expression.DateFormatClass.allowIncompatible" -> "true") { + // Native to_char results may diverge from Spark for non-UTC timezones (the reason the + // JVM UDF is the default), so we only check that execution stays inside Comet. ORDER BY + // is omitted to keep the plan free of AQEShuffleRead. + val df = sql("SELECT c0, date_format(c0, 'yyyy-MM-dd') from tbl") + df.collect() + checkCometOperators(stripAQEPlan(df.queryExecution.executedPlan)) } } } From 7cbb8e4b4dc8364afd7e5a1bb1e3c729c2206192 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 07:52:09 -0600 Subject: [PATCH 03/20] test: update ArrayInsertUnsupportedArgs fallback reason wording The CometScalaUDF fallback message was generalized from 'ScalaUDF has no native path' to 'expression has no native path' when the dispatcher helper was extracted for reuse by CometDateFormat. --- .../test/scala/org/apache/comet/CometArrayExpressionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala index 7591279786..797020aed0 100644 --- a/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala @@ -247,7 +247,7 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp .withColumn("arrUnsupportedArgs", expr("array_insert(arr, idx, 1)")) checkSparkAnswerAndFallbackReasons( df.select("arrUnsupportedArgs"), - Set("ScalaUDF has no native path", "unsupported arguments for ArrayInsert")) + Set("expression has no native path", "unsupported arguments for ArrayInsert")) } } } From 7f781e1f7e7adf370ceb005f70d3b32611494cd2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 08:17:54 -0600 Subject: [PATCH 04/20] feat(serde): add CometCodegenDispatch helper for codegen-routed expressions --- .../apache/comet/serde/CometScalaUDF.scala | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala index 010e3dd402..0e1ca86fa2 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala @@ -120,3 +120,23 @@ object CometScalaUDF extends CometExpressionSerde[ScalaUDF] { .build()) } } + +/** + * Convenience base for serdes that route a non-ScalaUDF Spark expression through the codegen + * dispatcher. Delegates `convert` to [[CometScalaUDF.emitJvmCodegenDispatch]] and marks the + * expression `Compatible()` because the dispatcher runs Spark's own `doGenCode` inside the + * kernel: behavior matches Spark exactly when + * [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]] is enabled, and the operator falls back to + * Spark cleanly when it is not. + */ +class CometCodegenDispatch[T <: Expression] extends CometExpressionSerde[T] { + override def getSupportLevel(expr: T): SupportLevel = Compatible() + override def getCompatibleNotes(): Seq[String] = Seq( + "Runs via the Arrow-direct codegen dispatcher when " + + "spark.comet.exec.scalaUDF.codegen.enabled=true. Default behavior falls back to Spark.") + override def convert( + expr: T, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = + CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding) +} From 34de47c241d69182cca21f1f6300f7f36d2e7d35 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 08:30:50 -0600 Subject: [PATCH 05/20] feat(datetime): route AddMonths through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/add_months.sql | 43 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 8d48239e76..6ac8393bc6 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -219,6 +219,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { private[comet] val temporalExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + classOf[AddMonths] -> CometAddMonths, classOf[ConvertTimezone] -> CometConvertTimezone, classOf[DateAdd] -> CometDateAdd, classOf[DateDiff] -> CometDateDiff, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 24ca862fb9..865d9aaffd 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -771,3 +771,5 @@ object CometDays extends CometExpressionSerde[Days] { optExprWithInfo(optExpr, expr, expr.child) } } + +object CometAddMonths extends CometCodegenDispatch[AddMonths] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql b/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql new file mode 100644 index 0000000000..47c3957812 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/add_months.sql @@ -0,0 +1,43 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes add_months through the codegen dispatcher. Spark's own AddMonths.doGenCode +-- runs inside the Janino-compiled kernel. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_add_months(d date, n int) USING parquet + +statement +INSERT INTO test_add_months VALUES + (date('2024-01-15'), 1), + (date('2024-01-31'), 1), + (date('2024-12-15'), -13), + (date('1970-01-01'), 0), + (NULL, 1), + (date('2024-06-15'), NULL) + +query +SELECT add_months(d, n) FROM test_add_months + +query +SELECT add_months(d, 12) FROM test_add_months + +-- literal arguments +query +SELECT add_months(date('2024-02-29'), 12) From 15b4193cd5806635eabf40dbcd1213b1442eaaad Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 08:52:51 -0600 Subject: [PATCH 06/20] feat(datetime): route MonthsBetween through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/months_between.sql | 41 +++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/months_between.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 6ac8393bc6..3f03eb190b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -235,6 +235,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[LastDay] -> CometLastDay, classOf[Hour] -> CometHour, classOf[MakeDate] -> CometMakeDate, + classOf[MonthsBetween] -> CometMonthsBetween, classOf[Minute] -> CometMinute, classOf[NextDay] -> CometNextDay, classOf[Second] -> CometSecond, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 865d9aaffd..015154ff03 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -773,3 +773,5 @@ object CometDays extends CometExpressionSerde[Days] { } object CometAddMonths extends CometCodegenDispatch[AddMonths] + +object CometMonthsBetween extends CometCodegenDispatch[MonthsBetween] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/months_between.sql b/spark/src/test/resources/sql-tests/expressions/datetime/months_between.sql new file mode 100644 index 0000000000..6086578013 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/months_between.sql @@ -0,0 +1,41 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes months_between through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_months_between(t1 timestamp, t2 timestamp) USING parquet + +statement +INSERT INTO test_months_between VALUES + (timestamp('2024-06-15 10:30:00'), timestamp('2024-01-15 10:30:00')), + (timestamp('2024-01-31 00:00:00'), timestamp('2024-02-29 00:00:00')), + (timestamp('1970-01-01 00:00:00'), timestamp('1970-01-01 00:00:00')), + (NULL, timestamp('2024-01-01 00:00:00')), + (timestamp('2024-01-01 00:00:00'), NULL) + +query +SELECT months_between(t1, t2) FROM test_months_between + +query +SELECT months_between(t1, t2, false) FROM test_months_between + +-- literal arguments +query +SELECT months_between(timestamp('1997-02-28 10:30:00'), timestamp('1996-10-30 00:00:00')) From 4d352fb3e3fa671a1517af207656d928072db16c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:00:25 -0600 Subject: [PATCH 07/20] feat(datetime): route MakeTimestamp through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/make_timestamp.sql | 42 +++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 3f03eb190b..5fdb3274c7 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -235,6 +235,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[LastDay] -> CometLastDay, classOf[Hour] -> CometHour, classOf[MakeDate] -> CometMakeDate, + classOf[MakeTimestamp] -> CometMakeTimestamp, classOf[MonthsBetween] -> CometMonthsBetween, classOf[Minute] -> CometMinute, classOf[NextDay] -> CometNextDay, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 015154ff03..d3cf4f78c5 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -775,3 +775,5 @@ object CometDays extends CometExpressionSerde[Days] { object CometAddMonths extends CometCodegenDispatch[AddMonths] object CometMonthsBetween extends CometCodegenDispatch[MonthsBetween] + +object CometMakeTimestamp extends CometCodegenDispatch[MakeTimestamp] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp.sql new file mode 100644 index 0000000000..07f4ffd8c8 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp.sql @@ -0,0 +1,42 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes make_timestamp through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_make_timestamp(y int, mo int, d int, h int, mi int, s decimal(8,6)) USING parquet + +statement +INSERT INTO test_make_timestamp VALUES + (2024, 6, 15, 10, 30, 45.123456), + (1970, 1, 1, 0, 0, 0.0), + (2024, 12, 31, 23, 59, 59.999999), + (NULL, 6, 15, 10, 30, 45.0), + (2024, NULL, 15, 10, 30, 45.0) + +query +SELECT make_timestamp(y, mo, d, h, mi, s) FROM test_make_timestamp + +-- Explicit timezone argument +query +SELECT make_timestamp(y, mo, d, h, mi, s, 'UTC') FROM test_make_timestamp + +-- literal arguments +query +SELECT make_timestamp(2024, 6, 15, 10, 30, 45.000) From 6de2d2db1bdc871bee0beb2b629d7eec21920a9d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:13:26 -0600 Subject: [PATCH 08/20] feat(datetime): route MillisToTimestamp through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 ++- .../expressions/datetime/timestamp_millis.sql | 33 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 5fdb3274c7..e556af4c8e 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -236,6 +236,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[Hour] -> CometHour, classOf[MakeDate] -> CometMakeDate, classOf[MakeTimestamp] -> CometMakeTimestamp, + classOf[MillisToTimestamp] -> CometMillisToTimestamp, classOf[MonthsBetween] -> CometMonthsBetween, classOf[Minute] -> CometMinute, classOf[NextDay] -> CometNextDay, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index d3cf4f78c5..72c7571b74 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -777,3 +777,5 @@ object CometAddMonths extends CometCodegenDispatch[AddMonths] object CometMonthsBetween extends CometCodegenDispatch[MonthsBetween] object CometMakeTimestamp extends CometCodegenDispatch[MakeTimestamp] + +object CometMillisToTimestamp extends CometCodegenDispatch[MillisToTimestamp] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis.sql b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis.sql new file mode 100644 index 0000000000..1ede954c9e --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis.sql @@ -0,0 +1,33 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes timestamp_millis through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_timestamp_millis(v long) USING parquet + +statement +INSERT INTO test_timestamp_millis VALUES (0), (1718451045000), (-1), (NULL), (86400000) + +query +SELECT timestamp_millis(v) FROM test_timestamp_millis + +-- literal arguments +query +SELECT timestamp_millis(1718451045000) From d22e355f7d2941afbd3ea15b61397b81ee506f8c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:21:14 -0600 Subject: [PATCH 09/20] feat(datetime): route MicrosToTimestamp through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 ++- .../expressions/datetime/timestamp_micros.sql | 33 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/timestamp_micros.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index e556af4c8e..c1e1e66f7b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -236,6 +236,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[Hour] -> CometHour, classOf[MakeDate] -> CometMakeDate, classOf[MakeTimestamp] -> CometMakeTimestamp, + classOf[MicrosToTimestamp] -> CometMicrosToTimestamp, classOf[MillisToTimestamp] -> CometMillisToTimestamp, classOf[MonthsBetween] -> CometMonthsBetween, classOf[Minute] -> CometMinute, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 72c7571b74..eec0dc8b44 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -778,4 +778,6 @@ object CometMonthsBetween extends CometCodegenDispatch[MonthsBetween] object CometMakeTimestamp extends CometCodegenDispatch[MakeTimestamp] +object CometMicrosToTimestamp extends CometCodegenDispatch[MicrosToTimestamp] + object CometMillisToTimestamp extends CometCodegenDispatch[MillisToTimestamp] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_micros.sql b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_micros.sql new file mode 100644 index 0000000000..253a97f95f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_micros.sql @@ -0,0 +1,33 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes timestamp_micros through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_timestamp_micros(v long) USING parquet + +statement +INSERT INTO test_timestamp_micros VALUES (0), (1718451045000000), (-1), (NULL), (86400000000) + +query +SELECT timestamp_micros(v) FROM test_timestamp_micros + +-- literal arguments +query +SELECT timestamp_micros(1718451045000000) From 2d0a117642e7a1805b45e09b29641ec016ec9d0a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:28:41 -0600 Subject: [PATCH 10/20] feat(datetime): route UnixSeconds through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/unix_seconds.sql | 37 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index c1e1e66f7b..8eb218bc82 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -245,6 +245,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[SecondsToTimestamp] -> CometSecondsToTimestamp, classOf[TruncDate] -> CometTruncDate, classOf[TruncTimestamp] -> CometTruncTimestamp, + classOf[UnixSeconds] -> CometUnixSeconds, classOf[UnixTimestamp] -> CometUnixTimestamp, classOf[Year] -> CometYear, classOf[Month] -> CometMonth, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index eec0dc8b44..8cade0e3d1 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -781,3 +781,5 @@ object CometMakeTimestamp extends CometCodegenDispatch[MakeTimestamp] object CometMicrosToTimestamp extends CometCodegenDispatch[MicrosToTimestamp] object CometMillisToTimestamp extends CometCodegenDispatch[MillisToTimestamp] + +object CometUnixSeconds extends CometCodegenDispatch[UnixSeconds] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql new file mode 100644 index 0000000000..a8d5ba92a1 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_seconds.sql @@ -0,0 +1,37 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes unix_seconds through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_unix_seconds(ts timestamp) USING parquet + +statement +INSERT INTO test_unix_seconds VALUES + (timestamp('2024-06-15 10:30:45')), + (timestamp('1970-01-01 00:00:00')), + (timestamp('1969-12-31 23:59:59')), + (NULL) + +query +SELECT unix_seconds(ts) FROM test_unix_seconds + +-- literal argument +query +SELECT unix_seconds(timestamp('2024-06-15 10:30:45.123')) From 2826f15d7cb1d5abfc940e382141a66c5f73ed41 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:33:30 -0600 Subject: [PATCH 11/20] feat(datetime): route UnixMillis through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/unix_millis.sql | 37 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 8eb218bc82..8e5c0bd6de 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -245,6 +245,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[SecondsToTimestamp] -> CometSecondsToTimestamp, classOf[TruncDate] -> CometTruncDate, classOf[TruncTimestamp] -> CometTruncTimestamp, + classOf[UnixMillis] -> CometUnixMillis, classOf[UnixSeconds] -> CometUnixSeconds, classOf[UnixTimestamp] -> CometUnixTimestamp, classOf[Year] -> CometYear, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 8cade0e3d1..82dc6eeda4 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -783,3 +783,5 @@ object CometMicrosToTimestamp extends CometCodegenDispatch[MicrosToTimestamp] object CometMillisToTimestamp extends CometCodegenDispatch[MillisToTimestamp] object CometUnixSeconds extends CometCodegenDispatch[UnixSeconds] + +object CometUnixMillis extends CometCodegenDispatch[UnixMillis] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql new file mode 100644 index 0000000000..4f13ee8bf3 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_millis.sql @@ -0,0 +1,37 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes unix_millis through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_unix_millis(ts timestamp) USING parquet + +statement +INSERT INTO test_unix_millis VALUES + (timestamp('2024-06-15 10:30:45.123')), + (timestamp('1970-01-01 00:00:00')), + (timestamp('1969-12-31 23:59:59.999')), + (NULL) + +query +SELECT unix_millis(ts) FROM test_unix_millis + +-- literal argument +query +SELECT unix_millis(timestamp('2024-06-15 10:30:45.123456')) From f4e1c0e20e746e5feb487506732619f14123c819 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:47:45 -0600 Subject: [PATCH 12/20] feat(datetime): route UnixMicros through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../expressions/datetime/unix_micros.sql | 37 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 8e5c0bd6de..59aa4d30fe 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -245,6 +245,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[SecondsToTimestamp] -> CometSecondsToTimestamp, classOf[TruncDate] -> CometTruncDate, classOf[TruncTimestamp] -> CometTruncTimestamp, + classOf[UnixMicros] -> CometUnixMicros, classOf[UnixMillis] -> CometUnixMillis, classOf[UnixSeconds] -> CometUnixSeconds, classOf[UnixTimestamp] -> CometUnixTimestamp, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 82dc6eeda4..6197894f75 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -785,3 +785,5 @@ object CometMillisToTimestamp extends CometCodegenDispatch[MillisToTimestamp] object CometUnixSeconds extends CometCodegenDispatch[UnixSeconds] object CometUnixMillis extends CometCodegenDispatch[UnixMillis] + +object CometUnixMicros extends CometCodegenDispatch[UnixMicros] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql new file mode 100644 index 0000000000..07c0b1d715 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_micros.sql @@ -0,0 +1,37 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes unix_micros through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_unix_micros(ts timestamp) USING parquet + +statement +INSERT INTO test_unix_micros VALUES + (timestamp('2024-06-15 10:30:45.123456')), + (timestamp('1970-01-01 00:00:00')), + (timestamp('1969-12-31 23:59:59.999999')), + (NULL) + +query +SELECT unix_micros(ts) FROM test_unix_micros + +-- literal argument +query +SELECT unix_micros(timestamp('2024-06-15 10:30:45.123456')) From eebeef8ed747d6fd08daeffb27d472c7aa06e062 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 09:57:28 -0600 Subject: [PATCH 13/20] feat(datetime): route ToUnixTimestamp through codegen dispatcher [skip ci] --- .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/datetime.scala | 4 +- .../datetime/to_unix_timestamp.sql | 40 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp.sql diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 59aa4d30fe..bbb64133cf 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -245,6 +245,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[SecondsToTimestamp] -> CometSecondsToTimestamp, classOf[TruncDate] -> CometTruncDate, classOf[TruncTimestamp] -> CometTruncTimestamp, + classOf[ToUnixTimestamp] -> CometToUnixTimestamp, classOf[UnixMicros] -> CometUnixMicros, classOf[UnixMillis] -> CometUnixMillis, classOf[UnixSeconds] -> CometUnixSeconds, diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 6197894f75..23da6d8671 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, ToUnixTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -787,3 +787,5 @@ object CometUnixSeconds extends CometCodegenDispatch[UnixSeconds] object CometUnixMillis extends CometCodegenDispatch[UnixMillis] object CometUnixMicros extends CometCodegenDispatch[UnixMicros] + +object CometToUnixTimestamp extends CometCodegenDispatch[ToUnixTimestamp] diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp.sql new file mode 100644 index 0000000000..7654d5ff32 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp.sql @@ -0,0 +1,40 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Routes to_unix_timestamp through the codegen dispatcher. +-- Config: spark.sql.session.timeZone=America/Los_Angeles +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +statement +CREATE TABLE test_to_unix_timestamp(s string) USING parquet + +statement +INSERT INTO test_to_unix_timestamp VALUES + ('2024-06-15 10:30:45'), + ('1970-01-01 00:00:00'), + ('1969-12-31 23:59:59'), + (NULL) + +query +SELECT to_unix_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_to_unix_timestamp + +query +SELECT to_unix_timestamp(s) FROM test_to_unix_timestamp + +-- literal argument +query +SELECT to_unix_timestamp('2024-06-15 10:30:45', 'yyyy-MM-dd HH:mm:ss') From 771f367493cc4592c427df492d3f24bf7de614c5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 10:08:23 -0600 Subject: [PATCH 14/20] test(datetime): ANSI fixtures for codegen-routed throw-capable expressions [skip ci] --- .../datetime/make_timestamp_ansi.sql | 35 +++++++++++++++++++ .../datetime/timestamp_millis_ansi.sql | 28 +++++++++++++++ .../datetime/to_unix_timestamp_ansi.sql | 28 +++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql new file mode 100644 index 0000000000..52ca3b7972 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: make_timestamp throws on out-of-range argument values. With the codegen +-- dispatcher enabled, Spark's own MakeTimestamp.doGenCode produces the throw site, so +-- Comet's kernel raises the same exception as Spark. +-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +-- month out of range +query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) +SELECT make_timestamp(2024, 13, 1, 0, 0, 0) + +-- day out of range +query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) +SELECT make_timestamp(2024, 2, 30, 0, 0, 0) + +-- hour out of range +query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) +SELECT make_timestamp(2024, 6, 15, 25, 0, 0) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql new file mode 100644 index 0000000000..3a16dc5dad --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql @@ -0,0 +1,28 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: timestamp_millis throws on overflow. The codegen dispatcher inherits the +-- throw from Spark's own MillisToTimestamp.doGenCode. +-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +query expect_error(overflow) +SELECT timestamp_millis(9223372036854775807L) + +query expect_error(overflow) +SELECT timestamp_millis(-9223372036854775808L) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql new file mode 100644 index 0000000000..5febafaecc --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql @@ -0,0 +1,28 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: to_unix_timestamp throws on parse failure. The codegen dispatcher inherits +-- the throw from Spark's own ToUnixTimestamp.doGenCode. +-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true + +query expect_error(CANNOT_PARSE_TIMESTAMP) +SELECT to_unix_timestamp('not a date', 'yyyy-MM-dd') + +query expect_error(CANNOT_PARSE_TIMESTAMP) +SELECT to_unix_timestamp('2024-13-99', 'yyyy-MM-dd') From 58713398a494e386acb4c22cad79ecf6853fac5a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 10:11:53 -0600 Subject: [PATCH 15/20] test(codegen): unit coverage for Bucket 4 datetime kernel source [skip ci] --- .../comet/CometCodegenSourceSuite.scala | 81 ++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala index 274b70bce1..aad8b41526 100644 --- a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala @@ -22,7 +22,7 @@ package org.apache.comet import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Add, BoundReference, Coalesce, Concat, CreateArray, CreateMap, DateFormatClass, ElementAt, Expression, GetStructField, LeafExpression, Length, Literal, Nondeterministic, Rand, Size, Unevaluable, Upper} +import org.apache.spark.sql.catalyst.expressions.{Add, AddMonths, BoundReference, Coalesce, Concat, CreateArray, CreateMap, DateFormatClass, ElementAt, Expression, GetStructField, LeafExpression, Length, Literal, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, MonthsBetween, Nondeterministic, Rand, Size, ToUnixTimestamp, Unevaluable, UnixMicros, UnixMillis, UnixSeconds, Upper} import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -1054,6 +1054,85 @@ class CometCodegenSourceSuite extends AnyFunSuite { s"expected no null guard on non-nullable map field; got:\n$src") } + // Bucket 4 datetime expressions routed through CometCodegenDispatch. Each entry pairs a + // bound Catalyst expression with the Arrow column specs the kernel would see at runtime. + // The test asserts `generateSource` returns a non-empty body, which means Spark's own + // `doGenCode` succeeded under the codegen context (no NotImplementedError, no rewrite to + // a CodegenFallback path). + test("Bucket 4 datetime expressions produce non-empty generated kernel source") { + val intCol = ArrowColumnSpec( + CometBatchKernelCodegen.vectorClassBySimpleName("IntVector"), + nullable = true) + val longCol = ArrowColumnSpec( + CometBatchKernelCodegen.vectorClassBySimpleName("BigIntVector"), + nullable = true) + val decCol = ArrowColumnSpec( + CometBatchKernelCodegen.vectorClassBySimpleName("DecimalVector"), + nullable = true) + val dateCol = ArrowColumnSpec( + CometBatchKernelCodegen.vectorClassBySimpleName("DateDayVector"), + nullable = true) + val tsCol = ArrowColumnSpec( + CometBatchKernelCodegen.vectorClassBySimpleName("TimeStampMicroTZVector"), + nullable = true) + val strCol = ArrowColumnSpec( + CometBatchKernelCodegen.vectorClassBySimpleName("VarCharVector"), + nullable = true) + + val cases: Seq[(String, Expression, IndexedSeq[ArrowColumnSpec])] = Seq( + ("AddMonths", + AddMonths( + BoundReference(0, DateType, nullable = true), + BoundReference(1, IntegerType, nullable = true)), + IndexedSeq(dateCol, intCol)), + ("MonthsBetween", + MonthsBetween( + BoundReference(0, TimestampType, nullable = true), + BoundReference(1, TimestampType, nullable = true), + Literal(true), + Some("UTC")), + IndexedSeq(tsCol, tsCol)), + ("MakeTimestamp", + MakeTimestamp( + BoundReference(0, IntegerType, nullable = true), + BoundReference(1, IntegerType, nullable = true), + BoundReference(2, IntegerType, nullable = true), + BoundReference(3, IntegerType, nullable = true), + BoundReference(4, IntegerType, nullable = true), + BoundReference(5, DecimalType(8, 6), nullable = true), + timezone = None, + timeZoneId = Some("UTC")), + IndexedSeq(intCol, intCol, intCol, intCol, intCol, decCol)), + ("MillisToTimestamp", + MillisToTimestamp(BoundReference(0, LongType, nullable = true)), + IndexedSeq(longCol)), + ("MicrosToTimestamp", + MicrosToTimestamp(BoundReference(0, LongType, nullable = true)), + IndexedSeq(longCol)), + ("UnixSeconds", + UnixSeconds(BoundReference(0, TimestampType, nullable = true)), + IndexedSeq(tsCol)), + ("UnixMillis", + UnixMillis(BoundReference(0, TimestampType, nullable = true)), + IndexedSeq(tsCol)), + ("UnixMicros", + UnixMicros(BoundReference(0, TimestampType, nullable = true)), + IndexedSeq(tsCol)), + ("ToUnixTimestamp", + ToUnixTimestamp( + BoundReference(0, StringType, nullable = true), + Literal(UTF8String.fromString("yyyy-MM-dd HH:mm:ss"), StringType), + Some("UTC")), + IndexedSeq(strCol))) + cases.foreach { case (name, expr, specs) => + val src = CometBatchKernelCodegen.generateSource(expr, specs).body + assert(src.nonEmpty, s"$name: expected non-empty generated source") + assert( + src.contains("public java.lang.Object generate(Object[] references)"), + s"$name: generated source missing kernel class entry point") + } + } + test("CacheKey discriminates on ArrowColumnSpec.nullable") { // Structural regression: same expression bytes and same Arrow vector class with different // `nullable` must produce non-equal cache keys. The dispatcher today hardcodes `nullable=true` From fadcbfee185f287d5516680a7b61e1efd0f20f56 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 10:13:25 -0600 Subject: [PATCH 16/20] style: apply spotless formatting --- .../apache/comet/serde/CometScalaUDF.scala | 10 +++---- .../org/apache/comet/serde/datetime.scala | 2 +- .../comet/CometCodegenSourceSuite.scala | 27 ++++++++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala index 0e1ca86fa2..7499fd5047 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala @@ -125,18 +125,14 @@ object CometScalaUDF extends CometExpressionSerde[ScalaUDF] { * Convenience base for serdes that route a non-ScalaUDF Spark expression through the codegen * dispatcher. Delegates `convert` to [[CometScalaUDF.emitJvmCodegenDispatch]] and marks the * expression `Compatible()` because the dispatcher runs Spark's own `doGenCode` inside the - * kernel: behavior matches Spark exactly when - * [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]] is enabled, and the operator falls back to - * Spark cleanly when it is not. + * kernel: behavior matches Spark exactly when [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]] is + * enabled, and the operator falls back to Spark cleanly when it is not. */ class CometCodegenDispatch[T <: Expression] extends CometExpressionSerde[T] { override def getSupportLevel(expr: T): SupportLevel = Compatible() override def getCompatibleNotes(): Seq[String] = Seq( "Runs via the Arrow-direct codegen dispatcher when " + "spark.comet.exec.scalaUDF.codegen.enabled=true. Default behavior falls back to Spark.") - override def convert( - expr: T, - inputs: Seq[Attribute], - binding: Boolean): Option[Expr] = + override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding) } diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 23da6d8671..e2995274ad 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, ToUnixTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUnixTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala index aad8b41526..e12a0dd147 100644 --- a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala @@ -1080,19 +1080,22 @@ class CometCodegenSourceSuite extends AnyFunSuite { nullable = true) val cases: Seq[(String, Expression, IndexedSeq[ArrowColumnSpec])] = Seq( - ("AddMonths", + ( + "AddMonths", AddMonths( BoundReference(0, DateType, nullable = true), BoundReference(1, IntegerType, nullable = true)), IndexedSeq(dateCol, intCol)), - ("MonthsBetween", + ( + "MonthsBetween", MonthsBetween( BoundReference(0, TimestampType, nullable = true), BoundReference(1, TimestampType, nullable = true), Literal(true), Some("UTC")), IndexedSeq(tsCol, tsCol)), - ("MakeTimestamp", + ( + "MakeTimestamp", MakeTimestamp( BoundReference(0, IntegerType, nullable = true), BoundReference(1, IntegerType, nullable = true), @@ -1103,22 +1106,28 @@ class CometCodegenSourceSuite extends AnyFunSuite { timezone = None, timeZoneId = Some("UTC")), IndexedSeq(intCol, intCol, intCol, intCol, intCol, decCol)), - ("MillisToTimestamp", + ( + "MillisToTimestamp", MillisToTimestamp(BoundReference(0, LongType, nullable = true)), IndexedSeq(longCol)), - ("MicrosToTimestamp", + ( + "MicrosToTimestamp", MicrosToTimestamp(BoundReference(0, LongType, nullable = true)), IndexedSeq(longCol)), - ("UnixSeconds", + ( + "UnixSeconds", UnixSeconds(BoundReference(0, TimestampType, nullable = true)), IndexedSeq(tsCol)), - ("UnixMillis", + ( + "UnixMillis", UnixMillis(BoundReference(0, TimestampType, nullable = true)), IndexedSeq(tsCol)), - ("UnixMicros", + ( + "UnixMicros", UnixMicros(BoundReference(0, TimestampType, nullable = true)), IndexedSeq(tsCol)), - ("ToUnixTimestamp", + ( + "ToUnixTimestamp", ToUnixTimestamp( BoundReference(0, StringType, nullable = true), Literal(UTF8String.fromString("yyyy-MM-dd HH:mm:ss"), StringType), From dd1723f8d33cb3ca00d864ed02c4acdc6ee246c3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 12:02:59 -0600 Subject: [PATCH 17/20] fix: address code-review feedback for codegen-routed datetime fixtures - Drop the getCompatibleNotes override on CometCodegenDispatch. The docs generator emits compat notes under a heading promising 'no additional configuration', which contradicts a note describing the dispatcher flag. Keep getSupportLevel=Compatible and surface the flag dependency via withInfo / EXPLAIN instead. - Add a sentinel non-error query to each *_ansi.sql fixture. The expect_error semantics pass vacuously when the dispatcher silently falls back to Spark (both paths throw identical exceptions); the sentinel uses checkSparkAnswerAndOperator and fails if Comet did not run the expression natively. - Pin spark.sql.legacy.timeParserPolicy=CORRECTED in to_unix_timestamp_ansi.sql so the JDK java.time formatter is exercised regardless of runtime default; LEGACY policy uses SimpleDateFormat with a different exception class. - Annotate the three ANSI fixtures with MinSparkVersion: 3.5 since the DATETIME_FIELD_OUT_OF_BOUNDS and CANNOT_PARSE_TIMESTAMP error classes were standardized in Spark 3.5. Spark 3.4 coverage is delivered separately. --- .../org/apache/comet/serde/CometScalaUDF.scala | 8 +++++--- .../datetime/make_timestamp_ansi.sql | 18 +++++++++++++++++- .../datetime/timestamp_millis_ansi.sql | 6 ++++++ .../datetime/to_unix_timestamp_ansi.sql | 15 ++++++++++++++- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala index 7499fd5047..852e80ae44 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala @@ -130,9 +130,11 @@ object CometScalaUDF extends CometExpressionSerde[ScalaUDF] { */ class CometCodegenDispatch[T <: Expression] extends CometExpressionSerde[T] { override def getSupportLevel(expr: T): SupportLevel = Compatible() - override def getCompatibleNotes(): Seq[String] = Seq( - "Runs via the Arrow-direct codegen dispatcher when " + - "spark.comet.exec.scalaUDF.codegen.enabled=true. Default behavior falls back to Spark.") + // Intentionally no getCompatibleNotes override: the docs generator emits compat notes under + // a heading that promises "no additional configuration required". The dispatcher flag is a + // global concern documented elsewhere; tagging each expression here would contradict the + // heading. When the flag is off, `convert` returns None with a clear withInfo reason that + // shows up in EXPLAIN, which is the right place for that signal. override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding) } diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql index 52ca3b7972..ac90ee3f82 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql @@ -17,10 +17,18 @@ -- ANSI mode: make_timestamp throws on out-of-range argument values. With the codegen -- dispatcher enabled, Spark's own MakeTimestamp.doGenCode produces the throw site, so --- Comet's kernel raises the same exception as Spark. +-- Comet's kernel raises the same exception as Spark. The expect_error substring matches +-- the DATETIME_FIELD_OUT_OF_BOUNDS error class that Spark 3.5+ wraps all three cases in +-- (the inner JDK message text varies: "Invalid value for MonthOfYear", "Invalid date +-- 'FEBRUARY 30'", "Invalid value for HourOfDay"); the error class is the only stable +-- common substring. -- Config: spark.sql.session.timeZone=UTC -- Config: spark.sql.ansi.enabled=true -- Config: spark.comet.exec.scalaUDF.codegen.enabled=true +-- The DATETIME_FIELD_OUT_OF_BOUNDS error class was standardized in Spark 3.5; earlier +-- versions wrap the JDK DateTimeException with a generic _LEGACY_ERROR_TEMP_ code whose +-- message does not contain that substring. +-- MinSparkVersion: 3.5 -- month out of range query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) @@ -33,3 +41,11 @@ SELECT make_timestamp(2024, 2, 30, 0, 0, 0) -- hour out of range query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) SELECT make_timestamp(2024, 6, 15, 25, 0, 0) + +-- Sentinel: a valid input must still execute on the Comet codegen path. If the dispatcher +-- silently rejects MakeTimestamp at runtime, the operator falls back to Spark and the +-- error queries above pass vacuously (Spark and fallback throw identical messages). This +-- non-error query uses `checkSparkAnswerAndOperator` which fails if Comet did not run the +-- expression natively, so a regression that breaks the dispatch path surfaces here. +query +SELECT make_timestamp(2024, 6, 15, 10, 30, 45.0) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql index 3a16dc5dad..d3c26a99d2 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/timestamp_millis_ansi.sql @@ -26,3 +26,9 @@ SELECT timestamp_millis(9223372036854775807L) query expect_error(overflow) SELECT timestamp_millis(-9223372036854775808L) + +-- Sentinel: confirms Comet ran the expression natively. If the dispatcher silently rejects +-- MillisToTimestamp, the error queries above pass vacuously via Spark fallback. This valid +-- query uses `checkSparkAnswerAndOperator` and fails if Comet did not execute it natively. +query +SELECT timestamp_millis(1718451045000) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql index 5febafaecc..09250b9b06 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi.sql @@ -16,13 +16,26 @@ -- under the License. -- ANSI mode: to_unix_timestamp throws on parse failure. The codegen dispatcher inherits --- the throw from Spark's own ToUnixTimestamp.doGenCode. +-- the throw from Spark's own ToUnixTimestamp.doGenCode. The time parser policy is pinned +-- to CORRECTED so the JDK java.time formatter (and the CANNOT_PARSE_TIMESTAMP error class) +-- is exercised regardless of the runtime default — LEGACY uses SimpleDateFormat with a +-- different exception text. -- Config: spark.sql.session.timeZone=UTC -- Config: spark.sql.ansi.enabled=true +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED -- Config: spark.comet.exec.scalaUDF.codegen.enabled=true +-- The CANNOT_PARSE_TIMESTAMP error class was standardized in Spark 3.5; earlier versions +-- surface a different SparkUpgradeException / DateTimeParseException wording. +-- MinSparkVersion: 3.5 query expect_error(CANNOT_PARSE_TIMESTAMP) SELECT to_unix_timestamp('not a date', 'yyyy-MM-dd') query expect_error(CANNOT_PARSE_TIMESTAMP) SELECT to_unix_timestamp('2024-13-99', 'yyyy-MM-dd') + +-- Sentinel: confirms Comet ran the expression natively. If the dispatcher silently rejects +-- ToUnixTimestamp, the error queries above pass vacuously via Spark fallback. This valid +-- query uses `checkSparkAnswerAndOperator` and fails if Comet did not execute it natively. +query +SELECT to_unix_timestamp('2024-06-15 10:30:45', 'yyyy-MM-dd HH:mm:ss') From 6a22b63c67b89acbdb4e461e4d4f4fb86b82da07 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 12:03:12 -0600 Subject: [PATCH 18/20] test: add MaxSparkVersion annotation and Spark 3.4 ANSI fixtures Mirror the existing MinSparkVersion gate with a MaxSparkVersion gate so SQL fixtures can pair a 3.5+ variant (using post-3.5 error class names) with a 3.4 variant (using the pre-classification JDK java.time exception text). The make_timestamp and to_unix_timestamp ANSI exception paths produce different exception wording on Spark 3.4 versus 3.5+; before this commit only the 3.5+ side had coverage and 3.4 ANSI behavior went untested. Framework: - SqlTestFile gains maxSparkVersion: Option[String]. - SqlFileTestParser recognizes -- MaxSparkVersion: lines. - CometSqlFileTestSuite gains meetsMaxSparkVersion / skipReason helpers; the skip-and-log path now reports whether the constraint was a floor or ceiling. Coverage: - make_timestamp_ansi_spark34.sql: MaxSparkVersion: 3.4, expect_error patterns target the JDK DateTimeException field-name text (MonthOfYear, Invalid date, HourOfDay) which is stable in 3.4's pre-classification error path. - to_unix_timestamp_ansi_spark34.sql: MaxSparkVersion: 3.4, expect_error pattern targets the JDK DateTimeParseException 'could not be parsed' wording. --- .../datetime/make_timestamp_ansi_spark34.sql | 46 +++++++++++++++++ .../to_unix_timestamp_ansi_spark34.sql | 39 +++++++++++++++ .../apache/comet/CometSqlFileTestSuite.scala | 49 ++++++++++++++----- .../org/apache/comet/SqlFileTestParser.scala | 24 +++++++-- 4 files changed, 144 insertions(+), 14 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi_spark34.sql diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql new file mode 100644 index 0000000000..ee09ed98f0 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql @@ -0,0 +1,46 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: make_timestamp throws on out-of-range argument values. Same coverage as +-- make_timestamp_ansi.sql, but the expect_error substrings target the JDK java.time +-- DateTimeException message text directly because Spark 3.4 does not yet wrap these in +-- the DATETIME_FIELD_OUT_OF_BOUNDS error class (that classification arrived in Spark 3.5). +-- The inner JDK message text is stable on Spark 3.4: "Invalid value for MonthOfYear", +-- "Invalid date 'FEBRUARY 30'", "Invalid value for HourOfDay". +-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true +-- MaxSparkVersion: 3.4 + +-- month out of range +query expect_error(MonthOfYear) +SELECT make_timestamp(2024, 13, 1, 0, 0, 0) + +-- day out of range +query expect_error(Invalid date) +SELECT make_timestamp(2024, 2, 30, 0, 0, 0) + +-- hour out of range +query expect_error(HourOfDay) +SELECT make_timestamp(2024, 6, 15, 25, 0, 0) + +-- Sentinel: a valid input must still execute on the Comet codegen path. If the dispatcher +-- silently rejects MakeTimestamp at runtime, the error queries above pass vacuously +-- (Spark and fallback throw identical messages). This non-error query uses +-- `checkSparkAnswerAndOperator` which fails if Comet did not run the expression natively. +query +SELECT make_timestamp(2024, 6, 15, 10, 30, 45.0) diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi_spark34.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi_spark34.sql new file mode 100644 index 0000000000..4fed3ff654 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_ansi_spark34.sql @@ -0,0 +1,39 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ANSI mode: to_unix_timestamp throws on parse failure. Same coverage as +-- to_unix_timestamp_ansi.sql, but the expect_error substring targets the JDK +-- DateTimeParseException message text because Spark 3.4 does not yet wrap these in the +-- CANNOT_PARSE_TIMESTAMP error class (that classification arrived in Spark 3.5). The JDK +-- message "Text 'not a date' could not be parsed" is stable on Spark 3.4. +-- Config: spark.sql.session.timeZone=UTC +-- Config: spark.sql.ansi.enabled=true +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.comet.exec.scalaUDF.codegen.enabled=true +-- MaxSparkVersion: 3.4 + +query expect_error(could not be parsed) +SELECT to_unix_timestamp('not a date', 'yyyy-MM-dd') + +query expect_error(could not be parsed) +SELECT to_unix_timestamp('2024-13-99', 'yyyy-MM-dd') + +-- Sentinel: confirms Comet ran the expression natively. If the dispatcher silently rejects +-- ToUnixTimestamp, the error queries above pass vacuously via Spark fallback. This valid +-- query uses `checkSparkAnswerAndOperator` and fails if Comet did not execute it natively. +query +SELECT to_unix_timestamp('2024-06-15 10:30:45', 'yyyy-MM-dd HH:mm:ss') diff --git a/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala b/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala index 47642f2357..254eec1aa3 100644 --- a/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala @@ -35,6 +35,31 @@ class CometSqlFileTestSuite extends CometTestBase with AdaptiveSparkPlanHelper { (current(0) == required(0) && current(1) >= required(1)) } + /** + * Check if the current Spark version is at or below a maximum version. Used by paired + * fixtures where each version range has its own expected error class or output format. + */ + private def meetsMaxSparkVersion(maxVersion: String): Boolean = { + val current = org.apache.spark.SPARK_VERSION.split("[.-]").take(2).map(_.toInt) + val ceiling = maxVersion.split("[.-]").take(2).map(_.toInt) + (current(0) < ceiling(0)) || + (current(0) == ceiling(0) && current(1) <= ceiling(1)) + } + + /** + * Build a human-readable reason string describing why a fixture is skipped on the current + * Spark version. Returns None when both constraints are satisfied. + */ + private def skipReason(parsed: SqlTestFile): Option[String] = { + val minViolation = parsed.minSparkVersion.filter(!meetsMinSparkVersion(_)) + val maxViolation = parsed.maxSparkVersion.filter(!meetsMaxSparkVersion(_)) + (minViolation, maxViolation) match { + case (Some(m), _) => Some(s"requires Spark >= $m") + case (_, Some(m)) => Some(s"requires Spark <= $m") + case _ => None + } + } + private val testResourceDir = { val url = getClass.getClassLoader.getResource("sql-tests") assert(url != null, "Could not find sql-tests resource directory") @@ -133,17 +158,18 @@ class CometSqlFileTestSuite extends CometTestBase with AdaptiveSparkPlanHelper { val parsed = SqlFileTestParser.parse(file) val combinations = configMatrix(parsed.configMatrix) - // Skip tests that require a newer Spark version - val skip = parsed.minSparkVersion.exists(!meetsMinSparkVersion(_)) + // Skip tests that fall outside the file's declared Spark version range. + val skip = skipReason(parsed) if (combinations.size <= 1) { // No matrix or single combination test(s"sql-file: $relativePath") { - if (skip) { - logInfo(s"SKIPPED (requires Spark ${parsed.minSparkVersion.get}): $relativePath") - } else { - val effectiveConfigs = parsed.configs ++ combinations.headOption.getOrElse(Seq.empty) - runTestFile(relativePath, parsed.copy(configs = effectiveConfigs)) + skip match { + case Some(reason) => + logInfo(s"SKIPPED ($reason): $relativePath") + case None => + val effectiveConfigs = parsed.configs ++ combinations.headOption.getOrElse(Seq.empty) + runTestFile(relativePath, parsed.copy(configs = effectiveConfigs)) } } } else { @@ -151,10 +177,11 @@ class CometSqlFileTestSuite extends CometTestBase with AdaptiveSparkPlanHelper { combinations.foreach { matrixConfigs => val label = matrixConfigs.map { case (k, v) => s"$k=$v" }.mkString(", ") test(s"sql-file: $relativePath [$label]") { - if (skip) { - logInfo(s"SKIPPED (requires Spark ${parsed.minSparkVersion.get}): $relativePath") - } else { - runTestFile(relativePath, parsed.copy(configs = parsed.configs ++ matrixConfigs)) + skip match { + case Some(reason) => + logInfo(s"SKIPPED ($reason): $relativePath") + case None => + runTestFile(relativePath, parsed.copy(configs = parsed.configs ++ matrixConfigs)) } } } diff --git a/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala b/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala index 45198ed176..31512642b6 100644 --- a/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala +++ b/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala @@ -69,20 +69,27 @@ case class ExpectError(pattern: String) extends QueryAssertionMode * @param tables * Table names extracted from CREATE TABLE statements (for cleanup). * @param minSparkVersion - * Optional minimum Spark version required to run this test (e.g. "3.5"). + * Optional minimum Spark version required to run this test (e.g. "3.5"). The test is + * skipped on older versions. + * @param maxSparkVersion + * Optional maximum Spark version this test applies to (e.g. "3.4"). The test is skipped + * on newer versions. Useful for paired fixtures where each version range has its own + * expected error class or output format. */ case class SqlTestFile( configs: Seq[(String, String)], configMatrix: Seq[(String, Seq[String])], records: Seq[SqlTestRecord], tables: Seq[String], - minSparkVersion: Option[String] = None) + minSparkVersion: Option[String] = None, + maxSparkVersion: Option[String] = None) object SqlFileTestParser { private val ConfigPattern = """--\s*Config:\s*(.+)=(.+)""".r private val ConfigMatrixPattern = """--\s*ConfigMatrix:\s*(.+)=(.+)""".r private val MinSparkVersionPattern = """--\s*MinSparkVersion:\s*(.+)""".r + private val MaxSparkVersionPattern = """--\s*MaxSparkVersion:\s*(.+)""".r private val CreateTablePattern = """(?i)CREATE\s+TABLE\s+(\w+)""".r.unanchored def parse(file: File): SqlTestFile = { @@ -98,6 +105,7 @@ object SqlFileTestParser { var configs = Seq.empty[(String, String)] var configMatrix = Seq.empty[(String, Seq[String])] var minSparkVersion: Option[String] = None + var maxSparkVersion: Option[String] = None val records = Seq.newBuilder[SqlTestRecord] val tables = Seq.newBuilder[String] @@ -118,6 +126,10 @@ object SqlFileTestParser { minSparkVersion = Some(version.trim) lineIdx += 1 + case MaxSparkVersionPattern(version) => + maxSparkVersion = Some(version.trim) + lineIdx += 1 + case "statement" => lineIdx += 1 val startLine = lineIdx + 1 @@ -141,7 +153,13 @@ object SqlFileTestParser { } } - SqlTestFile(configs, configMatrix, records.result(), tables.result(), minSparkVersion) + SqlTestFile( + configs, + configMatrix, + records.result(), + tables.result(), + minSparkVersion, + maxSparkVersion) } private val FallbackPattern = """query\s+expect_fallback\((.+)\)""".r From 1b952e773286b63c38acd175a4852ac42916682f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 12:16:12 -0600 Subject: [PATCH 19/20] style: apply spotless to MaxSparkVersion javadoc --- .../scala/org/apache/comet/CometSqlFileTestSuite.scala | 8 ++++---- .../scala/org/apache/comet/SqlFileTestParser.scala | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala b/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala index 254eec1aa3..53f21fe500 100644 --- a/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometSqlFileTestSuite.scala @@ -36,8 +36,8 @@ class CometSqlFileTestSuite extends CometTestBase with AdaptiveSparkPlanHelper { } /** - * Check if the current Spark version is at or below a maximum version. Used by paired - * fixtures where each version range has its own expected error class or output format. + * Check if the current Spark version is at or below a maximum version. Used by paired fixtures + * where each version range has its own expected error class or output format. */ private def meetsMaxSparkVersion(maxVersion: String): Boolean = { val current = org.apache.spark.SPARK_VERSION.split("[.-]").take(2).map(_.toInt) @@ -47,8 +47,8 @@ class CometSqlFileTestSuite extends CometTestBase with AdaptiveSparkPlanHelper { } /** - * Build a human-readable reason string describing why a fixture is skipped on the current - * Spark version. Returns None when both constraints are satisfied. + * Build a human-readable reason string describing why a fixture is skipped on the current Spark + * version. Returns None when both constraints are satisfied. */ private def skipReason(parsed: SqlTestFile): Option[String] = { val minViolation = parsed.minSparkVersion.filter(!meetsMinSparkVersion(_)) diff --git a/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala b/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala index 31512642b6..db01bc166f 100644 --- a/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala +++ b/spark/src/test/scala/org/apache/comet/SqlFileTestParser.scala @@ -69,12 +69,12 @@ case class ExpectError(pattern: String) extends QueryAssertionMode * @param tables * Table names extracted from CREATE TABLE statements (for cleanup). * @param minSparkVersion - * Optional minimum Spark version required to run this test (e.g. "3.5"). The test is - * skipped on older versions. + * Optional minimum Spark version required to run this test (e.g. "3.5"). The test is skipped on + * older versions. * @param maxSparkVersion - * Optional maximum Spark version this test applies to (e.g. "3.4"). The test is skipped - * on newer versions. Useful for paired fixtures where each version range has its own - * expected error class or output format. + * Optional maximum Spark version this test applies to (e.g. "3.4"). The test is skipped on + * newer versions. Useful for paired fixtures where each version range has its own expected + * error class or output format. */ case class SqlTestFile( configs: Seq[(String, String)], From 9789272332ccafa6375e2048e0af76cf7e89e003 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 24 May 2026 13:14:29 -0600 Subject: [PATCH 20/20] fix: use JDK java.time field names in make_timestamp_ansi expect_error CI failed on Spark 3.5.8 because the executor-thrown SparkDateTimeException's getMessage() does NOT preserve the driver-formatted '[DATETIME_FIELD_OUT_OF_BOUNDS]' error-class prefix; only the inner JDK message ('Invalid value for MonthOfYear ...', 'Invalid date FEBRUARY 30', 'Invalid value for HourOfDay ...') survives the 'Job aborted ... Lost task ... SparkDateTimeException: ' wrapping that shows up in the test's caught exception. Switching to the JDK java.time field-name substrings (MonthOfYear, Invalid date, HourOfDay) makes the assertions stable across Spark 3.4, 3.5.x, and 4.x without needing a MinSparkVersion gate, so the make_timestamp_ansi_spark34.sql variant becomes redundant and is deleted in the same commit. Verified locally: passes under -Pspark-3.4 (3.4.3) and -Pspark-3.5 (3.5.8). --- .../datetime/make_timestamp_ansi.sql | 22 ++++----- .../datetime/make_timestamp_ansi_spark34.sql | 46 ------------------- 2 files changed, 10 insertions(+), 58 deletions(-) delete mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql index ac90ee3f82..16ffa09d8b 100644 --- a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql +++ b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi.sql @@ -17,29 +17,27 @@ -- ANSI mode: make_timestamp throws on out-of-range argument values. With the codegen -- dispatcher enabled, Spark's own MakeTimestamp.doGenCode produces the throw site, so --- Comet's kernel raises the same exception as Spark. The expect_error substring matches --- the DATETIME_FIELD_OUT_OF_BOUNDS error class that Spark 3.5+ wraps all three cases in --- (the inner JDK message text varies: "Invalid value for MonthOfYear", "Invalid date --- 'FEBRUARY 30'", "Invalid value for HourOfDay"); the error class is the only stable --- common substring. +-- Comet's kernel raises the same exception as Spark. The expect_error substrings target +-- the inner JDK java.time.DateTimeException message text (which is wrapped in a +-- SparkDateTimeException whose getMessage() preserves it). The driver-formatted error +-- class string `DATETIME_FIELD_OUT_OF_BOUNDS` is NOT preserved when the exception is +-- thrown from a task on the executor side (only the wrapped `Job aborted ... Lost task +-- ... SparkDateTimeException: ` form is preserved), so we match the JDK +-- field names which are stable from Spark 3.4 through 4.x. -- Config: spark.sql.session.timeZone=UTC -- Config: spark.sql.ansi.enabled=true -- Config: spark.comet.exec.scalaUDF.codegen.enabled=true --- The DATETIME_FIELD_OUT_OF_BOUNDS error class was standardized in Spark 3.5; earlier --- versions wrap the JDK DateTimeException with a generic _LEGACY_ERROR_TEMP_ code whose --- message does not contain that substring. --- MinSparkVersion: 3.5 -- month out of range -query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) +query expect_error(MonthOfYear) SELECT make_timestamp(2024, 13, 1, 0, 0, 0) -- day out of range -query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) +query expect_error(Invalid date) SELECT make_timestamp(2024, 2, 30, 0, 0, 0) -- hour out of range -query expect_error(DATETIME_FIELD_OUT_OF_BOUNDS) +query expect_error(HourOfDay) SELECT make_timestamp(2024, 6, 15, 25, 0, 0) -- Sentinel: a valid input must still execute on the Comet codegen path. If the dispatcher diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql b/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql deleted file mode 100644 index ee09ed98f0..0000000000 --- a/spark/src/test/resources/sql-tests/expressions/datetime/make_timestamp_ansi_spark34.sql +++ /dev/null @@ -1,46 +0,0 @@ --- Licensed to the Apache Software Foundation (ASF) under one --- or more contributor license agreements. See the NOTICE file --- distributed with this work for additional information --- regarding copyright ownership. The ASF licenses this file --- to you under the Apache License, Version 2.0 (the --- "License"); you may not use this file except in compliance --- with the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, --- software distributed under the License is distributed on an --- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --- KIND, either express or implied. See the License for the --- specific language governing permissions and limitations --- under the License. - --- ANSI mode: make_timestamp throws on out-of-range argument values. Same coverage as --- make_timestamp_ansi.sql, but the expect_error substrings target the JDK java.time --- DateTimeException message text directly because Spark 3.4 does not yet wrap these in --- the DATETIME_FIELD_OUT_OF_BOUNDS error class (that classification arrived in Spark 3.5). --- The inner JDK message text is stable on Spark 3.4: "Invalid value for MonthOfYear", --- "Invalid date 'FEBRUARY 30'", "Invalid value for HourOfDay". --- Config: spark.sql.session.timeZone=UTC --- Config: spark.sql.ansi.enabled=true --- Config: spark.comet.exec.scalaUDF.codegen.enabled=true --- MaxSparkVersion: 3.4 - --- month out of range -query expect_error(MonthOfYear) -SELECT make_timestamp(2024, 13, 1, 0, 0, 0) - --- day out of range -query expect_error(Invalid date) -SELECT make_timestamp(2024, 2, 30, 0, 0, 0) - --- hour out of range -query expect_error(HourOfDay) -SELECT make_timestamp(2024, 6, 15, 25, 0, 0) - --- Sentinel: a valid input must still execute on the Comet codegen path. If the dispatcher --- silently rejects MakeTimestamp at runtime, the error queries above pass vacuously --- (Spark and fallback throw identical messages). This non-error query uses --- `checkSparkAnswerAndOperator` which fails if Comet did not run the expression natively. -query -SELECT make_timestamp(2024, 6, 15, 10, 30, 45.0)