Rdatatable · ben-schwen · Jan 15, 2026 · Oct 28, 2025 · Oct 29, 2025 · Oct 30, 2025
@@ -18,6 +18,14 @@
 
 2. `[,showProgress=]` and `options(datatable.showProgress)` now accept an integer to control the progress bar update interval in seconds, allowing finer control over progress reporting frequency; `TRUE` uses the default 3-second interval, [#6514](https://github.com/Rdatatable/data.table/issues/6514). Thanks @ethanbsmith for the report and @ben-schwen for the PR.
 
+3. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation.
+
+    This rewrite also introduces several new optimizations:
+      - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336)
+      - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))`) [#5032](https://github.com/Rdatatable/data.table/issues/5032)
+      - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934)
+      - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815)
+
 ### BUG FIXES
 
 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix.

@@ -380,7 +380,39 @@ utf8_check = function(test_str) identical(test_str, enc2native(test_str))
 test = function(num, x, y=TRUE,
                 error=NULL, warning=NULL, message=NULL, output=NULL, notOutput=NULL, ignore.warning=NULL,
                 options=NULL, env=NULL,
-                context=NULL, requires_utf8=FALSE) {
+                context=NULL, requires_utf8=FALSE, optimize=NULL) {
+  # if optimization is provided, test across multiple optimization levels
+  if (!is.null(optimize)) {
+    if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L))
+      stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) # nocov
+    cl = match.call()
+    if ("datatable.optimize" %in% names(cl$options))
+      stopf("Trying to set optimization level through both options= and optimize=") # nocov
+    cl$optimize = NULL  # Remove optimization levels from the recursive call
+
+    # Check if y was explicitly provided (not just the default)
+    y_provided = !missing(y)
+    vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment())
+    vector_params = vector_params[lengths(vector_params) > 0L]
+    compare = !y_provided && length(optimize)>1L && !length(vector_params)
+    # When optimize has multiple levels, vector params are recycled across levels.
+    if (length(optimize) > 1L && "warning" %in% names(vector_params) && length(vector_params$warning) > 1L)
+      warningf("warning= with multiple values is recycled across optimize levels, not treated as multiple warnings in one run")
+
+    for (i in seq_along(optimize)) {
+      cl$num = num + (i - 1L) * 1e-6
+      opt_level = list(datatable.optimize = optimize[i])
+      cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level
+      for (param in names(vector_params)) {
+        val = vector_params[[param]]
+        cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels
+      }
+
+      if (compare && i == 1L) cl$y = eval(cl$x, parent.frame())
+      eval(cl, parent.frame()) # actual test call
+    }
+    return(invisible())
+  }
   if (!is.null(env)) {
     old = Sys.getenv(names(env), names=TRUE, unset=NA)
     to_unset = !lengths(env)

@@ -190,24 +190,14 @@ DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep=""))
 test(301.1, nrow(DT[,sum(B),by=C])==100010)
 
 # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too.
-local({
-  old = options(datatable.optimize=0L); on.exit(options(old))
-  set.seed(1)
-  DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a")
-  test(637.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a"))
-  test(637.2, key(DT[J(43L), a:=99L]), NULL)
-  setkey(DT, a)
-  test(637.3, key(DT[, a:=99L, by=a]), NULL)
-})
-local({
-  options(datatable.optimize=2L); on.exit(options(old))
-  set.seed(1)
-  DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a")
-  test(638.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a"))
-  test(638.2, key(DT[J(43L), a:=99L]), NULL)
-  setkey(DT,a)
-  test(638.3, key(DT[, a:=99L, by=a]), NULL)
-})
+set.seed(1)
+DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a")
+opt = c(0L,2L)
+test(637.1, optimize=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a"))
+test(637.2, optimize=opt, key(copy(DT)[J(43L), a:=99L]), NULL)
+setkey(DT, a)
+test(637.3, optimize=opt, key(copy(DT)[, a:=99L, by=a]), NULL)
+# tests 637.* now run at both optimization levels (via optimize=), replacing the former separate tests 637.* (level 0) and 638.* (level 2)
 
 # Test X[Y] slowdown, #2216
 # Many minutes in 1.8.2!  Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes