ARROW-14297 [R] smooth out integer division to better match R

paleolimbot · jonkeane · commit 24689928da8b · 2021-11-17T08:34:46.000-06:00
This PR updates the floor division dplyr translation to better respect the input types (as determined by how this would be done in R). The main change is the output type: `integer_type_1 %/% integer_type_2` will now have the same type as `integer_type_1`, and everything else will have the same type as `floor(arg1 / arg2)`. As a side effect, floor division by zero is `Inf` rather than the maximum integer value (unless you try to floor divide by `0L`; see below). A few things that need some hashing out: - Floor division by `0L` results in the max integer value rather than `NA`. This is, I think, because that is how a cast from `Inf` to integer behaves (even with `safe = TRUE`). That is perhaps a different issue than this one? - There are [some tests for floor division for arrays outside a dplyr verb](https://github.com/apache/arrow/blob/master/r/tests/testthat/test-compute-arith.R#L64-L94) that appear to be using a [completely different translation logic](https://github.com/apache/arrow/blob/master/r/R/compute.R). I didn't update those tests or that logic because it seemed like a different issue to me (maybe it needs to implement the Math and/or Ops group generics, or more S3 methods for the array class?). 
Reprex before this PR: <details> ``` r # remotes::install_github("apache/arrow/r") library(arrow, warn.conflicts = FALSE) library(dplyr, warn.conflicts = FALSE) tbl <- tibble::tibble( integers = c(1:4, NA_integer_), doubles = c(as.numeric(1:4), NA_real_) ) tbl %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0 ) %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <dbl> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> NA, NA, NA, NA, NA #> $ int_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_dbl <dbl> 0, 1, 1, 2, NA #> $ dbl_div_int <dbl> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA RecordBatch$create(!!! 
tbl) %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0, ) %>% collect() %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <int> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ int_div_zero_dbl <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ dbl_div_dbl <int> 0, 1, 1, 2, NA #> $ dbl_div_int <int> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ dbl_div_zero_dbl <int> 2147483647, 2147483647, 2147483647, 2147483647, NA ``` <sup>Created on 2021-11-09 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup> </details> Reprex after this PR: <details> ``` r # remotes::install_github("paleolimbot/arrow/r@r-floor-div") library(arrow, warn.conflicts = FALSE) library(dplyr, warn.conflicts = FALSE) tbl <- tibble::tibble( integers = c(1:4, NA_integer_), doubles = c(as.numeric(1:4), NA_real_) ) tbl %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0 ) %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <dbl> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> NA, NA, NA, NA, NA #> $ int_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_dbl <dbl> 0, 1, 1, 2, NA #> $ dbl_div_int <dbl> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA 
RecordBatch$create(!!! tbl) %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0, ) %>% collect() %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <dbl> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ int_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_dbl <dbl> 0, 1, 1, 2, NA #> $ dbl_div_int <dbl> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA ``` <sup>Created on 2021-11-09 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup> </details> Closes #11652 from paleolimbot/r-floor-div Authored-by: Dewey Dunnington <dewey@fishandwhistle.net> Signed-off-by: Jonathan Keane <jkeane@gmail.com>
diff --git a/r/R/arrow-datum.R b/r/R/arrow-datum.R
@@ -102,8 +102,24 @@ eval_array_expression <- function(FUN,
     args <- map(args, ~ .$cast(float64()))
   } else if (FUN == "%/%") {
     # In R, integer division works like floor(float division)
-    out <- eval_array_expression("/", args = args, options = options)
-    return(out$cast(int32(), allow_float_truncate = TRUE))
+    out <- eval_array_expression("/", args = args)
+
+    # integer output only for all integer input
+    int_type_ids <- Type[toupper(INTEGER_TYPES)]
+    numerator_is_int <- args[[1]]$type_id() %in% int_type_ids
+    denominator_is_int <- args[[2]]$type_id() %in% int_type_ids
+
+    if (numerator_is_int && denominator_is_int) {
+      out_float <- eval_array_expression(
+        "if_else",
+        eval_array_expression("equal", args[[2]], 0L),
+        Scalar$create(NA_integer_),
+        eval_array_expression("floor", out)
+      )
+      return(out_float$cast(args[[1]]$type))
+    } else {
+      return(eval_array_expression("floor", out))
+    }
   } else if (FUN == "%%") {
     # We can't simply do {e1 - e2 * ( e1 %/% e2 )} since Ops.Array evaluates
     # eagerly, but we can build that up
diff --git a/r/R/expression.R b/r/R/expression.R
@@ -216,7 +216,23 @@ build_expr <- function(FUN,
     } else if (FUN == "%/%") {
       # In R, integer division works like floor(float division)
       out <- build_expr("/", args = args)
-      return(out$cast(int32(), allow_float_truncate = TRUE))
+
+      # integer output only for all integer input
+      int_type_ids <- Type[toupper(INTEGER_TYPES)]
+      numerator_is_int <- args[[1]]$type_id() %in% int_type_ids
+      denominator_is_int <- args[[2]]$type_id() %in% int_type_ids
+
+      if (numerator_is_int && denominator_is_int) {
+        out_float <- build_expr(
+          "if_else",
+          build_expr("equal", args[[2]], 0L),
+          Scalar$create(NA_integer_),
+          build_expr("floor", out)
+        )
+        return(out_float$cast(args[[1]]$type()))
+      } else {
+        return(build_expr("floor", out))
+      }
     } else if (FUN == "%%") {
       return(args[[1]] - args[[2]] * (args[[1]] %/% args[[2]]))
     }
diff --git a/r/tests/testthat/test-compute-arith.R b/r/tests/testthat/test-compute-arith.R
@@ -64,16 +64,22 @@ test_that("Multiplication", {
 test_that("Division", {
   a <- Array$create(c(1:4, NA_integer_))
   expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_)))
-  expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+  expect_equal(a %/% 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+  expect_equal(a %/% 2, Array$create(c(0, 1, 1, 2, NA_real_)))
+  expect_equal(a %/% 2L, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+  expect_equal(a %/% 0L, Array$create(rep(NA_integer_, 5)))
   expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_)))
-  expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
+  expect_equal(a %/% 2L %/% 2L, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
   expect_equal(a / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
   # TODO add tests for integer division %/% by 0
   # see https://issues.apache.org/jira/browse/ARROW-14297
 
   b <- a$cast(float64())
   expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_)))
-  expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+  expect_equal(b %/% 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+  expect_equal(b %/% 0L, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+  expect_equal(b %/% 2, Array$create(c(0, 1, 1, 2, NA_real_)))
+  expect_equal(b %/% 2L, Array$create(c(0, 1, 1, 2, NA_real_)))
   expect_equal(b / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
   # TODO add tests for integer division %/% by 0
   # see https://issues.apache.org/jira/browse/ARROW-14297
@@ -86,7 +92,7 @@ test_that("Division", {
   # c(1:4) %/% 2.2             == c(0L, 0L, 1L, 1L)
   # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L)
   # nolint end
-  expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_)))
+  expect_equal(b %/% 2.2, Array$create(c(0, 0, 1, 1, NA_integer_)))
 
   expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_)))
 
diff --git a/r/tests/testthat/test-dplyr-funcs-math.R b/r/tests/testthat/test-dplyr-funcs-math.R
@@ -307,3 +307,27 @@ test_that("arith functions ", {
     df
   )
 })
+
+test_that("floor division maintains type consistency with R",  {
+  df <- tibble(
+    integers = c(1:4, NA_integer_),
+    doubles = c(as.numeric(1:4), NA_real_)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      transmute(
+        int_div_dbl = integers %/% 2,
+        int_div_int = integers %/% 2L,
+        int_div_zero_int = integers %/% 0L,
+        int_div_zero_dbl = integers %/% 0,
+
+        dbl_div_dbl = doubles %/% 2,
+        dbl_div_int = doubles %/% 2L,
+        dbl_div_zero_int = doubles %/% 0L,
+        dbl_div_zero_dbl = doubles %/% 0
+      ) %>%
+      collect(),
+    df
+  )
+})