Skip to content

Commit 2468992

Browse files
paleolimbotjonkeane
authored andcommitted
ARROW-14297 [R] smooth out integer division to better match R
This PR updates the floor division dplyr translation to better respect the input types (as determined by how this would be done in R). The main change is the output type: `integer_type_1 %/% integer_type_2` will now have the same type as `integer_type_1` and everything else has the same type as `floor(arg1 / arg2)`. As a side effect, floor division by zero is `Inf` rather than the maximum integer value (unless you try to floor divide by `0L`...see below). A few things that need some hashing out: - Floor division by `0L` results in the max integer value rather than `NA`. This is, I think, because it's how cast (even with `safe = TRUE`) to integer from `Inf`. That is perhaps a different issue than this one? - There's [some tests for floor division for arrays outside a dplyr verb](https://github.com/apache/arrow/blob/master/r/tests/testthat/test-compute-arith.R#L64-L94) that appear to be using a [completely different translation logic](https://github.com/apache/arrow/blob/master/r/R/compute.R). I didn't update those tests or that logic because it seemed like a different issue to me (maybe needs to implement the Math and/or Ops group generics or more S3 methods for the array class?). Reprex before this PR: <details> ``` r # remotes::install_github("apache/arrow/r") library(arrow, warn.conflicts = FALSE) library(dplyr, warn.conflicts = FALSE) tbl <- tibble::tibble( integers = c(1:4, NA_integer_), doubles = c(as.numeric(1:4), NA_real_) ) tbl %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0 ) %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <dbl> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> NA, NA, NA, NA, NA #> $ int_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_dbl <dbl> 0, 1, 1, 2, NA #> $ dbl_div_int <dbl> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA RecordBatch$create(!!! tbl) %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0, ) %>% collect() %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <int> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ int_div_zero_dbl <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ dbl_div_dbl <int> 0, 1, 1, 2, NA #> $ dbl_div_int <int> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ dbl_div_zero_dbl <int> 2147483647, 2147483647, 2147483647, 2147483647, NA ``` <sup>Created on 2021-11-09 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup> </details> Reprex after this PR: <details> ``` r # remotes::install_github("paleolimbot/arrow/r@r-floor-div") library(arrow, warn.conflicts = FALSE) library(dplyr, warn.conflicts = FALSE) tbl <- tibble::tibble( integers = c(1:4, NA_integer_), doubles = c(as.numeric(1:4), NA_real_) ) tbl %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0 ) %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <dbl> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> NA, NA, NA, NA, NA #> $ int_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_dbl <dbl> 0, 1, 1, 2, NA #> $ dbl_div_int <dbl> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA RecordBatch$create(!!! tbl) %>% mutate( int_div_dbl = integers %/% 2, int_div_int = integers %/% 2L, int_div_zero_int = integers %/% 0L, int_div_zero_dbl = integers %/% 0, dbl_div_dbl = doubles %/% 2, dbl_div_int = doubles %/% 2L, dbl_div_zero_int = doubles %/% 0L, dbl_div_zero_dbl = doubles %/% 0, ) %>% collect() %>% glimpse() #> Rows: 5 #> Columns: 10 #> $ integers <int> 1, 2, 3, 4, NA #> $ doubles <dbl> 1, 2, 3, 4, NA #> $ int_div_dbl <dbl> 0, 1, 1, 2, NA #> $ int_div_int <int> 0, 1, 1, 2, NA #> $ int_div_zero_int <int> 2147483647, 2147483647, 2147483647, 2147483647, NA #> $ int_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_dbl <dbl> 0, 1, 1, 2, NA #> $ dbl_div_int <dbl> 0, 1, 1, 2, NA #> $ dbl_div_zero_int <dbl> Inf, Inf, Inf, Inf, NA #> $ dbl_div_zero_dbl <dbl> Inf, Inf, Inf, Inf, NA ``` <sup>Created on 2021-11-09 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup> </details> Closes #11652 from paleolimbot/r-floor-div Authored-by: Dewey Dunnington <[email protected]> Signed-off-by: Jonathan Keane <[email protected]>
1 parent 8aad0d6 commit 2468992

File tree

4 files changed

+69
-7
lines changed

4 files changed

+69
-7
lines changed

r/R/arrow-datum.R

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,24 @@ eval_array_expression <- function(FUN,
102102
args <- map(args, ~ .$cast(float64()))
103103
} else if (FUN == "%/%") {
104104
# In R, integer division works like floor(float division)
105-
out <- eval_array_expression("/", args = args, options = options)
106-
return(out$cast(int32(), allow_float_truncate = TRUE))
105+
out <- eval_array_expression("/", args = args)
106+
107+
# integer output only for all integer input
108+
int_type_ids <- Type[toupper(INTEGER_TYPES)]
109+
numerator_is_int <- args[[1]]$type_id() %in% int_type_ids
110+
denominator_is_int <- args[[2]]$type_id() %in% int_type_ids
111+
112+
if (numerator_is_int && denominator_is_int) {
113+
out_float <- eval_array_expression(
114+
"if_else",
115+
eval_array_expression("equal", args[[2]], 0L),
116+
Scalar$create(NA_integer_),
117+
eval_array_expression("floor", out)
118+
)
119+
return(out_float$cast(args[[1]]$type))
120+
} else {
121+
return(eval_array_expression("floor", out))
122+
}
107123
} else if (FUN == "%%") {
108124
# We can't simply do {e1 - e2 * ( e1 %/% e2 )} since Ops.Array evaluates
109125
# eagerly, but we can build that up

r/R/expression.R

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,23 @@ build_expr <- function(FUN,
216216
} else if (FUN == "%/%") {
217217
# In R, integer division works like floor(float division)
218218
out <- build_expr("/", args = args)
219-
return(out$cast(int32(), allow_float_truncate = TRUE))
219+
220+
# integer output only for all integer input
221+
int_type_ids <- Type[toupper(INTEGER_TYPES)]
222+
numerator_is_int <- args[[1]]$type_id() %in% int_type_ids
223+
denominator_is_int <- args[[2]]$type_id() %in% int_type_ids
224+
225+
if (numerator_is_int && denominator_is_int) {
226+
out_float <- build_expr(
227+
"if_else",
228+
build_expr("equal", args[[2]], 0L),
229+
Scalar$create(NA_integer_),
230+
build_expr("floor", out)
231+
)
232+
return(out_float$cast(args[[1]]$type()))
233+
} else {
234+
return(build_expr("floor", out))
235+
}
220236
} else if (FUN == "%%") {
221237
return(args[[1]] - args[[2]] * (args[[1]] %/% args[[2]]))
222238
}

r/tests/testthat/test-compute-arith.R

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,22 @@ test_that("Multiplication", {
6464
test_that("Division", {
6565
a <- Array$create(c(1:4, NA_integer_))
6666
expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_)))
67-
expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
67+
expect_equal(a %/% 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
68+
expect_equal(a %/% 2, Array$create(c(0, 1, 1, 2, NA_real_)))
69+
expect_equal(a %/% 2L, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
70+
expect_equal(a %/% 0L, Array$create(rep(NA_integer_, 5)))
6871
expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_)))
69-
expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
72+
expect_equal(a %/% 2L %/% 2L, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
7073
expect_equal(a / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
7174
# TODO add tests for integer division %/% by 0
7275
# see https://issues.apache.org/jira/browse/ARROW-14297
7376

7477
b <- a$cast(float64())
7578
expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_)))
76-
expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
79+
expect_equal(b %/% 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
80+
expect_equal(b %/% 0L, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
81+
expect_equal(b %/% 2, Array$create(c(0, 1, 1, 2, NA_real_)))
82+
expect_equal(b %/% 2L, Array$create(c(0, 1, 1, 2, NA_real_)))
7783
expect_equal(b / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
7884
# TODO add tests for integer division %/% by 0
7985
# see https://issues.apache.org/jira/browse/ARROW-14297
@@ -86,7 +92,7 @@ test_that("Division", {
8692
# c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L)
8793
# c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L)
8894
# nolint end
89-
expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_)))
95+
expect_equal(b %/% 2.2, Array$create(c(0, 0, 1, 1, NA_integer_)))
9096

9197
expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_)))
9298

r/tests/testthat/test-dplyr-funcs-math.R

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,3 +307,27 @@ test_that("arith functions ", {
307307
df
308308
)
309309
})
310+
311+
test_that("floor division maintains type consistency with R", {
312+
df <- tibble(
313+
integers = c(1:4, NA_integer_),
314+
doubles = c(as.numeric(1:4), NA_real_)
315+
)
316+
317+
compare_dplyr_binding(
318+
.input %>%
319+
transmute(
320+
int_div_dbl = integers %/% 2,
321+
int_div_int = integers %/% 2L,
322+
int_div_zero_int = integers %/% 0L,
323+
int_div_zero_dbl = integers %/% 0,
324+
325+
dbl_div_dbl = doubles %/% 2,
326+
dbl_div_int = doubles %/% 2L,
327+
dbl_div_zero_int = doubles %/% 0L,
328+
dbl_div_zero_dbl = doubles %/% 0
329+
) %>%
330+
collect(),
331+
df
332+
)
333+
})

0 commit comments

Comments
 (0)