From b05afe0f259f4830051088630b976e95c9a01ee3 Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Tue, 11 Nov 2025 10:12:25 +0400
Subject: [PATCH] `map` in `+`/`-` for `Array`s (#59961)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`map` is a simpler operation than broadcasting and uses linear indexing for
`Array`s. This often improves performance (occasionally enabling
vectorization) and improves TTFX (time to first execution) in common cases.
It also automatically returns the correct result for 0-D arrays, unlike
broadcasting, which returns a scalar.

Performance:
```julia
julia> A = ones(3,3);

julia> @btime $A + $A;
  44.622 ns (2 allocations: 144 bytes) # v"1.13.0-DEV.1387"
  29.047 ns (2 allocations: 144 bytes) # this PR

julia> A = ones(3,3000);

julia> @btime $A + $A;
  10.095 μs (3 allocations: 70.40 KiB) # v"1.13.0-DEV.1387"
  4.787 μs (3 allocations: 70.40 KiB) # this PR

julia> @btime A + B + C + D + E + F setup=(A = rand(200,200); B = rand(200,200); C = rand(200,200); D = rand(200,200); E = rand(200,200); F = rand(200,200));
  93.910 μs (3 allocations: 312.59 KiB) # v"1.13.0-DEV.1387"
  64.813 μs (9 allocations: 312.77 KiB) # this PR
```
Similarly for `-`.

TTFX:
```julia
julia> A = ones(3,3);

julia> @time A + A;
  0.174090 seconds (303.47 k allocations: 14.575 MiB, 99.98% compilation time) # v"1.13.0-DEV.1387"
  0.072748 seconds (220.27 k allocations: 11.139 MiB, 99.95% compilation time) # this PR
```

These measurements were taken on:
```julia
julia> versioninfo()
Julia Version 1.13.0-DEV.1388
Commit c5f492781e (2025-10-27 11:44 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × Intel(R) Core(TM) i5-10310U CPU @ 1.70GHz
  WORD_SIZE: 64
  LLVM: libLLVM-20.1.8 (ORCJIT, skylake)
  GC: Built with stock GC
Threads: 1 default, 1 interactive, 1 GC (on 8 virtual cores)
Environment:
  LD_LIBRARY_PATH = /usr/local/lib:
  JULIA_EDITOR = subl
```
---
 base/arraymath.jl | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/base/arraymath.jl b/base/arraymath.jl
index 6488b0ae68505..4dcaa3eac8e30 100644
--- a/base/arraymath.jl
+++ b/base/arraymath.jl
@@ -2,10 +2,29 @@
 
 ## Binary arithmetic operators ##
 
+# Generic fallback for arbitrary argument types: forward unchanged to
+# `broadcast_preserving_zero_d`; the `Array`/`Number` methods below use `map` instead.
+_broadcast_preserving_zero_d(f, A, B) = broadcast_preserving_zero_d(f, A, B)
+
+# `Array` pairs go through `map` rather than broadcasting. `map` indexes linearly while
+# broadcasting indexes Cartesian-wise, and the linear loop is what lets wide matrices
+# with few rows vectorize; see
+# https://github.com/JuliaLang/julia/issues/47873#issuecomment-1352472461
+# For 0-d `Array`s, `map` also returns an array rather than a scalar.
+_broadcast_preserving_zero_d(f, A::Array, B::Array) = map(f, A, B)
+
+# Array-scalar case: bind the scalar `B` as the second argument of `f` with `Fix2`,
+# then `map` the resulting one-argument function over `A`.
+_broadcast_preserving_zero_d(f, A::Array, B::Number) = map(Fix2(f, B), A)
+
+# Scalar-array case: bind the scalar `A` as the first argument of `f` with `Fix1`,
+# then `map` the resulting one-argument function over `B`.
+_broadcast_preserving_zero_d(f, A::Number, B::Array) = map(Fix1(f, A), B)
+
 for f in (:+, :-)
     @eval function ($f)(A::AbstractArray, B::AbstractArray)
         promote_shape(A, B) # check size compatibility
-        broadcast_preserving_zero_d($f, A, B)
+        _broadcast_preserving_zero_d($f, A, B) # `map` for two `Array`s, else broadcast
     end
 end
 
@@ -13,15 +32,15 @@ function +(A::Array, Bs::Array...)
     for B in Bs
         promote_shape(A, B) # check size compatibility
     end
-    broadcast_preserving_zero_d(+, A, Bs...)
+    map(+, A, Bs...)
 end
 
 for f in (:/, :\, :*)
     if f !== :/
-        @eval ($f)(A::Number, B::AbstractArray) = broadcast_preserving_zero_d($f, A, B)
+        @eval ($f)(A::Number, B::AbstractArray) = _broadcast_preserving_zero_d($f, A, B) # `Array` B: `Fix1` + `map`
     end
     if f !== :\
-        @eval ($f)(A::AbstractArray, B::Number) = broadcast_preserving_zero_d($f, A, B)
+        @eval ($f)(A::AbstractArray, B::Number) = _broadcast_preserving_zero_d($f, A, B) # `Array` A: `Fix2` + `map`
     end
 end