Description
Hi! I'm new to Julia and I'm trying to port some Python code to Julia. My original code is written in JAX, and to mimic JAX's heterogeneous computing capabilities (CPU+GPU) I'm using AcceleratedKernels.jl and KernelAbstractions.jl.
However, I'm finding performance differences between these two packages.
In particular, I'm trying to write a simple 1-dimensional linear interpolator (for equally spaced points).
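For context, the per-element operation both versions implement is plain linear interpolation on an equally spaced grid, with constant fill values outside of it. As a scalar sketch of that logic (the helper name interp1d_point is only for illustration, it is not part of the code below):

# Scalar reference for a single query point x: the same index arithmetic the
# GPU code below uses (fractional index from the uniform spacing, then linear
# interpolation between the two neighbouring grid points).
function interp1d_point(x, x_train, y_train, fill_left, fill_right)
    n = length(x_train)
    x_first, x_last = x_train[begin], x_train[end]
    step = (x_last - x_first) / (n - 1)
    x < x_first && return fill_left        # constant extrapolation on the left
    x > x_last && return fill_right        # constant extrapolation on the right
    t = (x - x_first) / step               # fractional index
    lo = clamp(floor(Int, t), 1, n)
    hi = clamp(ceil(Int, t), 1, n)
    lo == hi && return y_train[lo]
    x0, x1 = x_train[lo], x_train[hi]
    y0, y1 = y_train[lo], y_train[hi]
    return y0 + (y1 - y0) * (x - x0) / (x1 - x0)
end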
These are the two functions I wrote, first using KA and then AK:
using CUDA
using KernelAbstractions
import AcceleratedKernels as AK
using BenchmarkTools
# KernelAbstractions
@kernel function linear_interp_kernel!(
        results, x_test, x_train, y_train,
        x_train_first, x_train_last, x_train_step,
        x_train_size, fill_value_left, fill_value_right)
    idx = @index(Global)
    x = x_test[idx]
    # Handle edge cases
    if x < x_train_first
        results[idx] = fill_value_left
    elseif x > x_train_last
        results[idx] = fill_value_right
    else
        # Find the indices
        idx_train = (x - x_train_first) / x_train_step
        lower_idx = clamp(floor(Int, idx_train), 1, x_train_size)
        upper_idx = clamp(ceil(Int, idx_train), 1, x_train_size)
        # Make sure we don't go out of bounds
        if lower_idx == upper_idx
            results[idx] = y_train[lower_idx]
        else
            # Linear interpolation
            x0 = x_train[lower_idx]
            x1 = x_train[upper_idx]
            y0 = y_train[lower_idx]
            y1 = y_train[upper_idx]
            results[idx] = y0 + (y1 - y0) * (x - x0) / (x1 - x0)
        end
    end
end
function linear_interp_ka(
        x_test::AbstractVecOrMat{T},
        x_train::AbstractVector{T},
        y_train::AbstractVector{T},
        fill_value_left::T,
        fill_value_right::T) where {T <: Real}
    results = similar(x_test)
    x_train_size = length(x_train)
    x_train_first = CUDA.@allowscalar x_train[begin]
    x_train_last = CUDA.@allowscalar x_train[end]
    x_train_step = (x_train_last - x_train_first) / (x_train_size - 1)
    backend = get_backend(x_test)
    kernel! = linear_interp_kernel!(backend)
    kernel!(
        results, x_test, x_train, y_train,
        x_train_first, x_train_last, x_train_step,
        x_train_size, fill_value_left, fill_value_right,
        ndrange=length(x_test)
    )
    synchronize(backend)
    return results
end
# AcceleratedKernels
function linear_interp_ak(
        x_test::AbstractVecOrMat{T},
        x_train::AbstractVector{T},
        y_train::AbstractVector{T},
        fill_value_left::T,
        fill_value_right::T) where {T <: Real}
    # Allocate results and precompute useful quantities
    results = similar(x_test)
    x_train_size = length(x_train)
    x_train_first = CUDA.@allowscalar x_train[begin]  # x_train_first = x_train[begin] gives an error here
    x_train_last = CUDA.@allowscalar x_train[end]     # CUDA.@allowscalar x_train_last = x_train[end] gives an error here
    x_train_step = (x_train_last - x_train_first) / (x_train_size - 1)
    # "Kernel" loop
    AK.foreachindex(x_test) do i
        # Handle edge cases
        if x_test[i] < x_train_first
            results[i] = fill_value_left
        elseif x_test[i] > x_train_last
            results[i] = fill_value_right
        else
            # Find the indices
            idx = (x_test[i] - x_train_first) / x_train_step
            lower_idx = clamp(floor(Int, idx), 1, x_train_size)
            upper_idx = clamp(ceil(Int, idx), 1, x_train_size)
            # Make sure we don't go out of bounds
            if lower_idx == upper_idx
                results[i] = y_train[lower_idx]
            else
                # Linear interpolation
                x0 = x_train[lower_idx]
                x1 = x_train[upper_idx]
                y0 = y_train[lower_idx]
                y1 = y_train[upper_idx]
                results[i] = y0 + (y1 - y0) * (x_test[i] - x0) / (x1 - x0)
            end
        end
    end
    return results
end
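Since both functions only dispatch on the array types, my understanding is that the same code should also run on plain CPU Arrays, which is the JAX-like CPU+GPU behaviour I'm after. A quick usage sketch (not timed, only to show the intent):

# CPU usage sketch: get_backend picks the CPU backend for ordinary Arrays,
# and CUDA.@allowscalar is harmless when the array is not a GPU array.
x_train_cpu = collect(range(0.01, 3.15, 300))
y_train_cpu = sin.(x_train_cpu)
x_test_cpu = repeat(collect(range(0.0, 3.14, 5000))', 500)

cpu_ka = linear_interp_ka(x_test_cpu, x_train_cpu, y_train_cpu, 0.0, 0.0)
cpu_ak = linear_interp_ak(x_test_cpu, x_train_cpu, y_train_cpu, 0.0, 0.0)
cpu_ka ≈ cpu_ak    # expected to be true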
# Test
train_size = 300
test_size = 5000
batch_size = 500
x_train = collect(range(0.01, 3.15, train_size)) |> CuArray
y_train = sin.(x_train)
x_test = range(0., 3.14, test_size) |> collect
x_test_matrix = repeat(x_test', batch_size) |> CuArray
On the GPU provided by Google Colab (free tier), the KA version benchmarks as follows:
# Warmup and benchmark
ka_res = linear_interp_ka(x_test_matrix, x_train, y_train, 0.0, 0.0)
@benchmark @CUDA.sync linear_interp_ka($x_test_matrix, $x_train, $y_train, 0.0, 0.0)
BenchmarkTools.Trial: 4924 samples with 1 evaluation per sample.
Range (min … max): 950.619 μs … 21.841 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 972.237 μs ┊ GC (median): 0.00%
Time (mean ± σ): 1.008 ms ± 522.756 μs ┊ GC (mean ± σ): 0.68% ± 3.26%
▁▅▆▆█▆▆▃▂▁
▂▂▄███████████▆▆▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▂▁▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂ ▃
951 μs Histogram: frequency by time 1.1 ms <
Memory estimate: 2.72 KiB, allocs estimate: 112.
while the AK version performs noticeably worse:
# Warmup and benchmark
ak_res = linear_interp_ak(x_test_matrix, x_train, y_train, 0.0, 0.0)
@benchmark @CUDA.sync linear_interp_ak($x_test_matrix, $x_train, $y_train, 0.0, 0.0)
BenchmarkTools.Trial: 1308 samples with 1 evaluation per sample.
Range (min … max): 3.295 ms … 22.485 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 3.339 ms ┊ GC (median): 0.00%
Time (mean ± σ): 3.813 ms ± 1.925 ms ┊ GC (mean ± σ): 0.17% ± 2.78%
█▁ ▁
██▄▅▅▅▅▅▄▅▁▅▁▃▆▃▁▅▃▅█▃▄▁▁▆▃▄▃▃▃▁▃▁▄▃▄▃▁▁▁▁▁▁▃▃▃▁▁▁▁▃▄▁▁▃▁▄ ▇
3.29 ms Histogram: log(frequency) by time 13.9 ms <
Memory estimate: 4.14 KiB, allocs estimate: 117.
The results are identical: ka_res == ak_res returns true.
The AK code is shorter and clearer, but I don't understand why it is ~3-4x slower than the corresponding KA code.
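One thing I wondered about is the launch configuration: if I read the AcceleratedKernels docs correctly, AK.foreachindex takes a block_size keyword for the GPU launch, so a variant like the following could be tried (the name linear_interp_ak_bs is mine and I haven't benchmarked it):

# Hypothetical variant of linear_interp_ak with an explicit GPU block size
# (assumes AK.foreachindex accepts a block_size keyword; untested for speed).
function linear_interp_ak_bs(x_test, x_train, y_train,
                             fill_value_left, fill_value_right; block_size=256)
    results = similar(x_test)
    n = length(x_train)
    x_first = CUDA.@allowscalar x_train[begin]
    x_last = CUDA.@allowscalar x_train[end]
    step = (x_last - x_first) / (n - 1)
    AK.foreachindex(x_test; block_size=block_size) do i
        x = x_test[i]
        if x < x_first
            results[i] = fill_value_left
        elseif x > x_last
            results[i] = fill_value_right
        else
            t = (x - x_first) / step
            lo = clamp(floor(Int, t), 1, n)
            hi = clamp(ceil(Int, t), 1, n)
            if lo == hi
                results[i] = y_train[lo]
            else
                x0 = x_train[lo]; x1 = x_train[hi]
                y0 = y_train[lo]; y1 = y_train[hi]
                results[i] = y0 + (y1 - y0) * (x - x0) / (x1 - x0)
            end
        end
    end
    return results
end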
Does anyone have an explanation of what's going on? Thanks!