
Performance comparison against KernelAbstractions.jl #50

@mtagliazucchi

Description

Hi! I'm new to Julia and I'm trying to port some Python code to Julia. My original code is written in JAX, and to mimic JAX's heterogeneous computing capabilities (CPU+GPU), I'm using AcceleratedKernels.jl and KernelAbstractions.jl.

However, I'm seeing a performance difference between the two packages.
In particular, I'm writing a simple 1-dimensional linear interpolator (for equally spaced points).
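
For reference, this is the scalar computation both versions are meant to implement. A plain-Julia sketch of the logic (it mirrors the clamping used in the kernels below; interp1_ref is just an illustrative name, not part of my actual code):

function interp1_ref(x, x_train::AbstractVector, y_train::AbstractVector,
                     fill_left, fill_right)
  x_first, x_last = x_train[begin], x_train[end]
  dx = (x_last - x_first) / (length(x_train) - 1) # uniform grid spacing
  x < x_first && return fill_left
  x > x_last && return fill_right
  t = (x - x_first) / dx                          # fractional offset from the grid start
  lo = clamp(floor(Int, t), 1, length(x_train))   # same clamping as in the kernels below
  hi = clamp(ceil(Int, t), 1, length(x_train))
  lo == hi && return y_train[lo]
  x0, x1 = x_train[lo], x_train[hi]
  y0, y1 = y_train[lo], y_train[hi]
  return y0 + (y1 - y0) * (x - x0) / (x1 - x0)
end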

These are the two functions I wrote using AK and KA:

using CUDA
using KernelAbstractions
import AcceleratedKernels as AK
using BenchmarkTools

# KernelAbstractions

@kernel function linear_interp_kernel!(
  results, x_test, x_train, y_train, 
  x_train_first, x_train_last, x_train_step, 
  x_train_size, fill_value_left, fill_value_right)
  
  idx = @index(Global)
  x = x_test[idx]
  
  # Handle edge cases
  if x < x_train_first
    results[idx] = fill_value_left
  elseif x > x_train_last
    results[idx] = fill_value_right
  else
    # Find the indices
    idx_train = (x - x_train_first) / x_train_step
    lower_idx = clamp(floor(Int, idx_train), 1, x_train_size)
    upper_idx = clamp(ceil(Int, idx_train), 1, x_train_size)
    
    # Make sure we don't go out of bounds
    if lower_idx == upper_idx
      results[idx] = y_train[lower_idx]
    else
      # Linear interpolation
      x0 = x_train[lower_idx]
      x1 = x_train[upper_idx]
      y0 = y_train[lower_idx]
      y1 = y_train[upper_idx]
      results[idx] = y0 + (y1 - y0) * (x - x0) / (x1 - x0)
    end
  end
end

function linear_interp_ka(
  x_test::AbstractVecOrMat{T},
  x_train::AbstractVector{T},
  y_train::AbstractVector{T},
  fill_value_left::T,
  fill_value_right::T) where {T <: Real}

  results = similar(x_test)
  x_train_size = length(x_train)
  x_train_first = CUDA.@allowscalar x_train[begin]
  x_train_last = CUDA.@allowscalar x_train[end]
  x_train_step = (x_train_last - x_train_first)/(x_train_size-1)

  backend = get_backend(x_test)
  kernel! = linear_interp_kernel!(backend)
  kernel!(
    results, x_test, x_train, y_train,
    x_train_first, x_train_last, x_train_step,
    x_train_size, fill_value_left, fill_value_right,
    ndrange=length(x_test)
  )
  synchronize(backend)
  return results
end

# AcceleratedKernels

function linear_interp_ak(
  x_test::AbstractVecOrMat{T},
  x_train::AbstractVector{T},
  y_train::AbstractVector{T},
  fill_value_left::T,
  fill_value_right::T) where {T <: Real}

  # Allocate results and precompute useful quantities
  results = similar(x_test)
  x_train_size = length(x_train)
  x_train_first = CUDA.@allowscalar x_train[begin] # x_train[begin] alone gives a scalar-indexing error here
  x_train_last = CUDA.@allowscalar x_train[end] # CUDA.@allowscalar x_train_last = x_train[end] gives an error here
  x_train_step = (x_train_last - x_train_first)/(x_train_size-1)

  # "Kernel" loop
  AK.foreachindex(x_test) do i
    # Handle edge cases
    if x_test[i] < x_train_first
      results[i] = fill_value_left
    elseif x_test[i] > x_train_last
      results[i] = fill_value_right
    else
      # Find the indices
      idx = (x_test[i] - x_train_first) / x_train_step 
      lower_idx = clamp(floor(Int, idx), 1, x_train_size) 
      upper_idx = clamp(ceil(Int, idx), 1, x_train_size)
      # Make sure we don't go out of bounds
      if lower_idx==upper_idx 
        results[i] = y_train[lower_idx]
      else
        # Linear interpolation
        x0 = x_train[lower_idx]
        x1 = x_train[upper_idx]
        y0 = y_train[lower_idx]
        y1 = y_train[upper_idx]
        results[i] = y0 + (y1 - y0) * (x_test[i] - x0) / (x1 - x0)
      end
    end
  end
  return results
end

# Test
train_size = 300
test_size = 5000
batch_size = 500
x_train = collect(range(0.01, 3.15, train_size)) |> CuArray
y_train = sin.(x_train)
x_test = range(0., 3.14, test_size) |> collect
x_test_matrix = repeat(x_test', batch_size) |> CuArray

On the GPU provided by Google Colab (free tier), the KA version benchmarks as follows:

# Warmup and benchmark
ka_res = linear_interp_ka(x_test_matrix, x_train, y_train, 0.0, 0.0)
@benchmark @CUDA.sync linear_interp_ka($x_test_matrix, $x_train, $y_train, 0.0, 0.0)

BenchmarkTools.Trial: 4924 samples with 1 evaluation per sample.
 Range (min … max):  950.619 μs …  21.841 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     972.237 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):     1.008 ms ± 522.756 μs  ┊ GC (mean ± σ):  0.68% ± 3.26%

     ▁▅▆▆█▆▆▃▂▁                                                  
  ▂▂▄███████████▆▆▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▂▁▂▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂ ▃
  951 μs           Histogram: frequency by time          1.1 ms <

 Memory estimate: 2.72 KiB, allocs estimate: 112.

while the AK version performs worse:

# Warmup and benchmark
ak_res = linear_interp_ak(x_test_matrix, x_train, y_train, 0.0, 0.0)
@benchmark @CUDA.sync linear_interp_ak($x_test_matrix, $x_train, $y_train, 0.0, 0.0)

BenchmarkTools.Trial: 1308 samples with 1 evaluation per sample.
 Range (min … max):  3.295 ms … 22.485 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.339 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.813 ms ±  1.925 ms  ┊ GC (mean ± σ):  0.17% ± 2.78%

  █▁                  ▁                                       
  ██▄▅▅▅▅▅▄▅▁▅▁▃▆▃▁▅▃▅█▃▄▁▁▆▃▄▃▃▃▁▃▁▄▃▄▃▁▁▁▁▁▁▃▃▃▁▁▁▁▃▄▁▁▃▁▄ ▇
  3.29 ms      Histogram: log(frequency) by time     13.9 ms <

 Memory estimate: 4.14 KiB, allocs estimate: 117.

The results are identical: ka_res == ak_res returns true.
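
One knob I thought of trying on the AK side is the GPU launch configuration: if I'm reading the AK docs correctly, AK.foreachindex accepts a block_size keyword (default 256). A sketch of a variant that exposes it (linear_interp_ak_blk and the value 512 are just placeholders I haven't tuned; it assumes the packages loaded above):

# Same loop body as linear_interp_ak, only exposing the GPU block size.
function linear_interp_ak_blk(x_test, x_train, y_train,
                              fill_value_left, fill_value_right; block_size=512)
  results = similar(x_test)
  x_train_size = length(x_train)
  x_train_first = CUDA.@allowscalar x_train[begin]
  x_train_last = CUDA.@allowscalar x_train[end]
  x_train_step = (x_train_last - x_train_first) / (x_train_size - 1)

  # block_size: GPU launch-configuration keyword of AK.foreachindex (if I read the docs right)
  AK.foreachindex(x_test; block_size=block_size) do i
    x = x_test[i]
    if x < x_train_first
      results[i] = fill_value_left
    elseif x > x_train_last
      results[i] = fill_value_right
    else
      idx = (x - x_train_first) / x_train_step
      lower_idx = clamp(floor(Int, idx), 1, x_train_size)
      upper_idx = clamp(ceil(Int, idx), 1, x_train_size)
      if lower_idx == upper_idx
        results[i] = y_train[lower_idx]
      else
        x0 = x_train[lower_idx]; x1 = x_train[upper_idx]
        y0 = y_train[lower_idx]; y1 = y_train[upper_idx]
        results[i] = y0 + (y1 - y0) * (x - x0) / (x1 - x0)
      end
    end
  end
  return results
end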

The AK code is shorter and clearer, but I don't understand why it is ~3-4x slower than the corresponding KA code.
Does anyone have an explanation of what's going on? Thanks!
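
In case it helps, I'm happy to run the integrated CUDA.jl profiler on both versions and post the output. A sketch of what I'd run (assuming a recent CUDA.jl where CUDA.@profile is available, and using the arrays defined above):

# Profile a single call of each version (both were already warmed up above)
CUDA.@profile linear_interp_ka(x_test_matrix, x_train, y_train, 0.0, 0.0)
CUDA.@profile linear_interp_ak(x_test_matrix, x_train, y_train, 0.0, 0.0)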
