
Commit 0911adc

[L0v2] add submitted kernel vector compaction
L0v2 avoids internally tracking each kernel submission through an event for lifetime management. Instead, when a kernel is submitted to the queue, its handle is added to a vector and removed at the next queue synchronization point, urQueueFinish(). This is a much more efficient way of tracking kernels, since it avoids creating and storing an event per submission. However, if the application never synchronizes the queue, the vector of submitted kernels grows unbounded. This patch avoids that by compacting the submitted kernel vector whenever it grows past a dynamically adjusted threshold, deduplicating identical kernel handles; the more unique kernels an application submits, the larger the vector is allowed to grow.
1 parent 7b05a8c commit 0911adc
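The mechanism is easiest to see in isolation. The following is a minimal, self-contained sketch of the idea, not the adapter code itself: handle_t, the tracker struct, and the numeric constants stand in for ur_kernel_handle_t and the SUBMITTED_KERNELS_* values added in this commit, and reference counting is omitted.

#include <algorithm>
#include <cstddef>
#include <vector>

using handle_t = const void *; // stand-in for ur_kernel_handle_t

struct submitted_kernels_sketch {
  std::vector<handle_t> kernels;
  std::size_t threshold = 128; // illustrative starting threshold

  void record(handle_t h) {
    // Lookback: skip the handle if it already sits in the last few entries.
    std::size_t depth = std::min<std::size_t>(4, kernels.size());
    if (std::find(kernels.end() - depth, kernels.end(), h) != kernels.end())
      return;
    if (kernels.size() > threshold)
      compact();
    kernels.push_back(h);
  }

  void compact() {
    std::size_t before = kernels.size();
    std::sort(kernels.begin(), kernels.end());
    kernels.erase(std::unique(kernels.begin(), kernels.end()), kernels.end());
    std::size_t removed = before - kernels.size();
    // Adapt: shrink the threshold when compaction removes a lot, grow it
    // when the vector is mostly unique kernels and compaction barely helps.
    if (removed * 100 > before * 75)
      threshold = std::max<std::size_t>(128, threshold / 2);
    else if (removed * 100 < before * 10)
      threshold = std::min<std::size_t>(65536, threshold * 2);
  }
};

int main() {
  submitted_kernels_sketch tracker;
  int dummy[3];
  // Cycle three "kernels"; the lookback absorbs the repeats.
  for (int i = 0; i < 10000; ++i)
    tracker.record(&dummy[i % 3]);
  return tracker.kernels.size() == 3 ? 0 : 1;
}

The design keeps the hot path cheap: the lookback touches only the last few entries (roughly one cache line), and the full sort-and-unique pass runs only when the vector outgrows its adaptive threshold.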

File tree

3 files changed (+177, -0 lines)
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include <array>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>

static constexpr std::size_t kUniqueKernels = 256;
static constexpr std::size_t kConsecutiveDupSubmissions =
    5000; // same kernel over and over
static constexpr std::size_t kCyclicSubmissions =
    8000; // cycle over a small subset
static constexpr std::size_t kCyclicSubset = 16; // size of the cycled subset
static constexpr std::size_t kAllKernelsSubmissions =
    10000; // running all kernels

template <int ID> struct KernelTag;

template <int ID> static void submit_increment(sycl::queue &Q, int *accum) {
  Q.submit([&](sycl::handler &CGH) {
    CGH.single_task<KernelTag<ID>>([=]() {
      // atomic_ref to avoid data races while we spam submissions.
      sycl::atomic_ref<int, sycl::memory_order::relaxed,
                       sycl::memory_scope::device>
          ref(accum[ID]);
      ref.fetch_add(1);
    });
  });
}

using SubmitFn = void (*)(sycl::queue &, int *);

template <std::size_t... Is>
static auto make_fn_table(std::index_sequence<Is...>) {
  return std::array<SubmitFn, kUniqueKernels>{
      &submit_increment<static_cast<int>(Is)>...};
}

int main() {
  sycl::queue Q;

  int *accum = sycl::malloc_shared<int>(kUniqueKernels, Q);
  assert(accum && "USM alloc failed");
  for (std::size_t i = 0; i < kUniqueKernels; ++i)
    accum[i] = 0;

  std::vector<std::size_t> expected(kUniqueKernels, 0);

  auto fns = make_fn_table(std::make_index_sequence<kUniqueKernels>{});

  // Submit the same kernel over and over again. The submitted kernel
  // vector shouldn't grow at all, since we do a lookback over
  // a few previous kernels.
  auto runDuplicates = [&]() {
    for (std::size_t i = 0; i < kConsecutiveDupSubmissions; ++i) {
      fns[0](Q, accum);
      expected[0]++;
    }
  };

  // Run a small subset of kernels in a loop. Likely the most realistic
  // scenario. Should be mostly absorbed by the lookback duplicate search
  // and, possibly, compaction.
  auto runCyclical = [&]() {
    for (std::size_t i = 0; i < kCyclicSubmissions; ++i) {
      std::size_t id = i % kCyclicSubset;
      fns[id](Q, accum);
      expected[id]++;
    }
  };

  // Run all kernels in a loop. Should dynamically adjust the
  // threshold for submitted kernels.
  auto runAll = [&]() {
    for (std::size_t i = 0; i < kAllKernelsSubmissions; ++i) {
      std::size_t id = i % kUniqueKernels;
      fns[id](Q, accum);
      expected[id]++;
    }
  };

  // Run from a small kernel variety, to large, back to small, to test
  // dynamic threshold changes.
  runDuplicates();
  runCyclical();
  runAll();
  Q.wait(); // this clears the submitted kernels list, allowing the threshold
            // to lower.
  runCyclical();
  runDuplicates();

  Q.wait();

  bool ok = true;
  for (std::size_t i = 0; i < kUniqueKernels; ++i) {
    if (static_cast<std::size_t>(accum[i]) != expected[i]) {
      ok = false;
      std::cout << "fail: " << accum[i] << " != " << expected[i] << "\n";
    }
  }

  sycl::free(accum, Q);
  return ok ? 0 : 1;
}

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 49 additions & 0 deletions
@@ -167,10 +167,59 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
 
 void ur_queue_immediate_in_order_t::recordSubmittedKernel(
     ur_kernel_handle_t hKernel) {
+
+  bool isDuplicate = std::any_of(
+      submittedKernels.end() -
+          std::min(SUBMITTED_KERNELS_DUPE_CHECK_DEPTH, submittedKernels.size()),
+      submittedKernels.end(), [hKernel](auto k) { return k == hKernel; });
+
+  if (isDuplicate) {
+    return;
+  }
+
+  if (submittedKernels.size() > compactionThreshold) {
+    compactSubmittedKernels();
+  }
+
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.increment();
 }
 
+void ur_queue_immediate_in_order_t::compactSubmittedKernels() {
+  size_t beforeSize = submittedKernels.size();
+
+  std::sort(submittedKernels.begin(), submittedKernels.end());
+
+  // Remove all but one unique entry for each kernel. All removed entries
+  // need to have their refcounts decremented.
+  auto newEnd = std::unique(
+      submittedKernels.begin(), submittedKernels.end(), [](auto lhs, auto rhs) {
+        if (lhs == rhs) {
+          const bool lastEntry = rhs->RefCount.decrementAndTest();
+          assert(!lastEntry); // there should be at least one entry left.
+          return true;        // duplicate.
+        }
+        return false;
+      });
+
+  submittedKernels.erase(newEnd, submittedKernels.end());
+
+  // Adjust the compaction threshold.
+  size_t removed = beforeSize - submittedKernels.size();
+  size_t removedPct = beforeSize > 0 ? (removed * 100) / beforeSize : 0;
+  if (removedPct > 75) {
+    // We removed a lot of entries. Lower the threshold if possible.
+    compactionThreshold = std::max<std::size_t>(
+        SUBMITTED_KERNELS_DEFAULT_THRESHOLD, compactionThreshold / 2);
+  } else if (removedPct < 10 &&
+             compactionThreshold < SUBMITTED_KERNELS_MAX_THRESHOLD) {
+    // Increase the threshold if we removed very few entries. This means
+    // there are many unique kernels, and we need to allow the vector to
+    // grow more.
+    compactionThreshold *= 2;
+  }
+}
+
 ur_result_t ur_queue_immediate_in_order_t::queueFlush() {
   return UR_RESULT_SUCCESS;
 }

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

Lines changed: 21 additions & 0 deletions
@@ -27,6 +27,24 @@ namespace v2 {
 
 using queue_group_type = ur_device_handle_t_::queue_group_info_t::type;
 
+// When recording submitted kernels, we only care about unique kernels. It's
+// not important whether a kernel has been submitted to the queue just once or
+// dozens of times. The number of unique kernels should be fairly low.
+// So, in order to reduce the number of entries in the submitted kernels
+// vector, we do a lookback over the 4 previous entries (to try to stay within
+// a cache line), and don't record a new kernel if it is already there.
+static const size_t SUBMITTED_KERNELS_DUPE_CHECK_DEPTH = 4;
+
+// In scenarios where queue synchronization happens rarely, the submitted
+// kernel vector can grow unbounded. In order to avoid that, we go through the
+// entire vector, eliminating any duplicates.
+static const size_t SUBMITTED_KERNELS_DEFAULT_THRESHOLD = 128;
+
+// If we reach this many unique kernels, the application is probably doing
+// something incorrectly. The adapter will still function, just that compaction
+// will happen more frequently.
+static const size_t SUBMITTED_KERNELS_MAX_THRESHOLD = 65536;
+
 struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
 private:
   ur_context_handle_t hContext;

@@ -35,6 +53,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
 
   lockable<ur_command_list_manager> commandListManager;
   std::vector<ur_kernel_handle_t> submittedKernels;
+  std::size_t compactionThreshold = SUBMITTED_KERNELS_DEFAULT_THRESHOLD;
 
   wait_list_view
   getWaitListView(locked<ur_command_list_manager> &commandList,

@@ -64,6 +83,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
 
   void recordSubmittedKernel(ur_kernel_handle_t hKernel);
 
+  void compactSubmittedKernels();
+
 public:
   ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
                                 const ur_queue_properties_t *);
