
Commit 9b9c33d

[L0v2] add submitted kernel vector compaction
1 parent 7b05a8c commit 9b9c33d

3 files changed (+176 lines, -0 lines)
Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include <array>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

static constexpr std::size_t kUniqueKernels = 256;
static constexpr std::size_t kConsecutiveDupSubmissions =
    5000; // same kernel over and over
static constexpr std::size_t kCyclicSubmissions =
    8000; // cycle over a small subset
static constexpr std::size_t kCyclicSubset = 16; // size of the cycled subset
static constexpr std::size_t kAllKernelsSubmissions =
    10000; // run all kernels

template <int ID> struct KernelTag;

template <int ID> static void submit_increment(sycl::queue &Q, int *accum) {
  Q.submit([&](sycl::handler &CGH) {
    CGH.single_task<KernelTag<ID>>([=]() {
      // atomic_ref to avoid data races while we spam submissions.
      sycl::atomic_ref<int, sycl::memory_order::relaxed,
                       sycl::memory_scope::device>
          ref(accum[ID]);
      ref.fetch_add(1);
    });
  });
}

using SubmitFn = void (*)(sycl::queue &, int *);

template <std::size_t... Is>
static auto make_fn_table(std::index_sequence<Is...>) {
  return std::array<SubmitFn, kUniqueKernels>{
      &submit_increment<static_cast<int>(Is)>...};
}

int main() {
  sycl::queue Q;

  int *accum = sycl::malloc_shared<int>(kUniqueKernels, Q);
  assert(accum && "USM alloc failed");
  for (std::size_t i = 0; i < kUniqueKernels; ++i)
    accum[i] = 0;

  std::vector<std::size_t> expected(kUniqueKernels, 0);

  auto fns = make_fn_table(std::make_index_sequence<kUniqueKernels>{});

  // Submit the same kernel over and over again. The submitted kernel
  // vector shouldn't grow at all, since we do a lookback over a few
  // previous kernels.
  auto runDuplicates = [&]() {
    for (size_t i = 0; i < kConsecutiveDupSubmissions; ++i) {
      fns[0](Q, accum);
      expected[0]++;
    }
  };

  // Run a small subset of kernels in a loop. Likely the most realistic
  // scenario. Should be mostly absorbed by the lookback duplicate search
  // and, possibly, compaction.
  auto runCyclical = [&]() {
    for (size_t i = 0; i < kCyclicSubmissions; ++i) {
      size_t id = i % kCyclicSubset;
      fns[id](Q, accum);
      expected[id]++;
    }
  };

  // Run all kernels in a loop. Should dynamically adjust the threshold
  // for submitted kernels.
  auto runAll = [&]() {
    for (size_t i = 0; i < kAllKernelsSubmissions; ++i) {
      size_t id = i % kUniqueKernels;
      fns[id](Q, accum);
      expected[id]++;
    }
  };

  // Go from a small kernel variety, to a large one, and back to small,
  // to test dynamic threshold changes.
  runDuplicates();
  runCyclical();
  runAll();
  runCyclical();
  runDuplicates();

  Q.wait();

  bool ok = true;
  for (std::size_t i = 0; i < kUniqueKernels; ++i) {
    if (static_cast<std::size_t>(accum[i]) != expected[i]) {
      ok = false;
      std::cout << "fail: " << accum[i] << " != " << expected[i] << "\n";
    }
  }

  sycl::free(accum, Q);
  return ok ? 0 : 1;
}

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 50 additions & 0 deletions
@@ -167,10 +167,60 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
 
 void ur_queue_immediate_in_order_t::recordSubmittedKernel(
     ur_kernel_handle_t hKernel) {
+
+  bool isDuplicate = std::any_of(
+      submittedKernels.end() -
+          std::min(SUBMITTED_KERNELS_DUPE_CHECK_DEPTH, submittedKernels.size()),
+      submittedKernels.end(), [hKernel](auto k) { return k == hKernel; });
+
+  if (isDuplicate) {
+    return;
+  }
+
+  if (submittedKernels.size() > compactionThreshold) {
+    compactSubmittedKernels();
+  }
+
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.increment();
 }
 
+void ur_queue_immediate_in_order_t::compactSubmittedKernels() {
+  size_t beforeSize = submittedKernels.size();
+
+  std::sort(submittedKernels.begin(), submittedKernels.end());
+
+  // Go through the vector, only keeping a single entry for each unique kernel
+  // handle. For any duplicates, decrement the refcount.
+  size_t write = 0;
+  for (size_t read = 0; read < submittedKernels.size(); ++read) {
+    if (write == 0 || submittedKernels[read] != submittedKernels[write - 1]) {
+      submittedKernels[write++] = submittedKernels[read];
+    } else {
+      bool lastEntry = submittedKernels[read]->RefCount.decrementAndTest();
+      assert(!lastEntry);
+    }
+  }
+
+  submittedKernels.erase(submittedKernels.begin() + write,
+                         submittedKernels.end());
+
+  // Adjust the compaction threshold.
+  size_t removed = beforeSize - write;
+  size_t removedPct = beforeSize > 0 ? (removed * 100) / beforeSize : 0;
+  if (removedPct > 75) {
+    // We removed a lot of entries. Lower the threshold if possible.
+    compactionThreshold = std::max<std::size_t>(
+        SUBMITTED_KERNELS_DEFAULT_THRESHOLD, compactionThreshold / 2);
+  } else if (removedPct < 10 &&
+             compactionThreshold < SUBMITTED_KERNELS_MAX_THRESHOLD) {
+    // Increase the threshold if we removed very few entries. This means
+    // there are many unique kernels, and we need to allow the vector to grow
+    // more.
+    compactionThreshold *= 2;
+  }
+}
+
 ur_result_t ur_queue_immediate_in_order_t::queueFlush() {
   return UR_RESULT_SUCCESS;
 }
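
To see the compaction pass in isolation, here is a minimal self-contained sketch of the same sort-and-dedup logic and threshold adaptation. FakeKernel, submitted, threshold, and compact are hypothetical stand-ins for ur_kernel_handle_t, submittedKernels, compactionThreshold, and compactSubmittedKernels; the two thresholds are inlined, and the refcount is a plain int rather than the adapter's RefCount.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for ur_kernel_handle_t: only the refcount matters.
struct FakeKernel {
  int refCount = 0;
};

static std::vector<FakeKernel *> submitted;
static std::size_t threshold = 128; // SUBMITTED_KERNELS_DEFAULT_THRESHOLD

static void compact() {
  std::size_t beforeSize = submitted.size();

  // Sorting groups duplicate handles together so one pass can drop them.
  std::sort(submitted.begin(), submitted.end());

  std::size_t write = 0;
  for (std::size_t read = 0; read < submitted.size(); ++read) {
    if (write == 0 || submitted[read] != submitted[write - 1]) {
      submitted[write++] = submitted[read]; // first occurrence: keep it
    } else {
      --submitted[read]->refCount; // duplicate: release its reference
      assert(submitted[read]->refCount > 0 && "kept entry still holds a ref");
    }
  }
  submitted.erase(submitted.begin() + write, submitted.end());

  // Adapt the threshold to the workload. E.g. compacting 129 entries down to
  // 16 removes ~87% (> 75%), so the threshold halves (never below the
  // default); removing under 10% means mostly unique kernels, so it doubles.
  std::size_t removed = beforeSize - write;
  std::size_t removedPct = beforeSize > 0 ? (removed * 100) / beforeSize : 0;
  if (removedPct > 75) {
    threshold = std::max<std::size_t>(128, threshold / 2);
  } else if (removedPct < 10 && threshold < 65536) {
    threshold *= 2;
  }
}

int main() {
  // Cycle over 16 unique kernels, compacting past the threshold, the way
  // recordSubmittedKernel would.
  std::vector<FakeKernel> kernels(16);
  for (int i = 0; i < 5000; ++i) {
    FakeKernel *k = &kernels[i % 16];
    ++k->refCount;
    submitted.push_back(k);
    if (submitted.size() > threshold)
      compact();
  }
  compact();
  std::cout << "unique entries: " << submitted.size()
            << ", threshold: " << threshold << "\n"; // prints 16, 128
}

Sorting reorders the vector, which is fine here: the vector only tracks which kernels the queue still holds a reference to, so entry order carries no meaning.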

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

Lines changed: 21 additions & 0 deletions
@@ -27,6 +27,24 @@ namespace v2 {
 
 using queue_group_type = ur_device_handle_t_::queue_group_info_t::type;
 
+// When recording submitted kernels, we only care about unique kernels. It's
+// not important whether a kernel has been submitted to the queue just once
+// or dozens of times. The number of unique kernels should be fairly low. So,
+// in order to reduce the number of entries in the submitted kernels vector,
+// we do a lookback over the 4 previous entries (to try to stay within a
+// cacheline), and don't record a kernel that already appears among them.
+static const size_t SUBMITTED_KERNELS_DUPE_CHECK_DEPTH = 4;
+
+// In scenarios where queue synchronization happens rarely, the submitted
+// kernel vector can grow unbounded. To avoid that, once the vector grows past
+// this threshold, we go through it and eliminate any duplicates.
+static const size_t SUBMITTED_KERNELS_DEFAULT_THRESHOLD = 128;
+
+// If we reach this many unique kernels, the application is probably doing
+// something incorrectly. The adapter will still function; compaction will
+// just happen more frequently.
+static const size_t SUBMITTED_KERNELS_MAX_THRESHOLD = 65536;
+
 struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
 private:
   ur_context_handle_t hContext;
@@ -35,6 +53,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
 
   lockable<ur_command_list_manager> commandListManager;
   std::vector<ur_kernel_handle_t> submittedKernels;
+  std::size_t compactionThreshold = SUBMITTED_KERNELS_DEFAULT_THRESHOLD;
 
   wait_list_view
   getWaitListView(locked<ur_command_list_manager> &commandList,
@@ -64,6 +83,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
 
   void recordSubmittedKernel(ur_kernel_handle_t hKernel);
 
+  void compactSubmittedKernels();
+
 public:
   ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
                                 const ur_queue_properties_t *);
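
As a rough illustration of the lookback described by the constants above, here is a minimal sketch of the duplicate check, with plain int handles standing in for ur_kernel_handle_t (illustrative only, not adapter code):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

static constexpr std::size_t kDepth = 4; // SUBMITTED_KERNELS_DUPE_CHECK_DEPTH

// Record handle, unless it already appears among the last kDepth entries.
// This is the cheap, constant-time check done on every submission.
static void record(std::vector<int> &submitted, int handle) {
  auto lookback = static_cast<std::ptrdiff_t>(
      std::min<std::size_t>(kDepth, submitted.size()));
  bool isDuplicate = std::any_of(submitted.end() - lookback, submitted.end(),
                                 [&](int h) { return h == handle; });
  if (!isDuplicate)
    submitted.push_back(handle);
}

int main() {
  std::vector<int> submitted;
  for (int i = 0; i < 5000; ++i)
    record(submitted, 42); // same kernel over and over
  assert(submitted.size() == 1); // fully absorbed by the lookback window
  return 0;
}

Back-to-back resubmissions of the same kernel never grow the vector; duplicates that fall outside the window are cleaned up later by compaction.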
