[L0v2] add submitted kernel vector compaction

pbalcer · pbalcer · commit b7c2fd919018 · 2025-12-09T19:28:09.000Z
L0v2 avoids internally tracking each kernel submission through an
event for lifetime management. Instead, when a kernel is submitted to
the queue, its handle is added to a vector, to be removed at the next
queue synchronization point, urQueueFinish(). This is a much more
efficient way of handling kernel tracking, since it avoids taking and
storing an event. However, if the application never synchronizes the
queue, this vector of submitted kernels will grow unbounded.

This patch avoids this problem by compacting the submitted kernel
vector whenever it grows past an adaptive size threshold, deduplicating
identical kernel handles. The larger the number of unique kernels, the
larger the vector will be.
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp
@@ -1059,8 +1059,56 @@ ur_result_t ur_command_list_manager::appendNativeCommandExp(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+void ur_command_list_manager::compactSubmittedKernels() {
+  size_t beforeSize = submittedKernels.size();
+  // Dedup pass: sorting places identical kernel handles next to each other.
+  std::sort(submittedKernels.begin(), submittedKernels.end());
+
+  // Remove all but one entry for each kernel. Every removed duplicate needs
+  // to have its refcount decremented, which the predicate below does.
+  auto newEnd = std::unique(
+      submittedKernels.begin(), submittedKernels.end(), [](auto lhs, auto rhs) {
+        if (lhs == rhs) {
+          [[maybe_unused]] const bool lastEntry = rhs->RefCount.release();
+          assert(!lastEntry); // there should be at least one entry left.
+          return true;        // duplicate.
+        }
+        return false;
+      });
+  // std::unique only moves the kept handles forward; erase the leftover tail.
+  submittedKernels.erase(newEnd, submittedKernels.end());
+
+  // Adjust compaction threshold based on the percentage of entries removed.
+  size_t removed = beforeSize - submittedKernels.size();
+  size_t removedPct = beforeSize > 0 ? (removed * 100) / beforeSize : 0;
+  if (removedPct > 75) {
+    // Mostly duplicates: halve the threshold, but not below the default.
+    compactionThreshold = std::max<std::size_t>(
+        SUBMITTED_KERNELS_DEFAULT_THRESHOLD, compactionThreshold / 2);
+  } else if (removedPct < 10 &&
+             compactionThreshold < SUBMITTED_KERNELS_MAX_THRESHOLD) {
+    // Increase the threshold if we removed very few entries. This means
+    // there are many unique kernels, and we need to allow the vector to grow
+    // more.
+    compactionThreshold *= 2;
+  }
+}
+
 void ur_command_list_manager::recordSubmittedKernel(
     ur_kernel_handle_t hKernel) {
+  bool isDuplicate = std::any_of(
+      submittedKernels.end() -
+          std::min(SUBMITTED_KERNELS_DUPE_CHECK_DEPTH, submittedKernels.size()),
+      submittedKernels.end(), [hKernel](auto k) { return k == hKernel; });
+  // Only the most recent entries (at most the dupe-check depth) were scanned.
+  if (isDuplicate) {
+    return;
+  }
+  // Deduplicate the whole vector once it has outgrown the current threshold.
+  if (submittedKernels.size() > compactionThreshold) {
+    compactSubmittedKernels();
+  }
+  // Track the kernel; each stored entry is paired with one retain().
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.retain();
 }
diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp
@@ -45,6 +45,24 @@ struct wait_list_view {
   }
 };
 
+// When recording submitted kernels, we only care about unique kernels. It's not
+// important whether the kernel has been submitted to the queue just once or
+// dozens of times. The number of unique kernels should be fairly low.
+// So, in order to reduce the number of entries in the submitted kernels vector,
+// we do a lookback at 4 previous entries (to try to keep within a cacheline),
+// and don't record the kernel again if it is already among them.
+static const size_t SUBMITTED_KERNELS_DUPE_CHECK_DEPTH = 4;
+
+// In scenarios where queue synchronization happens rarely, the submitted kernel
+// vector can grow unbounded. To avoid that, once the vector exceeds a threshold
+// (starting at, and never lowered below, this value) duplicates are eliminated.
+static const size_t SUBMITTED_KERNELS_DEFAULT_THRESHOLD = 128;
+
+// Upper limit for the threshold. If we reach this many unique kernels, the
+// application is probably doing something incorrectly. The adapter will still
+// function, just that compaction will happen more frequently.
+static const size_t SUBMITTED_KERNELS_MAX_THRESHOLD = 65536;
+
 struct ur_command_list_manager {
   ur_command_list_manager(ur_context_handle_t context,
                           ur_device_handle_t device,
@@ -254,6 +272,7 @@ struct ur_command_list_manager {
       ur_command_t callerCommand);
 
   void recordSubmittedKernel(ur_kernel_handle_t hKernel);
+  void compactSubmittedKernels();
 
   ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent,
                                    ur_command_t commandType);
@@ -299,6 +318,8 @@ struct ur_command_list_manager {
   v2::raii::ur_device_handle_t hDevice;
 
   std::vector<ur_kernel_handle_t> submittedKernels;
+  std::size_t compactionThreshold = SUBMITTED_KERNELS_DEFAULT_THRESHOLD;
+
   v2::raii::command_list_unique_handle zeCommandList;
   std::vector<ze_event_handle_t> waitList;
 };