diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 0f7acfce86094..7c10f0eeef74f 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -84,6 +84,24 @@ void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem)
   return mem;
 }
 
+void* GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering(void* mem)
+{
+  computePointerWithAlignment(mem, mNativeClustersBuffer, mNClusterNativeBeforeFiltering);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersInputClusterNativeAccess(void* mem)
+{
+  computePointerWithAlignment(mem, mClusterNativeAccess);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersNClusterPerSectorRow(void* mem)
+{
+  computePointerWithAlignment(mem, mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT);
+  return mem;
+}
+
 void GPUTPCDecompression::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
@@ -91,6 +109,9 @@ void GPUTPCDecompression::RegisterMemoryAllocation()
   mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU");
   mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput");
   mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput");
+  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering");
+  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering");
+  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index d9871613d8401..47c64008b176e 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -55,6 +55,9 @@ class GPUTPCDecompression : public GPUProcessor
   void* SetPointersTmpNativeBuffersGPU(void* mem);
   void* SetPointersTmpNativeBuffersOutput(void* mem);
   void* SetPointersTmpNativeBuffersInput(void* mem);
+  void* SetPointersTmpClusterNativeAccessForFiltering(void* mem);
+  void* SetPointersInputClusterNativeAccess(void* mem);
+  void* SetPointersNClusterPerSectorRow(void* mem);
 #endif
 
@@ -63,11 +66,14 @@ class GPUTPCDecompression : public GPUProcessor
 
   o2::tpc::CompressedClusters mInputGPU;
   uint32_t mMaxNativeClustersPerBuffer;
+  uint32_t mNClusterNativeBeforeFiltering;
   uint32_t* mNativeClustersIndex;
   uint32_t* mUnattachedClustersOffsets;
   uint32_t* mAttachedClustersOffsets;
+  uint32_t* mNClusterPerSectorRow;
   o2::tpc::ClusterNative* mTmpNativeClusters;
   o2::tpc::ClusterNative* mNativeClustersBuffer;
+  o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
 
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);
@@ -75,6 +81,9 @@ class GPUTPCDecompression : public GPUProcessor
   int16_t mMemoryResInputGPU = -1;
   int16_t mResourceTmpIndexes = -1;
   int16_t mResourceTmpClustersOffsets = -1;
+  int16_t mResourceTmpBufferBeforeFiltering = -1;
+  int16_t mResourceClusterNativeAccess = -1;
+  int16_t mResourceNClusterPerSectorRow = -1;
 };
 } // namespace GPUCA_NAMESPACE::gpu
 #endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 2c88ea0079a26..d7f1e2ac88368 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -43,7 +43,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread
+GPUdi() bool GPUTPCDecompressionUtilKernels::isClusterKept(const ClusterNative& cl, const GPUParam& GPUrestrict() param)
+{
+  return param.tpcCutTimeBin > 0 ? cl.getTime() < param.tpcCutTimeBin : true;
+}
+
+template <>
+GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::countFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  const GPUParam& GPUrestrict() param = processors.param;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
+  for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
+    uint32_t slice = i / GPUCA_ROW_COUNT;
+    uint32_t row = i % GPUCA_ROW_COUNT;
+    for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
+      ClusterNative cl = clusterAccess->clusters[slice][row][k];
+      if (isClusterKept(cl, param)) {
+        decompressor.mNClusterPerSectorRow[i]++;
+      }
+    }
+  }
+}
+
+template <>
+GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::storeFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  const GPUParam& GPUrestrict() param = processors.param;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
+  const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
+  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+  for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
+    uint32_t slice = i / GPUCA_ROW_COUNT;
+    uint32_t row = i % GPUCA_ROW_COUNT;
+    uint32_t count = 0;
+    for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
+      const ClusterNative cl = clusterAccess->clusters[slice][row][k];
+      if (isClusterKept(cl, param)) {
+        clusterBuffer[outputAccess->clusterOffset[slice][row] + count] = cl;
+        count++;
+      }
+    }
+  }
+}
+
 template <>
 GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::sortPerSectorRow>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
 {
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index 622e1fd984fa7..b45af622ebac8 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -59,11 +59,15 @@ class GPUTPCDecompressionUtilKernels : public GPUKernelTemplate
 {
  public:
   enum K : int32_t {
-    sortPerSectorRow = 0,
+    countFilteredClusters = 0,
+    storeFilteredClusters = 1,
+    sortPerSectorRow = 2,
   };
 
   template <int32_t iKernel = defaultKernel>
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);
+
+  GPUdi() static bool isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param);
 };
 
 } // namespace GPUCA_NAMESPACE::gpu
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index 970e1b2926853..3852d37f6facf 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -344,6 +344,12 @@
   #endif
   #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow
   #define GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow 256
+  #endif
+  #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters
+  #define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 256
+  #endif
+  #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters
+  #define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 256
   #endif
   #ifndef GPUCA_LB_GPUTPCCFDecodeZS
   #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 8ca3a83e780fb..01e4d011d08b9 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -246,6 +246,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
   mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
+  bool runFiltering = param().tpcCutTimeBin > 0;
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
@@ -253,6 +254,13 @@ int32_t GPUChainTracking::RunTPCDecompression()
   CompressedClusters& inputGPU = Decompressor.mInputGPU;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
+
+  if (cmprClsHost.nTracks && cmprClsHost.solenoidBz != -1e6f && cmprClsHost.solenoidBz != param().bzkG) {
+    throw std::runtime_error("Configured solenoid Bz does not match value used for track model encoding");
+  }
+  if (cmprClsHost.nTracks && cmprClsHost.maxTimeBin != -1e6 && cmprClsHost.maxTimeBin != param().continuousMaxTimeBin) {
+    throw std::runtime_error("Configured max time bin does not match value used for track model encoding");
+  }
 
   int32_t inputStream = 0;
   int32_t unattachedStream = mRec->NStreams() - 1;
   inputGPU = cmprClsHost;
@@ -300,12 +308,6 @@
   GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
   GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);
 
-  mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-  DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
-  Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
-  WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
   TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
   SynchronizeStream(inputStream);
   uint32_t offset = 0;
@@ -324,27 +326,83 @@ int32_t GPUChainTracking::RunTPCDecompression()
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
     GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
   }
 
-  if (doGPU) {
-    mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+  if (runFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
+    Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
+    AllocateRegisteredMemory(Decompressor.mResourceTmpBufferBeforeFiltering);
+    AllocateRegisteredMemory(Decompressor.mResourceClusterNativeAccess);
+    mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
+    mClusterNativeAccess->setOffsetPtrs();
+    *Decompressor.mClusterNativeAccess = *mClusterNativeAccess;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
+  } else { // If not filtering, directly allocate the final buffers
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+    DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
+    Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+    if (doGPU) {
+      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
+    }
+    mIOPtrs.clustersNative = mClusterNativeAccess.get();
+    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
     mClusterNativeAccess->setOffsetPtrs();
     *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
-    processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-    WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
-    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
   }
-  mIOPtrs.clustersNative = mClusterNativeAccess.get();
-  mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
-  mClusterNativeAccess->setOffsetPtrs();
   uint32_t batchSize = doGPU ? 6 : NSLICES;
   for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
     int32_t iStream = (iSlice / batchSize) % mRec->NStreams();
     runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
     uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
-    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+    if (!runFiltering) {
+      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+    }
   }
   SynchronizeGPU();
 
+  if (runFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
+    AllocateRegisteredMemory(Decompressor.mResourceNClusterPerSectorRow);
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+    runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+    TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
+    SynchronizeStream(unattachedStream);
+    uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+    for (uint32_t i = 0; i < NSLICES; i++) {
+      for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+        mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUCA_ROW_COUNT + j];
+      }
+    }
+    if (doGPU) {
+      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
+    }
+    mIOPtrs.clustersNative = mClusterNativeAccess.get();
+    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
+    mClusterNativeAccess->setOffsetPtrs();
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
+    SynchronizeStream(unattachedStream);
+  }
   if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
     runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
     const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
@@ -357,6 +415,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
         }
       }
     }
+    SynchronizeStream(unattachedStream);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
 }
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index b0aed5aba1166..f028c6990f267 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -108,6 +108,8 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" "GPUTPCCom
 o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" "= TPCDECOMPRESSION" LB simple int32_t trackStart int32_t trackEnd)
 o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB simple int32_t sliceStart int32_t nSlices)
 o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow" "GPUTPCDecompressionKernels" LB simple)
+o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, countFilteredClusters" "GPUTPCDecompressionKernels" LB simple)
+o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, storeFilteredClusters" "GPUTPCDecompressionKernels" LB simple)
 o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" "= TPCCLUSTERFINDER" LB single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" "= TPCCLUSTERFINDER" LB single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" "= TPCCLUSTERFINDER" LB single)
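Note (illustration, not part of the patch): the filtering added here is a two-pass count-then-store scheme. countFilteredClusters tallies surviving clusters per sector/row into mNClusterPerSectorRow, the host copies those counts into ClusterNativeAccess::nClusters and lets setOffsetPtrs() derive the per-row offsets into the final linear buffer, and storeFilteredClusters then writes each surviving cluster at its new offset. Below is a minimal single-threaded C++ sketch of the same idea; the types and names (Cluster, isKept, kSectors, kRows, kCutTimeBin) are hypothetical stand-ins, not the O2 data structures.

// Standalone sketch of the count -> offsets -> store filtering scheme used above.
// All types and sizes are simplified placeholders.
#include <cstdint>
#include <numeric>
#include <vector>

struct Cluster {
  float time;
};

constexpr uint32_t kSectors = 4;     // stand-in for NSLICES
constexpr uint32_t kRows = 8;        // stand-in for GPUCA_ROW_COUNT
constexpr float kCutTimeBin = 100.f; // stand-in for param().tpcCutTimeBin

// Same shape as the isClusterKept predicate: keep everything if the cut is disabled.
bool isKept(const Cluster& cl) { return kCutTimeBin > 0 ? cl.time < kCutTimeBin : true; }

int main()
{
  // Input: clusters grouped per (sector, row), analogous to ClusterNativeAccess.
  std::vector<std::vector<Cluster>> input(kSectors * kRows);
  // ... fill input ...

  // Pass 1: count surviving clusters per sector/row (role of countFilteredClusters).
  std::vector<uint32_t> nPerSectorRow(kSectors * kRows, 0);
  for (uint32_t i = 0; i < kSectors * kRows; i++) {
    for (const Cluster& cl : input[i]) {
      if (isKept(cl)) {
        nPerSectorRow[i]++;
      }
    }
  }

  // Host step: exclusive prefix sum of the counts gives each sector/row's offset in
  // the final linear buffer (what setOffsetPtrs() derives from nClusters).
  std::vector<uint32_t> offset(kSectors * kRows, 0);
  std::exclusive_scan(nPerSectorRow.begin(), nPerSectorRow.end(), offset.begin(), 0u);
  uint32_t nClustersFinal = offset.back() + nPerSectorRow.back();

  // Pass 2: write survivors at their final offsets (role of storeFilteredClusters).
  std::vector<Cluster> output(nClustersFinal);
  for (uint32_t i = 0; i < kSectors * kRows; i++) {
    uint32_t count = 0;
    for (const Cluster& cl : input[i]) {
      if (isKept(cl)) {
        output[offset[i] + count++] = cl;
      }
    }
  }
  return 0;
}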