Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions zstd/zstdgpu/Shaders/ZstdGpuComputePrefixSum.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
// Root constants for the prefix-sum shader, bound as 'Constants' at register b0.
// The number of 32-bit fields here must match 'num32BitConstants' in the
// RootSignature attribute on 'main' (two fields -> num32BitConstants=2).
struct Consts
{
// NOTE(review): usage is not visible in this excerpt -- presumably the number of
// elements to run the prefix sum over; confirm against the body of 'main'.
uint32_t elemToPrefixCount;
// Divisor handed to ZSTDGPU_TG_COUNT together with the per-element stream count
// to derive 'groupCount' in 'main'.
// NOTE(review): since it divides a *stream* count, this looks like "literal streams
// per threadgroup" rather than literals per group -- confirm, and confirm the host
// sets it to match the decompress-literals shader variant being dispatched.
uint32_t literalsPerGroup;
};

ConstantBuffer<Consts> Constants : register(b0);
Expand All @@ -37,7 +38,7 @@ RWStructuredBuffer<uint32_t> ZstdLitGroupCountToPrefixLookback : register(u

RWStructuredBuffer<uint32_t> ZstdCounters : register(u4);

[RootSignature("UAV(u0), UAV(u1), UAV(u2), UAV(u3), UAV(u4), RootConstants(b0, num32BitConstants=1)")]
[RootSignature("UAV(u0), UAV(u1), UAV(u2), UAV(u3), UAV(u4), RootConstants(b0, num32BitConstants=2)")]
[numthreads(kzstdgpu_TgSizeX_PrefixSum_LiteralCount, 1, 1)]
void main(uint i : SV_DispatchThreadId)
{
Expand All @@ -51,7 +52,7 @@ void main(uint i : SV_DispatchThreadId)
const uint32_t lastLocalIndex = WaveActiveCountBits(true) - 1u;

const uint32_t streamCount = ZstdLitStreamCountToPrefix[i];
const uint32_t groupCount = ZSTDGPU_TG_COUNT(streamCount, kzstdgpu_TgSizeX_DecompressLiterals);
const uint32_t groupCount = ZSTDGPU_TG_COUNT(streamCount, Constants.literalsPerGroup);

const uint32_t streamPrefix = WavePrefixSum(streamCount);
const uint32_t groupPrefix = WavePrefixSum(groupCount);
Expand Down
159 changes: 159 additions & 0 deletions zstd/zstdgpu/Shaders/ZstdGpuDecompressLiterals_LdsStoreCache.hlsli
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache.hlsli
*
* A compute shader that decompresses Huffman-compressed literals using
* an LDS store cache for cooperative dword-aligned writes.
*
* The Huffman table is packed: two symbol+bitcnt pairs per dword (each pair
* is 16 bits: 8-bit symbol | 8-bit bitcnt). Decoded literals are first
* accumulated into dwords, staged in an LDS cache, and then cooperatively
* flushed to device memory via a dword-typed UAV.
*
* The following must be defined before including this file:
* 'kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache'
* -- threadgroup size, also used as
* the number of dwords cached in LDS
* per decoded literal stream.
* 'kzstdgpu_DecompressLiterals_StreamsPerGroup' -- number of literal streams processed
* per threadgroup.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// When compiling for __XBOX_SCARLETT, define __XBOX_ENABLE_WAVE32 before any other
// header is included.
// NOTE(review): presumably this requests 32-lane waves from the platform compiler -- confirm
// against the platform shader compiler documentation.
#ifdef __XBOX_SCARLETT
#define __XBOX_ENABLE_WAVE32 1
#endif

// Compile-time configuration checks: this '.hlsli' has no defaults, so the including
// '.hlsl' variant must define both macros below or compilation stops with an error.

// Threadgroup size along X; also used as the number of dwords cached in LDS per stream.
#ifndef kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache
# error 'kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache' must be defined before including this '.hlsli'
#endif

// Number of literal streams processed per threadgroup.
#ifndef kzstdgpu_DecompressLiterals_StreamsPerGroup
# error 'kzstdgpu_DecompressLiterals_StreamsPerGroup' must be defined before including this '.hlsli'
#endif

#include "../zstdgpu_shaders.h"

// LDS layout for the LdsStoreCache variant, written as an X-macro list so the same
// region list can be expanded more than once (here to declare the total size, and
// again inside 'main' after including "zstdgpu_lds_decl_base.h").
//
// Regions, in order:
//   HuffmanTable      -- 1 << (kzstdgpu_MaxCount_HuffmanWeightBits - 1) elements. 'main' stores
//                        two 16-bit symbol+bitcnt entries per dword, which is consistent with
//                        reserving half of (1 << kzstdgpu_MaxCount_HuffmanWeightBits) elements.
//   LiteralStoreCache -- kzstdgpu_DecompressLiterals_StreamsPerGroup *
//                        kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache elements, i.e. the
//                        per-stream cache size equals the threadgroup size.
//
// NOTE(review): region sizes are presumably counted in dwords (the backing array 'GS_Lds'
// is 'uint32_t') -- confirm against the ZSTDGPU_LDS_REGION definition.
#define ZSTDGPU_DECOMPRESS_LITERALS_LDS_STORE_CACHE(base, size) \
ZSTDGPU_LDS_SIZE(size) \
ZSTDGPU_LDS_BASE(base) \
ZSTDGPU_LDS_REGION(HuffmanTable, 1u << (kzstdgpu_MaxCount_HuffmanWeightBits - 1)) \
ZSTDGPU_LDS_REGION(LiteralStoreCache, kzstdgpu_DecompressLiterals_StreamsPerGroup * kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache)

// Expand the layout in "size declaration" mode.
// NOTE(review): presumably this is what defines 'kzstdgpu_DecompressLiterals_LdsStoreCache_LdsSize',
// which is used below to size the groupshared array -- confirm in "zstdgpu_lds_decl_size.h".
#include "../zstdgpu_lds_decl_size.h"
ZSTDGPU_DECOMPRESS_LITERALS_LDS_STORE_CACHE(0, DecompressLiterals_LdsStoreCache);
#include "../zstdgpu_lds_decl_undef.h"

// Root constants for the decompress-literals shader. The single 32-bit field matches
// 'RootConstants(b0, num32BitConstants=1)' in the RootSignature attribute on 'main'.
struct Consts
{
// Copied into 'srt.huffmanTableSlotCount' by 'main' and forwarded to
// 'zstdgpu_ConvertThreadgroupIdToDecompressLiteralsInputs'.
// NOTE(review): presumably the number of Huffman table slots to search -- confirm.
uint32_t huffmanTableSlotCount;
};

// Constant buffer view of the root constants above, bound at register b0.
ConstantBuffer<Consts> Constants : register(b0);

// Resource declarations, expanded in "bind" mode from the shared decompress-literals
// resource list.
// NOTE(review): presumably "zstdgpu_srt_decl_bind.h" defines the ZSTDGPU_*_DECL macros so that
// each list entry becomes a register-bound global -- confirm in that header.
#include "../zstdgpu_srt_decl_bind.h"
ZSTDGPU_DECOMPRESS_LITERALS_SRT()
// Extra dword-typed read/write buffer specific to this variant; 'main' passes it to the
// decoder as 'ZstdInOutDecompressedLiteralsAsDwords'.
// NOTE(review): the trailing '1' is presumably the UAV register index (the root signature
// exposes a root UAV at u1) -- confirm against ZSTDGPU_RW_BUFFER_DECL.
ZSTDGPU_RW_BUFFER_DECL(uint32_t, DecompressedLiteralsAsDwords, 1)
#include "../zstdgpu_srt_decl_undef.h"

// Single groupshared dword array backing every LDS region of this shader.
groupshared uint32_t GS_Lds[kzstdgpu_DecompressLiterals_LdsStoreCache_LdsSize];
// Point the LDS helper header at the array above; 'main' uses 'zstdgpu_LdsStoreU32',
// which is presumably provided by "zstdgpu_lds_hlsl.h" -- confirm.
#define ZSTDGPU_LDS GS_Lds
#include "../zstdgpu_lds_hlsl.h"

// Compute entry point: expands one Huffman decoding table into LDS (two packed entries
// per dword) and then decodes the literal streams assigned to this threadgroup through
// 'zstdgpu_DecompressHuffmanCompressedLiterals_StoreLdsCache'.
//
// Root signature: one descriptor table (9 SRVs starting at t0, 1 UAV at u0), one root
// UAV at u1, and one 32-bit root constant at b0 (see 'Consts').
//
// groupId -- SV_GroupId: index of this threadgroup within the dispatch.
// i       -- SV_GroupThreadId: index of this thread within its threadgroup
//            (0 .. kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache - 1).
[RootSignature("DescriptorTable(SRV(t0, numDescriptors=9), UAV(u0, numDescriptors=1)),UAV(u1), RootConstants(b0, num32BitConstants=1)")]
[numthreads(kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache, 1, 1)]
void main(uint groupId : SV_GroupId, uint i : SV_GroupThreadId)
{
zstdgpu_DecompressLiterals_SRT srt;

// Expand the shared resource list in "copy" mode.
// NOTE(review): presumably this assigns each bound resource to the matching 'srt' field --
// confirm in "zstdgpu_srt_decl_copy.h".
#include "../zstdgpu_srt_decl_copy.h"
ZSTDGPU_DECOMPRESS_LITERALS_SRT()
#include "../zstdgpu_srt_decl_undef.h"
// The slot count arrives as a root constant rather than through the resource list.
srt.huffmanTableSlotCount = Constants.huffmanTableSlotCount;

// Skip threadgroups beyond the group count stored in the counters buffer. The test depends
// only on 'groupId', so every thread of a group takes the same branch and no thread is
// left waiting at the barrier further down.
if (groupId >= srt.inCounters[kzstdgpu_CounterIndex_DecompressLiteralsGroups])
return;

// Per-threadgroup inputs, zero-initialised and then passed to the lookup helper below.
// NOTE(review): presumably all four are out-parameters filled by the helper (Huffman table
// index, first group of that table, first literal stream of that table, stream count) --
// confirm against the helper's signature.
uint32_t htIndex = 0;
uint32_t htGroupStart = 0;
uint32_t htLiteralStart = 0;
uint32_t htLiteralCount = 0;

zstdgpu_ConvertThreadgroupIdToDecompressLiteralsInputs(
srt.inLitGroupEndPerHuffmanTable,
srt.inLitStreamEndPerHuffmanTable,
srt.huffmanTableSlotCount,
groupId,
htIndex,
htGroupStart,
htLiteralStart,
htLiteralCount
);

// Expand the LDS layout in "base" mode.
// NOTE(review): presumably this is what introduces 'GS_HuffmanTable' and
// 'GS_LiteralStoreCache' (region base offsets) used below -- confirm in
// "zstdgpu_lds_decl_base.h".
#include "../zstdgpu_lds_decl_base.h"
ZSTDGPU_DECOMPRESS_LITERALS_LDS_STORE_CACHE(0, DecompressLiterals_LdsStoreCache);
#include "../zstdgpu_lds_decl_undef.h"

// Table info word: upper 16 bits = maximum code length in bits, lower 16 bits = size of
// the code table. WaveReadLaneFirst broadcasts the first active lane's value to the wave.
const uint32_t htInfo = WaveReadLaneFirst(srt.inHuffmanTableInfo[htIndex]);
const uint32_t bitsMax = htInfo >> 16;
const uint32_t codeTableSize = htInfo & 0xffffu;
// Number of decoding states, read from this table's rank-index row at position 'bitsMax'.
const uint32_t stateCnt = WaveReadLaneFirst(srt.inHuffmanTableRankIndex[htIndex * kzstdgpu_MaxCount_HuffmanWeightRanks + bitsMax]);
// Two states share one LDS dword, so the expansion loop runs over state pairs.
// NOTE(review): an odd 'stateCnt' would leave the last state unwritten -- presumably
// 'stateCnt' is always even (e.g. a power of two); confirm.
const uint32_t statePairCnt = stateCnt >> 1u;

// Expand the Huffman table into LDS, packing two symbol+bitcnt entries per dword.
// NOTE(review): ZSTDGPU_FOR_WORK_ITEMS presumably distributes 'statePairId' in
// [0, statePairCnt) across the threads of the group using thread index 'i' and the
// threadgroup size -- confirm against the macro definition.
ZSTDGPU_FOR_WORK_ITEMS(statePairId, statePairCnt, i, kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache)
{
const uint32_t stateId0 = statePairId << 1u;
const uint32_t stateId1 = stateId0 + 1u;

// Locate the code-table entry for each state. The 0x00ffffff mask restricts the search
// key to the low 24 bits; the top 8 bits of an entry hold the symbol (extracted below).
const uint32_t symbolIndex0 = zstdgpu_BinarySearchMasked(srt.inHuffmanTableCodeAndSymbol, htIndex * kzstdgpu_MaxCount_HuffmanWeights, codeTableSize, stateId0, 0x00ffffffu);
const uint32_t symbolIndex1 = zstdgpu_BinarySearchMasked(srt.inHuffmanTableCodeAndSymbol, htIndex * kzstdgpu_MaxCount_HuffmanWeights, codeTableSize, stateId1, 0x00ffffffu);

// Locate each state within this table's rank-index row (bitsMax + 1 entries). The row's
// base offset is subtracted so the result is relative to the start of the row.
const uint32_t bitcntIndex0 = zstdgpu_BinarySearchMasked(srt.inHuffmanTableRankIndex, htIndex * kzstdgpu_MaxCount_HuffmanWeightRanks, bitsMax + 1, stateId0, 0xffffffffu)
- htIndex * kzstdgpu_MaxCount_HuffmanWeightRanks;

const uint32_t bitcntIndex1 = zstdgpu_BinarySearchMasked(srt.inHuffmanTableRankIndex, htIndex * kzstdgpu_MaxCount_HuffmanWeightRanks, bitsMax + 1, stateId1, 0xffffffffu)
- htIndex * kzstdgpu_MaxCount_HuffmanWeightRanks;

// Symbol is the top byte of the code-table entry; bit count is 'bitsMax' minus the
// row-relative rank index found above.
const uint32_t symbol0 = srt.inHuffmanTableCodeAndSymbol[symbolIndex0] >> 24;
const uint32_t bitcnt0 = bitsMax - bitcntIndex0;

const uint32_t symbol1 = srt.inHuffmanTableCodeAndSymbol[symbolIndex1] >> 24;
const uint32_t bitcnt1 = bitsMax - bitcntIndex1;

// 16-bit entry layout: symbol in bits [15:8], bit count in bits [7:0].
const uint32_t symbolAndBitcnt0 = (symbol0 << 8) | bitcnt0;
const uint32_t symbolAndBitcnt1 = (symbol1 << 8) | bitcnt1;

// Even state goes in the low half of the dword, odd state in the high half.
zstdgpu_LdsStoreU32(GS_HuffmanTable + statePairId, (symbolAndBitcnt1 << 16) | symbolAndBitcnt0);
}
// Every thread must see the complete table in LDS before decoding starts.
GroupMemoryBarrierWithGroupSync();

// Decode the literal streams of this threadgroup.
// NOTE(review): the last three arguments are presumably (threadgroup size, streams per
// group, cached dwords per stream); the first and last are deliberately the same macro
// because the cache size per stream equals the threadgroup size -- confirm the parameter
// order against the callee's declaration.
zstdgpu_DecompressHuffmanCompressedLiterals_StoreLdsCache(
srt.inCompressedData,
srt.inLitStreamRemap,
srt.inLitRefs,
srt.inoutDecompressedLiterals,
ZstdInOutDecompressedLiteralsAsDwords,
GS_HuffmanTable,
GS_LiteralStoreCache,
groupId,
i,
htGroupStart,
htLiteralStart,
htLiteralCount,
bitsMax,
kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache,
kzstdgpu_DecompressLiterals_StreamsPerGroup,
kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache
);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache128_8.hlsl
*
* LDS store cache variant: 128 threads, 8 streams per group, 128 dwords cached per stream.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// Threadgroup size in threads; the shared '.hlsli' also uses it as the number of dwords
// cached in LDS per literal stream.
#define kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache 128
// Number of literal streams processed by one threadgroup.
// NOTE(review): the prefix-sum pass's 'literalsPerGroup' root constant presumably has to
// match this value when this variant is dispatched -- confirm on the host side.
#define kzstdgpu_DecompressLiterals_StreamsPerGroup 8
// Both macros above must be defined before this include; the '.hlsli' raises #error otherwise.
#include "ZstdGpuDecompressLiterals_LdsStoreCache.hlsli"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache32_16.hlsl
*
* LDS store cache variant: 32 threads, 16 streams per group, 32 dwords cached per stream.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// Threadgroup size in threads; the shared '.hlsli' also uses it as the number of dwords
// cached in LDS per literal stream.
#define kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache 32
// Number of literal streams processed by one threadgroup.
// NOTE(review): the prefix-sum pass's 'literalsPerGroup' root constant presumably has to
// match this value when this variant is dispatched -- confirm on the host side.
#define kzstdgpu_DecompressLiterals_StreamsPerGroup 16
// Both macros above must be defined before this include; the '.hlsli' raises #error otherwise.
#include "ZstdGpuDecompressLiterals_LdsStoreCache.hlsli"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache32_32.hlsl
*
* LDS store cache variant: 32 threads, 32 streams per group, 32 dwords cached per stream.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// Threadgroup size in threads; the shared '.hlsli' also uses it as the number of dwords
// cached in LDS per literal stream.
#define kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache 32
// Number of literal streams processed by one threadgroup.
// NOTE(review): the prefix-sum pass's 'literalsPerGroup' root constant presumably has to
// match this value when this variant is dispatched -- confirm on the host side.
#define kzstdgpu_DecompressLiterals_StreamsPerGroup 32
// Both macros above must be defined before this include; the '.hlsli' raises #error otherwise.
#include "ZstdGpuDecompressLiterals_LdsStoreCache.hlsli"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache32_8.hlsl
*
* LDS store cache variant: 32 threads, 8 streams per group, 32 dwords cached per stream.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// Threadgroup size in threads; the shared '.hlsli' also uses it as the number of dwords
// cached in LDS per literal stream.
#define kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache 32
// Number of literal streams processed by one threadgroup.
// NOTE(review): the prefix-sum pass's 'literalsPerGroup' root constant presumably has to
// match this value when this variant is dispatched -- confirm on the host side.
#define kzstdgpu_DecompressLiterals_StreamsPerGroup 8
// Both macros above must be defined before this include; the '.hlsli' raises #error otherwise.
#include "ZstdGpuDecompressLiterals_LdsStoreCache.hlsli"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache64_16.hlsl
*
* LDS store cache variant: 64 threads, 16 streams per group, 64 dwords cached per stream.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// Threadgroup size in threads; the shared '.hlsli' also uses it as the number of dwords
// cached in LDS per literal stream.
#define kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache 64
// Number of literal streams processed by one threadgroup.
// NOTE(review): the prefix-sum pass's 'literalsPerGroup' root constant presumably has to
// match this value when this variant is dispatched -- confirm on the host side.
#define kzstdgpu_DecompressLiterals_StreamsPerGroup 16
// Both macros above must be defined before this include; the '.hlsli' raises #error otherwise.
#include "ZstdGpuDecompressLiterals_LdsStoreCache.hlsli"
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* ZstdGpuDecompressLiterals_LdsStoreCache64_8.hlsl
*
* LDS store cache variant: 64 threads, 8 streams per group, 64 dwords cached per stream.
*
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

// Threadgroup size in threads; the shared '.hlsli' also uses it as the number of dwords
// cached in LDS per literal stream.
#define kzstdgpu_TgSizeX_DecompressLiterals_LdsStoreCache 64
// Number of literal streams processed by one threadgroup.
// NOTE(review): the prefix-sum pass's 'literalsPerGroup' root constant presumably has to
// match this value when this variant is dispatched -- confirm on the host side.
#define kzstdgpu_DecompressLiterals_StreamsPerGroup 8
// Both macros above must be defined before this include; the '.hlsli' raises #error otherwise.
#include "ZstdGpuDecompressLiterals_LdsStoreCache.hlsli"
Loading