Untitled
unknown
csharp
5 months ago
6.9 kB
8
Indexable
/// <summary>
/// Runs the GPU culling pipeline for one instanced batch: reset the instance
/// count in the args buffer, vote on visibility, prefix-sum the votes, scan
/// the per-group totals and finally compact the surviving instances.
/// </summary>
/// <param name="info">Batch whose <c>dataBuffer</c> is culled and whose <c>argsBuffer</c> receives the surviving instance count.</param>
void Cull(InstancingInfo info)
{
    // Kernel indices follow the order of the "#pragma kernel" lines in the compute shader.
    const int VoteKernel = 0;
    const int ScanKernel = 1;
    const int ScanGroupSumsKernel = 2;
    const int CompactKernel = 3;
    const int ResetArgsKernel = 4;

    // Vote and Compact use 128 threads per group; Scan uses 64 threads that each
    // handle two elements. All three therefore cover 128 instances per group.
    const float InstancesPerGroup = 128f;

    int groupCount = Mathf.CeilToInt(info.dataBuffer.count / InstancesPerGroup);

    // ScanGroupSums is a work-efficient (Blelloch) scan, which needs a power-of-two
    // element count. The element count is the number of per-group sums written by
    // Scan, i.e. groupCount. The kernel's shared array holds 2 * 1024 entries, so a
    // padded count above 2048 (more than 262144 instances) is not supported.
    int paddedGroupCount = Mathf.NextPowerOfTwo(groupCount);

    // Reset the instance count that Compact accumulates into.
    cullingComputeShader.SetBuffer(ResetArgsKernel, "_ArgsBuffer", info.argsBuffer);
    cullingComputeShader.Dispatch(ResetArgsKernel, 1, 1, 1);

    // Vote
    Matrix4x4 vpMatrix = cullingCamera.projectionMatrix * cullingCamera.worldToCameraMatrix;
    cullingComputeShader.SetMatrix("MATRIX_VP", vpMatrix);
    cullingComputeShader.SetBuffer(VoteKernel, "_GrassDataBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(VoteKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetVector("_CameraPosition", cullingCamera.transform.position);
    cullingComputeShader.SetFloat("_Distance", 200f);
    cullingComputeShader.Dispatch(VoteKernel, groupCount, 1, 1);

    // Scan
    cullingComputeShader.SetBuffer(ScanKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetBuffer(ScanKernel, "_ScanBuffer", scanBuffer);
    cullingComputeShader.SetBuffer(ScanKernel, "_GroupSumArray", groupSumArrayBuffer);
    cullingComputeShader.Dispatch(ScanKernel, groupCount, 1, 1);

    // Scan Groups
    // The scan runs through group-shared memory, so it has to execute in exactly one
    // thread group; several groups cannot cooperate on the same scan.
    cullingComputeShader.SetInt("_NumOfGroups", paddedGroupCount);
    cullingComputeShader.SetBuffer(ScanGroupSumsKernel, "_GroupSumArrayIn", groupSumArrayBuffer);
    cullingComputeShader.SetBuffer(ScanGroupSumsKernel, "_GroupSumArrayOut", scannedGroupSumBuffer);
    cullingComputeShader.Dispatch(ScanGroupSumsKernel, 1, 1, 1);

    // Compact
    cullingComputeShader.SetBuffer(CompactKernel, "_GrassDataBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_ScanBuffer", scanBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_ArgsBuffer", info.argsBuffer);
    // NOTE(review): the compaction source and destination are the same buffer. Compact
    // threads read _GrassDataBuffer[i] and write _CulledGrassOutputBuffer[j] with no
    // ordering between them, so an in-place compaction can overwrite instances before
    // they are read, and the unculled data is lost for the next call. This should be a
    // separate output buffer that the draw call reads from; InstancingInfo is not
    // visible here, so the binding is left unchanged - TODO confirm and fix at the owner.
    cullingComputeShader.SetBuffer(CompactKernel, "_CulledGrassOutputBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_GroupSumArray", scannedGroupSumBuffer);
    cullingComputeShader.Dispatch(CompactKernel, groupCount, 1, 1);
}

// ---- Compute shader source (separate file pasted after the C# method) ----
#pragma kernel Vote
#pragma kernel Scan
#pragma kernel ScanGroupSums
#pragma kernel Compact
#pragma kernel ResetArgs

#define NUM_THREAD_GROUPS_X 64
// Per-instance record stored in the grass buffers.
struct InstanceData
{
    float4x4 TRS;
    float3 Normal;
};

RWStructuredBuffer<uint> _ArgsBuffer;                       // element [1] is the instance count accumulated by Compact
RWStructuredBuffer<InstanceData> _GrassDataBuffer;          // unculled instances
RWStructuredBuffer<uint> _VoteBuffer;                       // 1 = keep, 0 = cull
RWStructuredBuffer<uint> _ScanBuffer;                       // per-group exclusive prefix sum of the votes
RWStructuredBuffer<uint> _GroupSumArray;                    // Scan: per-group vote totals (output); Compact: scanned totals (input)
RWStructuredBuffer<uint> _GroupSumArrayIn;                  // ScanGroupSums input
RWStructuredBuffer<uint> _GroupSumArrayOut;                 // ScanGroupSums output
RWStructuredBuffer<InstanceData> _CulledGrassOutputBuffer;  // compacted instances

float4x4 MATRIX_VP;
int _NumOfGroups;

groupshared uint temp[2 * NUM_THREAD_GROUPS_X];
groupshared uint grouptemp[2 * 1024];

float _Distance;
float3 _CameraPosition;

// Writes one keep/cull vote per instance into _VoteBuffer.
[numthreads(128, 1, 1)]
void Vote(uint3 id : SV_DispatchThreadID)
{
    uint instanceCount, instanceStride;
    _GrassDataBuffer.GetDimensions(instanceCount, instanceStride);
    if (id.x >= instanceCount)
        return;

    // Fourth column of TRS; presumably the world-space translation - confirm against the code that fills the buffer.
    float4 position = float4(_GrassDataBuffer[id.x].TRS._m03_m13_m23, 1.0f);

    // Debug predicate: keep only instances whose Y is above 10.
    // MATRIX_VP, _Distance and _CameraPosition are declared but not used by this test.
    bool inView = position.y > 10;

    _VoteBuffer[id.x] = inView ? 1 : 0;
}

// From GPU Gems Chapter 39 'Parallel Prefix Sum (Scan) with CUDA'
// Exclusive prefix sum of the votes within each thread group. Every thread handles
// two elements, so one group covers 2 * NUM_THREAD_GROUPS_X votes. The group's total
// is written to _GroupSumArray[group index].
[numthreads(NUM_THREAD_GROUPS_X, 1, 1)]
void Scan(uint3 id : SV_DispatchThreadID, uint groupIndex : SV_GroupIndex, uint3 _groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
    int tid = (int) id.x;
    int groupTID = (int) groupThreadID.x;
    int offset = 1;

    temp[2 * groupTID] = _VoteBuffer[2 * tid];
    temp[2 * groupTID + 1] = _VoteBuffer[2 * tid + 1];

    int d;
    int numElements = 2 * NUM_THREAD_GROUPS_X;

    // Up-sweep: build partial sums in place.
    for (d = numElements >> 1; d > 0; d >>= 1)
    {
        GroupMemoryBarrierWithGroupSync();

        if (groupTID < d)
        {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            temp[bi] += temp[ai];
        }

        offset *= 2;
    }

    // Thread 0 performed the last up-sweep step itself, so the total is already
    // visible to it here. Save it, then clear the root to make the scan exclusive.
    if (groupTID == 0)
    {
        _GroupSumArray[_groupID.x] = temp[numElements - 1];
        temp[numElements - 1] = 0;
    }

    // Down-sweep: distribute the partial sums back down the tree.
    for (d = 1; d < numElements; d *= 2)
    {
        offset >>= 1;
        GroupMemoryBarrierWithGroupSync();

        if (groupTID < d)
        {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            uint t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }

    GroupMemoryBarrierWithGroupSync();

    _ScanBuffer[2 * tid] = temp[2 * groupTID];
    _ScanBuffer[2 * tid + 1] = temp[2 * groupTID + 1];
}

// From GPU Gems Chapter 39 'Parallel Prefix Sum (Scan) with CUDA'
// Exclusive prefix sum over the first _NumOfGroups entries of _GroupSumArrayIn.
// Requirements on the caller:
//   * _NumOfGroups must be a power of two and at most 2 * 1024 (the size of grouptemp);
//   * the scan lives in group-shared memory, so it is performed by thread group 0 only.
//     Additional groups, if dispatched, do nothing.
[numthreads(1024, 1, 1)]
void ScanGroupSums(uint3 id : SV_DispatchThreadID, uint groupIndex : SV_GroupIndex, uint3 _groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
    // Uniform across the whole group, so it is safe ahead of the barriers below.
    if (_groupID.x > 0)
        return;

    // Group-local index only: grouptemp must never be addressed with a dispatch-wide id.
    int groupTID = (int) groupThreadID.x;
    int offset = 1;

    int first = 2 * groupTID;
    int second = 2 * groupTID + 1;

    // Entries at or beyond _NumOfGroups take no part in the scan; load them as zero.
    grouptemp[first] = first < _NumOfGroups ? _GroupSumArrayIn[first] : 0;
    grouptemp[second] = second < _NumOfGroups ? _GroupSumArrayIn[second] : 0;

    int d;

    // Up-sweep.
    for (d = _NumOfGroups >> 1; d > 0; d >>= 1)
    {
        GroupMemoryBarrierWithGroupSync();

        if (groupTID < d)
        {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            grouptemp[bi] += grouptemp[ai];
        }

        offset *= 2;
    }

    // Clear the root so the result is an exclusive scan.
    if (groupTID == 0)
        grouptemp[_NumOfGroups - 1] = 0;

    // Down-sweep.
    for (d = 1; d < _NumOfGroups; d *= 2)
    {
        offset >>= 1;
        GroupMemoryBarrierWithGroupSync();

        if (groupTID < d)
        {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            uint t = grouptemp[ai];
            grouptemp[ai] = grouptemp[bi];
            grouptemp[bi] += t;
        }
    }

    GroupMemoryBarrierWithGroupSync();

    if (first < _NumOfGroups)
        _GroupSumArrayOut[first] = grouptemp[first];
    if (second < _NumOfGroups)
        _GroupSumArrayOut[second] = grouptemp[second];
}

// Copies every instance that was voted in to its compacted slot and counts it in
// _ArgsBuffer[1]. Here _GroupSumArray must hold the exclusive scan of the per-group
// totals. Reads of _GrassDataBuffer and writes to _CulledGrassOutputBuffer are not
// ordered against each other, so the two must be bound to different buffers.
[numthreads(128, 1, 1)]
void Compact(uint3 id : SV_DispatchThreadID, uint groupIndex : SV_GroupIndex, uint3 _groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
    uint tid = id.x;

    uint instanceCount, instanceStride;
    _GrassDataBuffer.GetDimensions(instanceCount, instanceStride);
    if (tid >= instanceCount)
        return;

    // With 128 threads per group this equals id.x / 128.
    uint groupID = _groupID.x;

    // Exclusive scan: element g is already the number of kept instances in groups
    // 0..g-1 (and element 0 is 0), so it is the base slot for this group.
    uint groupSum = _GroupSumArray[groupID];

    bool inCamera = _VoteBuffer[tid] == 1;

    if (inCamera)
    {
        InterlockedAdd(_ArgsBuffer[1], 1);
        _CulledGrassOutputBuffer[_ScanBuffer[tid] + groupSum] = _GrassDataBuffer[tid];
    }
}

// Resets the instance count in the args buffer before a new culling pass.
[numthreads(1, 1, 1)]
void ResetArgs(uint3 id : SV_DispatchThreadID)
{
    _ArgsBuffer[1] = (uint)0;
}
Editor is loading...
Leave a Comment