Untitled
unknown
csharp
a year ago
6.9 kB
20
Indexable
/// <summary>
/// Runs the GPU culling passes for one instance batch: reset the args counter,
/// vote, per-group scan, scan of the group sums, and compaction.
/// </summary>
/// <param name="info">
/// Batch whose <c>dataBuffer</c> holds the instances and whose <c>argsBuffer</c>
/// element 1 ends up counting the instances that passed the vote.
/// </param>
void Cull(InstancingInfo info) {
    // Kernel indices follow the "#pragma kernel" order in the compute shader.
    const int voteKernel = 0;
    const int scanKernel = 1;
    const int scanGroupSumsKernel = 2;
    const int compactKernel = 3;
    const int resetArgsKernel = 4;

    // Vote and Compact run 128 threads per group; Scan runs 64 threads that each
    // handle two elements. All three therefore cover 128 instances per group.
    int groupCount = Mathf.CeilToInt(info.dataBuffer.count / 128f);

    // ScanGroupSums must prefix-sum one total per Scan group. Its up-sweep /
    // down-sweep loops halve and double the element count, so the count has to
    // be a power of two. Padding entries past groupCount only influence prefix
    // sums at indices >= groupCount, which Compact never reads.
    // The kernel holds at most 2 * 1024 elements in group-shared memory, which
    // caps a batch at 2048 groups (262,144 instances).
    int groupSumCount = Mathf.NextPowerOfTwo(groupCount);

    // Reset Args
    cullingComputeShader.SetBuffer(resetArgsKernel, "_ArgsBuffer", info.argsBuffer);
    cullingComputeShader.Dispatch(resetArgsKernel, 1, 1, 1);

    // Vote
    Matrix4x4 vpMatrix = cullingCamera.projectionMatrix * cullingCamera.worldToCameraMatrix;
    cullingComputeShader.SetMatrix("MATRIX_VP", vpMatrix);
    cullingComputeShader.SetBuffer(voteKernel, "_GrassDataBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(voteKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetVector("_CameraPosition", cullingCamera.transform.position);
    cullingComputeShader.SetFloat("_Distance", 200f);
    cullingComputeShader.Dispatch(voteKernel, groupCount, 1, 1);

    // Scan
    cullingComputeShader.SetBuffer(scanKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetBuffer(scanKernel, "_ScanBuffer", scanBuffer);
    cullingComputeShader.SetBuffer(scanKernel, "_GroupSumArray", groupSumArrayBuffer);
    cullingComputeShader.Dispatch(scanKernel, groupCount, 1, 1);

    // Scan Groups
    // The scan lives in group-shared memory, so exactly one thread group may run
    // it; additional groups cannot see each other's partial sums.
    cullingComputeShader.SetInt("_NumOfGroups", groupSumCount);
    cullingComputeShader.SetBuffer(scanGroupSumsKernel, "_GroupSumArrayIn", groupSumArrayBuffer);
    cullingComputeShader.SetBuffer(scanGroupSumsKernel, "_GroupSumArrayOut", scannedGroupSumBuffer);
    cullingComputeShader.Dispatch(scanGroupSumsKernel, 1, 1, 1);

    // Compact
    // NOTE(review): "_GrassDataBuffer" and "_CulledGrassOutputBuffer" are bound to
    // the same buffer, so Compact overwrites instances that other threads may not
    // have read yet (and destroys the source data for the next frame). A separate
    // output ComputeBuffer is needed for a correct compaction; none is visible
    // from this method, so the binding is left as it was.
    cullingComputeShader.SetBuffer(compactKernel, "_GrassDataBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(compactKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetBuffer(compactKernel, "_ScanBuffer", scanBuffer);
    cullingComputeShader.SetBuffer(compactKernel, "_ArgsBuffer", info.argsBuffer);
    cullingComputeShader.SetBuffer(compactKernel, "_CulledGrassOutputBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(compactKernel, "_GroupSumArray", scannedGroupSumBuffer);
    cullingComputeShader.Dispatch(compactKernel, groupCount, 1, 1);
}
// Kernel declaration order matters: the C# Cull method dispatches these by
// numeric index (0 = Vote, 1 = Scan, 2 = ScanGroupSums, 3 = Compact, 4 = ResetArgs).
#pragma kernel Vote
#pragma kernel Scan
#pragma kernel ScanGroupSums
#pragma kernel Compact
#pragma kernel ResetArgs
// Despite the name, this is used as the number of THREADS per group of the Scan
// kernel. Each Scan thread handles two elements, so one group scans 128 elements.
#define NUM_THREAD_GROUPS_X 64
// One instance record as stored in _GrassDataBuffer / _CulledGrassOutputBuffer.
struct InstanceData
{
// Transform matrix; Vote reads its _m03_m13_m23 column as the instance position.
float4x4 TRS;
// Not read by any kernel in this file.
float3 Normal;
};
// Element [1] is zeroed by ResetArgs and incremented once per kept instance by Compact.
// NOTE(review): presumably the instance-count slot of indirect draw arguments - confirm.
RWStructuredBuffer<uint> _ArgsBuffer;
// Input instances: read by Vote (position) and Compact (whole record).
RWStructuredBuffer<InstanceData> _GrassDataBuffer;
// One entry per instance, written by Vote: 1 = keep, 0 = cull.
RWStructuredBuffer<uint> _VoteBuffer;
// Written by Scan, read by Compact: each instance's slot within its 128-element group.
RWStructuredBuffer<uint> _ScanBuffer;
// Scan writes one vote total per group here; Compact reads a group offset from it.
// The C# side binds different buffers to this name for the two kernels
// (groupSumArrayBuffer for Scan, scannedGroupSumBuffer for Compact).
RWStructuredBuffer<uint> _GroupSumArray;
// Input and output of ScanGroupSums.
RWStructuredBuffer<uint> _GroupSumArrayIn;
RWStructuredBuffer<uint> _GroupSumArrayOut;
// Destination of the instances kept by Compact.
RWStructuredBuffer<InstanceData> _CulledGrassOutputBuffer;
// Set from C#, but not read by any kernel in this file.
float4x4 MATRIX_VP;
// Number of elements ScanGroupSums scans; it drives that kernel's loop bounds.
int _NumOfGroups;
// Group-shared scratch for Scan (128 entries).
groupshared uint temp[2 * NUM_THREAD_GROUPS_X];
// Group-shared scratch for ScanGroupSums (2048 entries).
groupshared uint grouptemp[2 * 1024];
// Both set from C#, but not read by any kernel in this file.
float _Distance;
float3 _CameraPosition;
// Vote pass: one thread per instance. Writes 1 to _VoteBuffer when the instance's
// translation has y > 10 and 0 otherwise. Threads past the end of _GrassDataBuffer
// write nothing.
//
// Observed symptom (author's note): instead of removing every blade below 10 units,
// the pipeline removes roughly half of the grass on the right side of the map. The
// number of culled blades looks right, but their location is not respected.
// This kernel only records the per-instance vote; where the kept instances end up
// is decided by the Scan, ScanGroupSums and Compact kernels.
[numthreads(128, 1, 1)]
void Vote(uint3 id : SV_DispatchThreadID) {
    const uint instanceIndex = id.x;

    if (instanceIndex < _GrassDataBuffer.Length) {
        // Translation column of the instance transform.
        const float3 translation = _GrassDataBuffer[instanceIndex].TRS._m03_m13_m23;

        uint vote = 0;
        if (translation.y > 10) {
            vote = 1;
        }

        _VoteBuffer[instanceIndex] = vote;
    }
}
// From GPU Gems Chapter 39 'Parallel Prefix Sum (Scan) with CUDA'
//
// Per-group exclusive prefix sum over _VoteBuffer. Each group of
// NUM_THREAD_GROUPS_X (64) threads scans 2 * 64 = 128 consecutive votes in the
// group-shared array `temp`:
//   - _ScanBuffer[i] receives the sum of the votes that precede i inside i's group.
//   - _GroupSumArray[group] receives the total of all votes in the group.
// There is no bounds check: every thread reads and writes indices 2*tid and 2*tid+1.
// The locals `groupID` and the parameter `groupIndex` are not used.
[numthreads(NUM_THREAD_GROUPS_X, 1, 1)]
void Scan(uint3 id : SV_DISPATCHTHREADID, uint groupIndex : SV_GROUPINDEX, uint3 _groupID : SV_GROUPID, uint3 groupThreadID : SV_GROUPTHREADID) {
int tid = (int) id.x;
int groupTID = (int) groupThreadID.x;
int groupID = (int) _groupID.x;
int offset = 1;
// Each thread loads two neighbouring votes into group-shared memory.
temp[2 * groupTID] = _VoteBuffer[2 * tid];
temp[2 * groupTID + 1] = _VoteBuffer[2 * tid + 1];
int d;
int numElements = 2 * NUM_THREAD_GROUPS_X;
// Up-sweep (reduce): build partial sums in place; the number of active threads
// halves each step while the stride `offset` doubles.
for (d = numElements >> 1; d > 0; d >>= 1) {
GroupMemoryBarrierWithGroupSync();
if (groupTID < d) {
int ai = offset * (2 * groupTID + 1) - 1;
int bi = offset * (2 * groupTID + 2) - 1;
temp[bi] += temp[ai];
}
offset *= 2;
}
// After the up-sweep the last element holds the group total. Thread 0 wrote it
// in the final up-sweep step, so it can read it back without another barrier.
// Publish the total, then clear the slot to seed the exclusive down-sweep.
if (groupTID == 0) {
_GroupSumArray[_groupID.x] = temp[numElements - 1];
temp[numElements - 1] = 0;
}
// Down-sweep: walk back down, swapping and accumulating so that every slot
// ends up with the sum of all elements before it.
for (d = 1; d < numElements; d *= 2) {
offset >>= 1;
GroupMemoryBarrierWithGroupSync();
if (groupTID < d) {
int ai = offset * (2 * groupTID + 1) - 1;
int bi = offset * (2 * groupTID + 2) - 1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
// Make the final down-sweep writes visible before copying the results out.
GroupMemoryBarrierWithGroupSync();
_ScanBuffer[2 * tid] = temp[2 * groupTID];
_ScanBuffer[2 * tid + 1] = temp[2 * groupTID + 1];
}
// From GPU Gems Chapter 39 'Parallel Prefix Sum (Scan) with CUDA'
//
// Exclusive prefix sum over the first _NumOfGroups entries of _GroupSumArrayIn,
// written to _GroupSumArrayOut, so that Out[g] is the sum of In[0 .. g-1].
//
// Requirements that follow from the code below:
//   - _NumOfGroups must be a power of two (the loops halve/double the element
//     count) and at most 2048 (the size of `grouptemp`).
//   - The scan runs entirely in group-shared memory, so it is only meaningful
//     when dispatched as a single thread group.
//
// All accesses to `grouptemp` and all thread-activity tests use the group-local
// thread id. Using the global id for them is only equivalent in group 0 and
// indexes past the end of `grouptemp` in any other group.
[numthreads(1024, 1, 1)]
void ScanGroupSums(uint3 id : SV_DISPATCHTHREADID, uint groupIndex : SV_GROUPINDEX, uint3 _groupID : SV_GROUPID, uint3 groupThreadID : SV_GROUPTHREADID) {
    int tid = (int) id.x;
    int groupTID = (int) groupThreadID.x;
    int offset = 1;

    // Each thread loads two neighbouring group totals into group-shared memory.
    grouptemp[2 * groupTID] = _GroupSumArrayIn[2 * tid];
    grouptemp[2 * groupTID + 1] = _GroupSumArrayIn[2 * tid + 1];

    int d;

    // Up-sweep (reduce): build partial sums in place.
    for (d = _NumOfGroups >> 1; d > 0; d >>= 1) {
        GroupMemoryBarrierWithGroupSync();
        if (groupTID < d) {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            grouptemp[bi] += grouptemp[ai];
        }
        offset *= 2;
    }

    // Clear the last scanned element to seed the exclusive down-sweep. Thread 0
    // performed the final up-sweep write to this slot, so no barrier is needed.
    if (groupTID == 0)
        grouptemp[_NumOfGroups - 1] = 0;

    // Down-sweep: distribute the partial sums back down the tree.
    for (d = 1; d < _NumOfGroups; d *= 2) {
        offset >>= 1;
        GroupMemoryBarrierWithGroupSync();
        if (groupTID < d) {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            int t = grouptemp[ai];
            grouptemp[ai] = grouptemp[bi];
            grouptemp[bi] += t;
        }
    }

    // Make the final down-sweep writes visible before copying the results out.
    GroupMemoryBarrierWithGroupSync();
    _GroupSumArrayOut[2 * tid] = grouptemp[2 * groupTID];
    _GroupSumArrayOut[2 * tid + 1] = grouptemp[2 * groupTID + 1];
}
// Compaction pass: one thread per instance. Every instance whose vote is 1 is
// copied to _CulledGrassOutputBuffer at
//     (slot inside its group, from _ScanBuffer) + (base offset of its group),
// and the counter at _ArgsBuffer[1] is incremented.
//
// _GroupSumArray is expected to hold the EXCLUSIVE prefix sum of the per-group
// vote totals (the output of ScanGroupSums), i.e. entry g is the number of kept
// instances in groups 0 .. g-1. The base offset of group g is therefore entry g
// itself; reading entry g-1 would drop the total of the preceding group and make
// neighbouring groups overwrite each other's output.
//
// A Compact group (128 threads) lines up with a Scan group because Scan uses
// NUM_THREAD_GROUPS_X (64) threads that each handle two elements.
//
// NOTE: threads read _GrassDataBuffer[tid] and write the output with no
// synchronisation between them, so the output must not be the same buffer as
// the input.
[numthreads(128, 1, 1)]
void Compact(uint3 id : SV_DISPATCHTHREADID, uint groupIndex : SV_GROUPINDEX, uint3 _groupID : SV_GROUPID, uint3 groupThreadID : SV_GROUPTHREADID) {
    uint tid = id.x;
    uint groupID = _groupID.x;

    // Group 0 always starts at output slot 0.
    uint groupOffset = groupID > 0 ? _GroupSumArray[groupID] : 0;

    bool inCamera = _VoteBuffer[tid] == 1;
    if (inCamera) {
        InterlockedAdd(_ArgsBuffer[1], 1);
        _CulledGrassOutputBuffer[_ScanBuffer[tid] + groupOffset] = _GrassDataBuffer[tid];
    }
}
// Single-thread kernel that zeroes _ArgsBuffer[1], the counter Compact increments
// once per kept instance.
[numthreads(1, 1, 1)]
void ResetArgs(uint3 id : SV_DISPATCHTHREADID) {
    const uint emptyCount = 0u;
    _ArgsBuffer[1] = emptyCount;
}
Editor is loading...
Leave a Comment