Untitled

/// <summary>
/// Runs the GPU culling passes for one batch: reset the args counter, vote per instance,
/// prefix-sum the votes per group, prefix-sum the per-group totals, then compact the
/// surviving instances.
/// </summary>
/// <param name="info">
/// Batch to cull. <c>info.dataBuffer</c> supplies the instances and <c>info.argsBuffer</c>
/// receives the surviving-instance count in element 1.
/// </param>
void Cull(InstancingInfo info) {
    // Kernel indices, in the order the dispatches below use them.
    const int VoteKernel = 0;
    const int ScanKernel = 1;
    const int ScanGroupSumsKernel = 2;
    const int CompactKernel = 3;
    const int ResetArgsKernel = 4;

    // Vote and Compact run 128 threads per group; Scan runs 64 threads that each handle
    // two elements. All three therefore cover 128 instances per thread group.
    const int ElementsPerGroup = 128;

    int groupCount = Mathf.CeilToInt(info.dataBuffer.count / (float)ElementsPerGroup);

    // Zero the counter first so a skipped or empty batch reports no instances.
    cullingComputeShader.SetBuffer(ResetArgsKernel, "_ArgsBuffer", info.argsBuffer);
    cullingComputeShader.Dispatch(ResetArgsKernel, 1, 1, 1);

    // Dispatching zero thread groups is invalid, and there is nothing to cull anyway.
    if (groupCount <= 0) {
        return;
    }

    // The group-sum scan is a tree scan whose loops halve/double over _NumOfGroups, so the
    // length handed to it must be a power of two that covers every Scan group.
    // NOTE(review): the shader's grouptemp array holds 2 * 1024 entries, so this scheme
    // tops out at 2048 groups (262,144 instances) per batch.
    int groupScanLength = Mathf.NextPowerOfTwo(groupCount);

    Matrix4x4 vpMatrix = cullingCamera.projectionMatrix * cullingCamera.worldToCameraMatrix;

    // Vote
    cullingComputeShader.SetMatrix("MATRIX_VP", vpMatrix);
    cullingComputeShader.SetBuffer(VoteKernel, "_GrassDataBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(VoteKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetVector("_CameraPosition", cullingCamera.transform.position);
    cullingComputeShader.SetFloat("_Distance", 200f);
    cullingComputeShader.Dispatch(VoteKernel, groupCount, 1, 1);

    // Scan
    cullingComputeShader.SetBuffer(ScanKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetBuffer(ScanKernel, "_ScanBuffer", scanBuffer);
    cullingComputeShader.SetBuffer(ScanKernel, "_GroupSumArray", groupSumArrayBuffer);
    cullingComputeShader.Dispatch(ScanKernel, groupCount, 1, 1);

    // Scan Groups: the whole group-sum array is scanned inside a single thread group's
    // shared memory, so exactly one group is dispatched.
    cullingComputeShader.SetInt("_NumOfGroups", groupScanLength);
    cullingComputeShader.SetBuffer(ScanGroupSumsKernel, "_GroupSumArrayIn", groupSumArrayBuffer);
    cullingComputeShader.SetBuffer(ScanGroupSumsKernel, "_GroupSumArrayOut", scannedGroupSumBuffer);
    cullingComputeShader.Dispatch(ScanGroupSumsKernel, 1, 1, 1);

    // Compact
    cullingComputeShader.SetBuffer(CompactKernel, "_GrassDataBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_VoteBuffer", voteBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_ScanBuffer", scanBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_ArgsBuffer", info.argsBuffer);
    // NOTE(review): the compacted output is bound to the same buffer the kernel reads from.
    // Compact writes to indices at or below the reading thread's own index while other
    // threads may not have read theirs yet, and the source data is overwritten for the next
    // frame. A separate output buffer is needed; none is visible from this method, so the
    // binding is left as it was.
    cullingComputeShader.SetBuffer(CompactKernel, "_CulledGrassOutputBuffer", info.dataBuffer);
    cullingComputeShader.SetBuffer(CompactKernel, "_GroupSumArray", scannedGroupSumBuffer);
    cullingComputeShader.Dispatch(CompactKernel, groupCount, 1, 1);
}

// Kernel entry points. The host code in this file dispatches kernels by numeric index 0-4;
// presumably those indices follow this declaration order (TODO confirm, or look the
// kernels up by name on the host side).
#pragma kernel Vote
#pragma kernel Scan
#pragma kernel ScanGroupSums
#pragma kernel Compact
#pragma kernel ResetArgs

// Despite the name, this is the number of THREADS per group in the Scan kernel (it is the
// numthreads X size). Each Scan thread handles two elements, so one group scans 128 values.
#define NUM_THREAD_GROUPS_X 64

// Per-instance record stored in the grass data buffers.
struct InstanceData
{
    // Vote reads elements _m03, _m13, _m23 of this matrix as the instance position.
    float4x4 TRS;
    // Not read by any kernel in this file.
    float3 Normal;
};

// Element [1] is zeroed by ResetArgs and incremented by Compact once per kept instance.
RWStructuredBuffer<uint> _ArgsBuffer;
// Source instances (read by Vote and Compact).
RWStructuredBuffer<InstanceData> _GrassDataBuffer;
// 1 = keep, 0 = cull; written by Vote, read by Scan and Compact.
RWStructuredBuffer<uint> _VoteBuffer;
// Per-group exclusive prefix sum of _VoteBuffer; written by Scan, read by Compact.
RWStructuredBuffer<uint> _ScanBuffer;
// Written by Scan (one total per thread group) and read by Compact (as a per-group offset).
RWStructuredBuffer<uint> _GroupSumArray;
// Input and output of ScanGroupSums.
RWStructuredBuffer<uint> _GroupSumArrayIn;
RWStructuredBuffer<uint> _GroupSumArrayOut;
// Destination of the kept instances written by Compact.
RWStructuredBuffer<InstanceData> _CulledGrassOutputBuffer;

// Declared but not read by any kernel in this file.
float4x4 MATRIX_VP;
// Number of elements ScanGroupSums scans; its loops halve and double over this value.
int _NumOfGroups;
// Shared scratch for Scan: two elements per thread.
groupshared uint temp[2 * NUM_THREAD_GROUPS_X];
// Shared scratch for ScanGroupSums: two elements per thread of its 1024-thread group.
groupshared uint grouptemp[2 * 1024];
// Declared but not read by any kernel in this file.
float _Distance;
float3 _CameraPosition;

// Vote: one thread per instance; writes 1 to _VoteBuffer for instances to keep, 0 otherwise.
// Threads past the end of _GrassDataBuffer exit without writing.
[numthreads(128, 1, 1)]
void Vote(uint3 id : SV_DispatchThreadID) {
    uint instance = id.x;
    if (instance >= _GrassDataBuffer.Length) return;

    // Position is taken from elements (0,3), (1,3), (2,3) of the instance matrix.
    float3 worldPos = _GrassDataBuffer[instance].TRS._m03_m13_m23;

    // Debug predicate: keep only instances whose height is above 10 units.
    // Known issue reported by the author: the rendered result loses roughly half of the
    // grass on the right side of the map instead of the blades below 10 units. The number
    // of culled blades may be plausible, but the culled blades are not the ones that
    // fail this height test.
    uint vote = 0;
    if (worldPos.y > 10) {
        vote = 1;
    }

    _VoteBuffer[instance] = vote;
}

// Scan: per-group exclusive prefix sum of _VoteBuffer into _ScanBuffer.
// Each thread owns two consecutive elements, so one group covers 2 * NUM_THREAD_GROUPS_X
// values. The group's total is stored in _GroupSumArray at the group's index.
// Adapted from GPU Gems 3, Chapter 39 'Parallel Prefix Sum (Scan) with CUDA'.
[numthreads(NUM_THREAD_GROUPS_X, 1, 1)]
void Scan(uint3 id : SV_DISPATCHTHREADID, uint groupIndex : SV_GROUPINDEX, uint3 _groupID : SV_GROUPID, uint3 groupThreadID : SV_GROUPTHREADID) {
    int globalThread = (int) id.x;
    int localThread = (int) groupThreadID.x;
    int count = 2 * NUM_THREAD_GROUPS_X;

    // Stage this thread's pair of votes in shared memory.
    int localPair = 2 * localThread;
    int globalPair = 2 * globalThread;
    temp[localPair] = _VoteBuffer[globalPair];
    temp[localPair + 1] = _VoteBuffer[globalPair + 1];

    // Up-sweep: build partial sums in place, doubling the stride each level.
    int stride = 1;
    int active;
    for (active = count >> 1; active > 0; active >>= 1) {
        GroupMemoryBarrierWithGroupSync();

        if (localThread < active) {
            int left = stride * (localPair + 1) - 1;
            int right = left + stride;
            temp[right] += temp[left];
        }

        stride *= 2;
    }

    // The last slot now holds the group total: publish it, then clear it to seed the
    // exclusive scan.
    if (localThread == 0) {
        _GroupSumArray[_groupID.x] = temp[count - 1];
        temp[count - 1] = 0;
    }

    // Down-sweep: push the partial sums back down, halving the stride each level.
    for (active = 1; active < count; active *= 2) {
        stride >>= 1;

        GroupMemoryBarrierWithGroupSync();
        if (localThread < active) {
            int left = stride * (localPair + 1) - 1;
            int right = left + stride;
            uint carried = temp[left];
            temp[left] = temp[right];
            temp[right] += carried;
        }
    }

    GroupMemoryBarrierWithGroupSync();

    // Write this thread's pair of scanned values back out.
    _ScanBuffer[globalPair] = temp[localPair];
    _ScanBuffer[globalPair + 1] = temp[localPair + 1];
}

// ScanGroupSums: exclusive prefix sum over the first _NumOfGroups entries of
// _GroupSumArrayIn, written to _GroupSumArrayOut. The tree scan below halves and doubles
// over _NumOfGroups, so that value is expected to be a power of two no larger than
// 2 * 1024 (the size of grouptemp).
//
// All shared-memory indexing and thread guards use the thread's index WITHIN its group, the
// same way Scan does. Indexing grouptemp with the global thread id is only correct for
// group 0 and reads past the end of the array for any further group.
//
// Adapted from GPU Gems 3, Chapter 39 'Parallel Prefix Sum (Scan) with CUDA'.
[numthreads(1024, 1, 1)]
void ScanGroupSums(uint3 id : SV_DISPATCHTHREADID, uint groupIndex : SV_GROUPINDEX, uint3 _groupID : SV_GROUPID, uint3 groupThreadID : SV_GROUPTHREADID) {
    int tid = (int) id.x;
    int groupTID = (int) groupThreadID.x;

    // Stage this thread's pair of group sums in shared memory.
    int offset = 1;
    grouptemp[2 * groupTID] = _GroupSumArrayIn[2 * tid];
    grouptemp[2 * groupTID + 1] = _GroupSumArrayIn[2 * tid + 1];
    int d;

    // Up-sweep: build partial sums in place.
    for (d = _NumOfGroups >> 1; d > 0; d >>= 1) {
        GroupMemoryBarrierWithGroupSync();

        if (groupTID < d) {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            grouptemp[bi] += grouptemp[ai];
        }

        offset *= 2;
    }

    // Clear the last scanned slot to seed the exclusive scan.
    if (groupTID == 0)
        grouptemp[_NumOfGroups - 1] = 0;

    // Down-sweep: distribute the partial sums.
    for (d = 1; d < _NumOfGroups; d *= 2) {
        offset >>= 1;

        GroupMemoryBarrierWithGroupSync();
        if (groupTID < d) {
            int ai = offset * (2 * groupTID + 1) - 1;
            int bi = offset * (2 * groupTID + 2) - 1;
            uint t = grouptemp[ai];
            grouptemp[ai] = grouptemp[bi];
            grouptemp[bi] += t;
        }
    }

    GroupMemoryBarrierWithGroupSync();

    // Global output index uses the dispatch-wide id; the shared-memory source uses the
    // in-group id.
    _GroupSumArrayOut[2 * tid] = grouptemp[2 * groupTID];
    _GroupSumArrayOut[2 * tid + 1] = grouptemp[2 * groupTID + 1];
}

// Compact: one thread per instance. Every instance whose vote is 1 is copied to
// _CulledGrassOutputBuffer at (its position within its scan block) + (the number of kept
// instances in all earlier scan blocks), and the counter in _ArgsBuffer[1] is incremented.
//
// _GroupSumArray is expected to hold the EXCLUSIVE prefix sum of the per-block totals, so
// entry [g] already equals the number of kept instances in blocks 0..g-1. The offset for
// block g is therefore entry [g]; reading entry [g - 1] would drop the previous block's
// total.
//
// NOTE(review): destination indices never exceed the reading thread's own index, so the
// output buffer must not be the same buffer as _GrassDataBuffer - writing in place races
// with other threads that have not read their element yet.
[numthreads(128, 1, 1)]
void Compact(uint3 id : SV_DISPATCHTHREADID, uint groupIndex : SV_GROUPINDEX, uint3 _groupID : SV_GROUPID, uint3 groupThreadID : SV_GROUPTHREADID) {
    uint tid = id.x;

    // Index of the 128-element scan block this instance belongs to.
    uint groupID = id.x / 128;

    // Block 0 has nothing in front of it.
    uint groupSum = groupID > 0 ? _GroupSumArray[groupID] : 0;
    bool inCamera = _VoteBuffer[id.x] == 1;

    if (inCamera) {
        InterlockedAdd(_ArgsBuffer[1], 1);
        _CulledGrassOutputBuffer[_ScanBuffer[tid] + groupSum] = _GrassDataBuffer[tid];
    }
}

// ResetArgs: single-thread kernel that zeroes element 1 of _ArgsBuffer, the counter that
// Compact increments once per kept instance.
[numthreads(1, 1, 1)]
void ResetArgs(uint3 id : SV_DISPATCHTHREADID) {
    _ArgsBuffer[1] = 0u;
}
Editor is loading...