代码之家  ›  专栏  ›  技术社区  ›  SurvivalMachine

如何修复“缓冲区预加载失败”计算着色器性能问题?

  •  0
  • SurvivalMachine  · 技术社区  · 7 年前

    从应用程序捕获GPU帧时,我在“管道统计信息”下看到以下消息->备注:

    缓冲区预加载失败

    请确保数据大小是 4 字节的倍数并按 4 字节对齐,然后尝试使用简单的访问模式。对于常量缓冲区(constant buffer),请尝试使用固定的缓冲区大小。

    无法提升(promote)pointLightBufferCenterAndRadius - LightCuller.metal:light_culler

    这是我的缓冲区初始化:

    // Capacity of the point light buffer, in Vec4 elements.
    const int MaxLights = 2048;
    // Total allocation size in bytes: one Vec4 per light slot.
    const NSUInteger pointLightBufferByteLength = MaxLights * sizeof( Vec4 );

    // Allocate the buffer on the Metal device with the default CPU cache mode
    // and give it a label so it can be identified in GPU frame captures.
    pointLightCenterAndRadiusBuffer =
        [GfxDevice::GetMetalDevice() newBufferWithLength:pointLightBufferByteLength
                                                 options:MTLResourceCPUCacheModeDefaultCache];
    pointLightCenterAndRadiusBuffer.label = @"pointLightCenterAndRadiusBuffer";
    

    以下是我的着色器的相关部分:

    // Per-tile light culling kernel.
    //
    // Each threadgroup handles one screen tile (indexed by threadgroup_position_in_grid).
    // The kernel:
    //   1. reduces the depth values read by the tile's threads into a shared min/max,
    //   2. builds four side planes for the tile from its corner points,
    //   3. tests every point light and every spot light (as a sphere: xyz = center,
    //      w = radius) against the depth range and the four planes,
    //   4. writes the surviving light indices for this tile to perTileLightIndexBufferOut.
    //
    // Parameters:
    //   depthNormalsTexture              - read-only texture; only its .x channel is used, as depth.
    //   uniforms                         - window size, clipToView / localToView matrices,
    //                                      packed light counts and maxNumLightsPerTile.
    //   pointLightBufferCenterAndRadius  - one float4 per point light.
    //   perTileLightIndexBufferOut       - output; each tile owns a slice starting at
    //                                      maxNumLightsPerTile * tileIdxFlattened.
    //   spotLightBufferCenterAndRadius   - one float4 per spot light.
    //   gid / tid / dtid                 - thread position in grid, thread position in
    //                                      threadgroup, threadgroup position in grid.
    //
    // Output layout per tile (see the write-back section at the end):
    //   [point light indices..., SENTINEL, spot light indices..., SENTINEL]
    //
    // NOTE(review): both light buffers live in the `constant` address space and are indexed
    // with a per-thread runtime index (il); the GPU capture remark quoted with this code
    // names pointLightBufferCenterAndRadius — whether this access pattern is the cause is
    // not established here.
    kernel void light_culler(texture2d<float, access::read> depthNormalsTexture [[texture(0)]],
                             constant Uniforms& uniforms [[ buffer(0) ]],
                             constant float4* pointLightBufferCenterAndRadius [[ buffer(1) ]],
                             device uint* perTileLightIndexBufferOut [[ buffer(2) ]],
                             constant float4* spotLightBufferCenterAndRadius [[ buffer(3) ]],
                             uint2 gid [[thread_position_in_grid]],
                             uint2 tid [[thread_position_in_threadgroup]],
                             uint2 dtid [[threadgroup_position_in_grid]])
    {
        // Threadgroup-shared state: the tile's light index list, its element counter,
        // and the depth bounds (stored as the uint bit pattern of a float).
        threadgroup uint ldsLightIdx[ MAX_NUM_LIGHTS_PER_TILE ];
        threadgroup atomic_uint ldsZMax;
        threadgroup atomic_uint ldsZMin;
        threadgroup atomic_uint ldsLightIdxCounter;
    
        uint2 globalIdx = gid;
        uint2 localIdx = tid;
        uint2 groupIdx = dtid;
    
        // Linear thread index inside the tile and linear tile index inside the grid of tiles.
        uint localIdxFlattened = localIdx.x + localIdx.y * TILE_RES;
        uint tileIdxFlattened = groupIdx.x + groupIdx.y * GetNumTilesX( uniforms.windowWidth );
    
        // Only the first thread of the tile resets the shared atomics.
        if (localIdxFlattened == 0)
        {
            atomic_store_explicit( &ldsZMin, 0x7f7fffff, memory_order_relaxed ); // FLT_MAX as uint
            atomic_store_explicit( &ldsZMax, 0, memory_order_relaxed );
            atomic_store_explicit( &ldsLightIdxCounter, 0, memory_order_relaxed );
        }
    
        // Four side planes of this tile. Every thread computes its own copy.
        float4 frustumEqn[ 4 ];
        {
            // Pixel coordinates of the tile's min (m) and max (p) edges.
            uint pxm = TILE_RES * groupIdx.x;
            uint pym = TILE_RES * groupIdx.y;
            uint pxp = TILE_RES * (groupIdx.x + 1);
            uint pyp = TILE_RES * (groupIdx.y + 1);
    
            // Window size expressed as a whole number of tiles times TILE_RES.
            float winWidth  = float( TILE_RES * GetNumTilesX( uniforms.windowWidth ) );
            float winHeight = float( TILE_RES * GetNumTilesY( uniforms.windowHeight) );
    
            // Tile corners mapped to [-1, 1] in x and y (y is flipped), with z = w = 1.
            float4 v0 = float4( pxm / winWidth * 2.0f - 1.0f, (winHeight - pym) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
            float4 v1 = float4( pxp / winWidth * 2.0f - 1.0f, (winHeight - pym) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
            float4 v2 = float4( pxp / winWidth * 2.0f - 1.0f, (winHeight - pyp) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
            float4 v3 = float4( pxm / winWidth * 2.0f - 1.0f, (winHeight - pyp) / winHeight * 2.0f - 1.0f, 1.0f, 1.0f );
    
            // Corners transformed with uniforms.clipToView (helper defined elsewhere).
            float4 frustum[ 4 ];
            frustum[ 0 ] = ConvertClipToView( v0, uniforms.clipToView );
            frustum[ 1 ] = ConvertClipToView( v1, uniforms.clipToView );
            frustum[ 2 ] = ConvertClipToView( v2, uniforms.clipToView );
            frustum[ 3 ] = ConvertClipToView( v3, uniforms.clipToView );
    
            // One plane per pair of adjacent corners; (i + 1) & 3 wraps 3 back to 0.
            for (uint i = 0; i < 4; ++i)
            {
                frustumEqn[ i ] = CreatePlaneEquation( frustum[ i ], frustum[ (i + 1) & 3 ] );
            }
        }
    
        // Wait until thread 0's initial values are in place before any thread updates them.
        threadgroup_barrier( mem_flags::mem_threadgroup );
    
        // These initial values are overwritten below before they are read.
        float minZ = FLT_MAX;
        float maxZ = 0.0f;
    
        float depth = depthNormalsTexture.read( globalIdx.xy ).x;
    
        // Reinterpret the float's bits as uint so the uint atomics can be used on it.
        uint z = as_type< uint >( depth );
    
        // Texels whose depth is exactly 0 do not contribute to the tile's depth bounds.
        if (depth != 0.0f)
        {
            /*uint i =*/ atomic_fetch_min_explicit( &ldsZMin, z, memory_order::memory_order_relaxed );
            /*uint j =*/ atomic_fetch_max_explicit( &ldsZMax, z, memory_order::memory_order_relaxed );
        }
    
        // Wait until every thread in the tile has contributed its depth.
        threadgroup_barrier( mem_flags::mem_threadgroup );
    
        uint zMin = atomic_load_explicit( &ldsZMin, memory_order::memory_order_relaxed );
        uint zMax = atomic_load_explicit( &ldsZMax, memory_order::memory_order_relaxed );
        // NOTE(review): minZ is taken from zMax and maxZ from zMin (swapped). This may be
        // deliberate if the stored depth is negative (uint ordering of negative floats is
        // reversed) — confirm against the depth convention of depthNormalsTexture.
        minZ = as_type< float >( zMax );
        maxZ = as_type< float >( zMin );
    
        // The low 16 bits of uniforms.numLights hold the point light count.
        int numPointLights = uniforms.numLights & 0xFFFFu;
    
        // Each thread tests the lights at indices localIdxFlattened, + NUM_THREADS_PER_TILE, ...
        for (int i = 0; i < numPointLights; i += NUM_THREADS_PER_TILE)
        {
            int il = localIdxFlattened + i;
    
            if (il < numPointLights)
            {
                float4 center = pointLightBufferCenterAndRadius[ il ];
                float radius = center.w;
                // Transform the light center with uniforms.localToView.
                center.xyz = (uniforms.localToView * float4( center.xyz, 1.0f ) ).xyz;
    
                // Depth range test against the tile's min/max depth.
                if (-center.z + minZ < radius && center.z - maxZ < radius)
                {
                    // The light is kept only if its signed distance to every plane is below its radius.
                    if ((GetSignedDistanceFromPlane( center, frustumEqn[ 0 ] ) < radius) &&
                        (GetSignedDistanceFromPlane( center, frustumEqn[ 1 ] ) < radius) &&
                        (GetSignedDistanceFromPlane( center, frustumEqn[ 2 ] ) < radius) &&
                        (GetSignedDistanceFromPlane( center, frustumEqn[ 3 ] ) < radius))
                    {
                        // do a thread-safe increment of the list counter
                        // and put the index of this light into the list
                        // NOTE(review): dstIdx is not checked against MAX_NUM_LIGHTS_PER_TILE
                        // before indexing ldsLightIdx — confirm the list cannot overflow.
                        int dstIdx = atomic_fetch_add_explicit( &ldsLightIdxCounter, 1, memory_order::memory_order_relaxed );
                        ldsLightIdx[ dstIdx ] = il;
                    }
                }
            }
        }
    
        // Wait until all point lights have been appended before reading the counter.
        threadgroup_barrier( mem_flags::mem_threadgroup );
    
        // Counter value at this point = number of point lights in the tile's list.
        int numPointLightsInThisTile = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
    
        // Spot lights.
        // The high 16 bits of uniforms.numLights hold the spot light count.
        int numSpotLights = (uniforms.numLights & 0xFFFF0000u) >> 16;
    
        // Same per-thread striding and tests as for point lights; surviving spot light
        // indices are appended to the same list, after the point light entries.
        for (int i = 0; i < numSpotLights; i += NUM_THREADS_PER_TILE)
        {
            int il = localIdxFlattened + i;
    
            if (il < numSpotLights)
            {
                float4 center = spotLightBufferCenterAndRadius[ il ];
                float radius = center.w * 5.0f; // FIXME: Multiply was added, but more clever culling should be done instead.
                center.xyz = (uniforms.localToView * float4( center.xyz, 1.0f )).xyz;
    
                if (-center.z + minZ < radius && center.z - maxZ < radius)
                {
                    if ((GetSignedDistanceFromPlane( center, frustumEqn[ 0 ] ) < radius) &&
                        (GetSignedDistanceFromPlane( center, frustumEqn[ 1 ] ) < radius) &&
                        (GetSignedDistanceFromPlane( center, frustumEqn[ 2 ] ) < radius) &&
                        (GetSignedDistanceFromPlane( center, frustumEqn[ 3 ] ) < radius))
                    {
                        // NOTE(review): same unchecked write into ldsLightIdx as in the point light loop.
                        int dstIdx = atomic_fetch_add_explicit( &ldsLightIdxCounter, 1, memory_order::memory_order_relaxed );
                        ldsLightIdx[ dstIdx ] = il;
                    }
                }
            }
        }
        // Wait until all spot lights have been appended before writing the list out.
        threadgroup_barrier( mem_flags::mem_threadgroup );
    
        {   // write back
            // First element of this tile's slice in the output buffer.
            int startOffset = uniforms.maxNumLightsPerTile * tileIdxFlattened;
    
            // Point light indices go to [startOffset, startOffset + numPointLightsInThisTile).
            for (int i = localIdxFlattened; i < numPointLightsInThisTile; i += NUM_THREADS_PER_TILE)
            {
                // per-tile list of light indices
                perTileLightIndexBufferOut[ startOffset + i ] = ldsLightIdx[ i ];
            }
    
            // Spot light indices are shifted by one slot (+ 1) to leave room for the
            // sentinel that terminates the point light section.
            int jMax = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
            for (int j = localIdxFlattened + numPointLightsInThisTile; j < jMax; j += NUM_THREADS_PER_TILE)
            {
                // per-tile list of light indices
                perTileLightIndexBufferOut[ startOffset + j + 1 ] = ldsLightIdx[ j ];
            }
    
            // A single thread writes the two sentinels: one after the point light
            // indices and one after the spot light indices.
            if (localIdxFlattened == 0)
            {
                perTileLightIndexBufferOut[ startOffset + numPointLightsInThisTile ] = LIGHT_INDEX_BUFFER_SENTINEL;
    
                int offs = atomic_load_explicit( &ldsLightIdxCounter, memory_order::memory_order_relaxed );
                perTileLightIndexBufferOut[ startOffset + offs + 1 ] = LIGHT_INDEX_BUFFER_SENTINEL;
            }
        }
    }
    

    我正在使用 iOS 11.4 和 Xcode 9.4,在 10.5 英寸 iPad Pro 上调试这个应用程序。如何修复这个警告?

    我还尝试把缓冲区的类型从 constant float4* 改为 constant PointLight& pointLightBufferCenterAndRadius,其中 PointLight 定义为 struct PointLight { float4 d[ 2048 ]; },这是苹果的 Metal WWDC talk 中建议的做法。

    1 回复  |  直到 7 年前
        1
  •  0
  •   endomachi    7 年前

    此警告通常并不意味着灾难性的性能损失。因此,解决这个问题可能不会带来很大的收益,您也许更应该考虑优化内核的其他部分。

    避免这种情况的主要方法是在顶点着色器或计算内核中使用 [[stage_in]] 输入来获取逐顶点/逐线程的数据。但这并不总是可行,取决于所使用的算法,因为您可能无法像 [[stage_in]] 输入所要求的那样“按顺序”访问数据。

    推荐文章