我发现可以通过使用 Metal 计算管线（compute pipeline）来提高应用程序的性能。然而，我最初的测试显示，计算管线的速度非常慢（至少在旧设备上是如此）。
所以我做了一个示例项目来比较计算和渲染管道的性能。该程序获取2048 x 2048源纹理,并将其转换为目标纹理中的灰度。
在 iPhone 5S 上，片段着色器需要 3 毫秒才能完成转换。然而，计算内核做同样的事情需要 177 毫秒，耗时是前者的 59 倍！
您对旧设备上的计算管道有什么经验?速度不是很慢吗?
以下是我的片段和计算函数:
// Grayscale fragment function.
// Samples inTexture at in.textureCoordinate, reduces the RGB value to a single
// gray level via a dot product with kRec709Luma (defined elsewhere; presumably
// the Rec. 709 luma weights -- confirm), and returns it as an opaque pixel.
fragment half4 grayscaleFragment(RasterizerData in [[stage_in]],
                                 texture2d<half> inTexture [[texture(0)]])
{
    // Default-constructed sampler, declared inline in the shader.
    constexpr sampler textureSampler;

    const half3 rgb = inTexture.sample(textureSampler, in.textureCoordinate).rgb;
    const half luma = dot(rgb, kRec709Luma);

    // Same gray level in all three color channels, alpha fixed at 1.
    return half4(luma, luma, luma, 1.0);
}
// Grayscale kernel function.
// Each thread converts the pixel at its grid position: it reads inTexture at
// gid, reduces the RGB value to a gray level via a dot product with
// kRec709Luma (defined elsewhere), and writes an opaque gray pixel to
// outTexture at the same position.
kernel void grayscaleKernel(uint2 gid [[thread_position_in_grid]],
                            texture2d<half, access::read> inTexture [[texture(0)]],
                            texture2d<half, access::write> outTexture [[texture(1)]])
{
    // The host dispatches whole threadgroups and rounds the count up, so when
    // the texture size is not a multiple of the threadgroup size some threads
    // land outside the textures. Those threads must not read or write.
    const uint width = min(inTexture.get_width(), outTexture.get_width());
    const uint height = min(inTexture.get_height(), outTexture.get_height());
    if (gid.x >= width || gid.y >= height) {
        return;
    }

    half4 inColor = inTexture.read(gid);
    half gray = dot(inColor.rgb, kRec709Luma);
    outTexture.write(half4(gray, gray, gray, 1.0), gid);
}
计算和渲染方法
// Runs the grayscale conversion through the compute pipeline: encodes a single
// dispatch that reads _srcTexture and writes _dstTexture, commits it, and
// blocks the calling thread until the command buffer has completed.
- (void)compute {
    id<MTLCommandBuffer> buffer = [_commandQueue commandBuffer];

    id<MTLComputeCommandEncoder> encoder = [buffer computeCommandEncoder];
    [encoder setComputePipelineState:_computePipelineState];

    // Indices correspond to the [[texture(n)]] arguments of grayscaleKernel:
    // 0 is the input texture, 1 is the output texture.
    [encoder setTexture:_dstTexture atIndex:1];
    [encoder setTexture:_srcTexture atIndex:0];

    [encoder dispatchThreadgroups:_threadgroupCount
            threadsPerThreadgroup:_threadgroupSize];
    [encoder endEncoding];

    // Submit and wait synchronously so the caller can time the whole pass.
    [buffer commit];
    [buffer waitUntilCompleted];
}
// Runs the grayscale conversion through the render pipeline: draws a
// four-vertex triangle strip into _dstTexture with _srcTexture bound to the
// fragment stage, commits the work, and blocks until it has completed.
- (void)render {
    id<MTLCommandBuffer> buffer = [_commandQueue commandBuffer];

    // One color attachment targeting the destination texture. Its previous
    // contents are not loaded (DontCare) and the rendered result is stored.
    MTLRenderPassDescriptor *passDescriptor = [MTLRenderPassDescriptor renderPassDescriptor];
    MTLRenderPassColorAttachmentDescriptor *colorAttachment = passDescriptor.colorAttachments[0];
    colorAttachment.texture = _dstTexture;
    colorAttachment.loadAction = MTLLoadActionDontCare;
    colorAttachment.storeAction = MTLStoreActionStore;

    id<MTLRenderCommandEncoder> encoder =
        [buffer renderCommandEncoderWithDescriptor:passDescriptor];
    [encoder setRenderPipelineState:_renderPipelineState];
    // Index 0 corresponds to the [[texture(0)]] argument of grayscaleFragment.
    [encoder setFragmentTexture:_srcTexture atIndex:0];
    [encoder drawPrimitives:MTLPrimitiveTypeTriangleStrip
                vertexStart:0
                vertexCount:4];
    [encoder endEncoding];

    // Submit and wait synchronously so the caller can time the whole pass.
    [buffer commit];
    [buffer waitUntilCompleted];
}
以及 Metal 的初始化设置：
// Creates every Metal object used by -compute and -render: the device, the
// command queue, the compute and render pipeline states, the 2048 x 2048
// source and destination textures, and the compute threadgroup size/count.
// If the device, the kernel function, or either pipeline state cannot be
// created, the failure is logged and the method returns early, leaving the
// remaining ivars unset.
- (void)setupMetal
{
    // Get metal device
    _device = MTLCreateSystemDefaultDevice();
    if (_device == nil) {
        NSLog(@"setupMetal: no Metal device is available");
        return;
    }

    // Create the command queue
    _commandQueue = [_device newCommandQueue];

    id<MTLLibrary> defaultLibrary = [_device newDefaultLibrary];

    // Create compute pipeline state
    id<MTLFunction> kernelFunction = [defaultLibrary newFunctionWithName:@"grayscaleKernel"];
    if (kernelFunction == nil) {
        NSLog(@"setupMetal: function 'grayscaleKernel' not found in the default library");
        return;
    }
    NSError *error = nil;
    _computePipelineState = [_device newComputePipelineStateWithFunction:kernelFunction
                                                                   error:&error];
    if (_computePipelineState == nil) {
        // Without a pipeline state threadExecutionWidth would read as 0 and the
        // threadgroup arithmetic below would divide by zero.
        NSLog(@"setupMetal: failed to create compute pipeline state: %@", error);
        return;
    }

    // Create render pipeline state
    MTLRenderPipelineDescriptor *pipelineStateDescriptor = [[MTLRenderPipelineDescriptor alloc] init];
    pipelineStateDescriptor.vertexFunction = [defaultLibrary newFunctionWithName:@"vertexShader"];
    pipelineStateDescriptor.fragmentFunction = [defaultLibrary newFunctionWithName:@"grayscaleFragment"];
    pipelineStateDescriptor.colorAttachments[0].pixelFormat = MTLPixelFormatBGRA8Unorm;
    error = nil;
    _renderPipelineState = [_device newRenderPipelineStateWithDescriptor:pipelineStateDescriptor
                                                                   error:&error];
    if (_renderPipelineState == nil) {
        NSLog(@"setupMetal: failed to create render pipeline state: %@", error);
        return;
    }

    // Create source and destination texture descriptor.
    // The threadgroup count computed below is rounded up to whole threadgroups,
    // so unless the kernel bounds-checks its grid position, keep the texture
    // width a multiple of threadExecutionWidth and the height a multiple of
    // (maxTotalThreadsPerThreadgroup / threadExecutionWidth).
    MTLTextureDescriptor *textureDescriptor =
        [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatBGRA8Unorm
                                                           width:2048
                                                          height:2048
                                                       mipmapped:NO];

    // Create source texture
    textureDescriptor.usage = MTLTextureUsageShaderRead;
    _srcTexture = [_device newTextureWithDescriptor:textureDescriptor];

    // Create destination texture: written by the compute kernel and used as
    // the color attachment of the render pass.
    textureDescriptor.usage = MTLTextureUsageShaderWrite | MTLTextureUsageRenderTarget;
    _dstTexture = [_device newTextureWithDescriptor:textureDescriptor];

    // Set the compute kernel's threadgroup size: one execution width wide, and
    // as tall as the pipeline's per-threadgroup thread limit allows.
    NSUInteger threadWidth = _computePipelineState.threadExecutionWidth;
    NSUInteger threadMax = _computePipelineState.maxTotalThreadsPerThreadgroup;
    _threadgroupSize = MTLSizeMake(threadWidth, threadMax / threadWidth, 1);

    // Set the compute kernel's threadgroup count (ceiling division so the
    // whole texture is covered).
    _threadgroupCount.width = (_srcTexture.width + _threadgroupSize.width - 1) / _threadgroupSize.width;
    _threadgroupCount.height = (_srcTexture.height + _threadgroupSize.height - 1) / _threadgroupSize.height;
    _threadgroupCount.depth = 1;
}