奇怪的计算着色器延迟

Posted

技术标签:

【中文标题】奇怪的计算着色器延迟【英文标题】:Weird compute shader latency 【发布时间】:2021-02-01 17:09:10 【问题描述】:

我正在尝试通过计算着色器进行视锥体剔除。为此,我有一对用于实例化顶点属性的缓冲区,以及一对用于间接绘制命令的缓冲区。我的计算着色器检查第一个缓冲区中的实例坐标是否位于包围体内(实例数量取自第一个绘制缓冲区),用 subgroupBallot 和 bitCount 求出子组内的偏移量,再加上其他子组的结果和全局偏移量,最后把结果存入第二个缓冲区。全局偏移量存储在第二个间接绘制缓冲区中。

问题在于,在负载下,视锥体可能比移动中的相机滞后几帧 (>1),画面边缘会出现一条很宽的、物体消失的带状区域。我觉得这很奇怪,因为剔除和渲染是在同一个命令缓冲区中完成的。

当在 RenderDoc 中进行捕获、用 Alt+PrintScreen 截屏或暂停渲染/呈现线程时,一切都会立刻恢复到应有的状态。

我唯一的猜测是,即使已经开始绘制新帧,之前帧的计算着色器仍在继续执行,尽管有管线屏障,这本不应该发生。

着色器代码:

#version 460

#extension GL_KHR_shader_subgroup_ballot : require

// One indirect indexed draw record: five consecutive uints.
struct drawData{
    uint indexCount;
    uint instanceCount;
    uint firstIndex;
    uint vertexOffset;
    uint firstInstance;
};

// Per-instance record. main() reads x, y, z as the instance position;
// a..d are only copied along with the rest of the record.
struct instanceData{
    float x, y, z;
    float a, b, c, d;
};

layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Camera data. main() uses camPos.xyz, the xyz parts of l/r/t/b as the four
// frustum plane vectors, and l.w as the maximum distance; cam is not read here.
layout(set = 0, binding = 0) uniform A
{
    mat4 cam;
    vec4 camPos;
    vec4 l;
    vec4 t;
    vec4 r;
    vec4 b;
};

// Draw[0] is the source draw data, Draw[1] receives the culled result.
layout(set = 0, binding = 1) buffer B
{
    uint count;
    drawData data[];
} Draw[2];

// Instance[0] holds every instance, Instance[1] receives the ones that pass.
layout(set = 0, binding = 2) buffer C
{
    instanceData data[];
} Instance[2];

// Number of passing instances per subgroup for the current patch,
// indexed by gl_SubgroupID.
shared uint offsetsM[32];

// Culls the instances of Draw[0].data[0] against a distance limit and four
// planes, compacts the survivors into Instance[1] and writes the surviving
// count into Draw[1].data[0].instanceCount.
void main()
{
    const uint gID = gl_LocalInvocationID.x;
    const uint lID = gl_SubgroupInvocationID;
    const uint patchSize = gl_WorkGroupSize.x;

    Draw[1].data[0] = Draw[0].data[0];//copy data like index count
    Draw[1].count = Draw[0].count;

    uint offsetG = 0;//accumulating offset within end buffer

    // NOTE(review): the integer division drops a trailing partial patch, so
    // the posa<instanceCount test below can never fail - confirm this is intended.
    uint loops = Draw[0].data[0].instanceCount/patchSize;//constant loop count
    for(uint i = 0; i<loops;++i){
        uint posa = i*patchSize+gID;//runs better this way for some reason

        vec3   pos  = camPos.xyz-vec3(Instance[0].data[posa].x, Instance[0].data[posa].y, Instance[0].data[posa].z);//position relative to camera
        mat4x3 lrtb = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);
        // NOTE(review): Model is not declared anywhere in this listing.
        vec4   dist = pos*lrtb+Model.data[0].rad;//dot products and radius tolerance
        bool   Pass = posa<Draw[0].data[0].instanceCount&&//is real
                     (dot(pos, pos)<l.w*l.w)            &&//not too far
                  all(greaterThan(dist, vec4(0)));        //within view frustum

        subgroupBarrier();//no idea what is the best, put what works
        uvec4 actives = subgroupBallot(Pass);//count passed instances
        // One invocation per subgroup publishes that subgroup's pass count.
        // Only the .x and .y ballot words are counted, i.e. up to 64 lanes.
        if(subgroupElect())
            offsetsM[gl_SubgroupID] = bitCount(actives).x+bitCount(actives).y;
        barrier();

        uint offsetL = bitCount(actives&gl_SubgroupLtMask).x+bitCount(actives&gl_SubgroupLtMask).y;//offset withing subgroup
        uint ii = 0;
        // Both branches add every subgroup's count to offsetG, so all
        // invocations leave the iteration with the same running total.
        // NOTE(review): there is no barrier between these reads of offsetsM
        // and the write at the top of the next iteration - confirm this
        // cannot race.
        if(Pass){
            for(; ii<gl_SubgroupID; ++ii)
                offsetG+= offsetsM[ii];//offsets before subgroup
            Instance[1].data[offsetG+offsetL] = Instance[0].data[posa];
            for(; ii<gl_NumSubgroups; ++ii)
                offsetG+= offsetsM[ii];//offsets after subgroup
        }else for(; ii<gl_NumSubgroups; ++ii)
                offsetG+= offsetsM[ii];//same but no data copying
    }

    // A single invocation publishes the number of surviving instances.
    if(gID == 0)
        Draw[1].data[0].instanceCount = offsetG;
}

对于计算之后的渲染通道,我设置了以下子通道依赖:

// Dependency 1: external (pre-render-pass) compute-shader writes must be
// available before subpass 0 reads indirect draw commands.
deps[1].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[1].dstSubpass = 0;
deps[1].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[1].dstStageMask = VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
deps[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[1].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
deps[1].dependencyFlags = 0;

// Dependency 2: external (pre-render-pass) compute-shader writes must be
// available before subpass 0 reads vertex attributes.
deps[2].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[2].dstSubpass = 0;
deps[2].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[2].dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
deps[2].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[2].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
deps[2].dependencyFlags = 0;

命令缓冲区(按原样完全重复使用,交换链中的每个图像一个):

vkBeginCommandBuffer(cmd, &begInfo);

    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layoutsPipe[1],
                            0, 1, &descs[1], 0, 0);
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipes[1]);
    vkCmdDispatch(cmd, 1, 1, 1);

    VkBufferMemoryBarrier bufMemBar[2];
    //mem bars
        //0 indirect
            bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
            bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
            bufMemBar[0].buffer = bufferIndirect;
            bufMemBar[0].offset = 0;
            bufMemBar[0].size   = -1;
        
        //1 vertex instance
            bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
            bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
            bufMemBar[1].buffer = bufferInstance;
            bufMemBar[1].offset = 0;
            bufMemBar[1].size   = -1;
        
    
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, 0, 0, 0, 1, &bufMemBar[0], 0, 0);
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         VK_PIPELINE_STAGE_VERTEX_INPUT_BIT , 0, 0, 0, 1, &bufMemBar[1], 0, 0);

    VkRenderPassBeginInfo passBegInfo;
    passBegInfo.renderPass  = pass;
    passBegInfo.framebuffer = chain.frames[i];
    passBegInfo.renderArea  = 0, 0, chain.dim;
        VkClearValue clears[2]0,0;
    passBegInfo.clearValueCount = 2;
    passBegInfo.pClearValues    = clears;
vkCmdBeginRenderPass(cmd, &passBegInfo, VK_SUBPASS_CONTENTS_INLINE);
    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layoutsPipe[0], 0, 1, &descs[0], 0, 0);
    vkCmdBindPipeline      (cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipes[0]);
        VkBuffer     buffersVertex[2]bufferVertexProto, bufferInstance;
        VkDeviceSize offsetsVertex[2]0, 0;
    vkCmdBindVertexBuffers(cmd, 0, 2, buffersVertex, offsetsVertex);
    vkCmdBindIndexBuffer  (cmd, bufferIndex, 0, VK_INDEX_TYPE_UINT32);

    vkCmdDrawIndexedIndirectCount(cmd, bufferIndirect, 0+4,
                                       bufferIndirect, 0,
                                  count.maxDraws, sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);

vkEndCommandBuffer(cmd);

渲染和呈现通过两个信号量同步 - imageAvailable 和 renderFinished。视锥体在 CPU 上按正确的顺序计算。验证层已启用。

【问题讨论】:

【参考方案1】:

问题在于我缺少主机端同步。事实上,即使在同一个命令缓冲区中,也没有任何主机同步保证(这是有道理的,因为正是这样我们才能使用事件)。

【讨论】:

以上是关于奇怪的计算着色器延迟的主要内容,如果未能解决你的问题,请参考以下文章

FBO 的延迟着色器纹理显示为黑色

GLSL 计算着色器闪烁块/正方形伪影

使用制服时 Xamarin OpenGL 片段着色器的奇怪行为

OpenGL:为啥我不能将单个浮点数从顶点着色器传递到片段着色器?

使用两个着色器时的opengl奇怪行为

OpenGL顶点着色器:奇怪的矩阵转换