奇怪的计算着色器延迟
Posted
技术标签:
【中文标题】奇怪的计算着色器延迟【英文标题】:Weird compute shader latency 【发布时间】:2021-02-01 17:09:10 【问题描述】:我正在尝试通过计算着色器进行平截头体剔除。为此,我有一对用于实例化顶点属性的缓冲区,以及一对用于间接绘制命令的缓冲区。我的计算着色器检查来自第一个缓冲区的实例坐标是否在边界体积内,引用第一个绘制缓冲区的计数,subgroupBallot
和bitCount
以查看子组内的偏移量,然后添加来自其他子组的结果和全局偏移量,最后存储导致第二个缓冲区。全局偏移量存储在第二个间接绘制缓冲区中。
问题在于,在负载下,视锥体可能比移动相机晚几 (>1) 帧,边缘上消失的物体有很宽的线条。我觉得这很奇怪,因为剔除和渲染是在同一个命令缓冲区中完成的。
当在 renderdoc 中进行捕获、获取屏幕截图 alt+printScreen 或暂停呈现呈现线程时,一切都会恢复到应有的状态。
我唯一的猜测是,即使开始绘制新帧,过去帧的计算着色器也会继续执行,尽管由于管道障碍,这不应该发生。
着色器代码:
#version 460
#extension GL_KHR_shader_subgroup_ballot : require
struct drawData
uint indexCount;
uint instanceCount;
uint firstIndex;
uint vertexOffset;
uint firstInstance;
;
struct instanceData
float x, y, z;
float a, b, c, d;
;
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
layout(set = 0, binding = 0) uniform A
mat4 cam;
vec4 camPos;
vec4 l;
vec4 t;
vec4 r;
vec4 b;
;
layout(set = 0, binding = 1) buffer B
uint count;
drawData data[];
Draw[2];
layout(set = 0, binding = 2) buffer C
instanceData data[];
Instance[2];
shared uint offsetsM[32];
void main()
const uint gID = gl_LocalInvocationID.x;
const uint lID = gl_SubgroupInvocationID;
const uint patchSize = gl_WorkGroupSize.x;
Draw[1].data[0] = Draw[0].data[0];//copy data like index count
Draw[1].count = Draw[0].count;
uint offsetG = 0;//accumulating offset within end buffer
uint loops = Draw[0].data[0].instanceCount/patchSize;//constant loop count
for(uint i = 0; i<loops;++i)
uint posa = i*patchSize+gID;//runs better this way for some reason
vec3 pos = camPos.xyz-vec3(Instance[0].data[posa].x, Instance[0].data[posa].y, Instance[0].data[posa].z);//position relative to camera
mat4x3 lrtb = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);
vec4 dist = pos*lrtb+Model.data[0].rad;//dot products and radius tolerance
bool Pass = posa<Draw[0].data[0].instanceCount&&//is real
(dot(pos, pos)<l.w*l.w) &&//not too far
all(greaterThan(dist, vec4(0))); //within view frustum
subgroupBarrier();//no idea what is the best, put what works
uvec4 actives = subgroupBallot(Pass);//count passed instances
if(subgroupElect())
offsetsM[gl_SubgroupID] = bitCount(actives).x+bitCount(actives).y;
barrier();
uint offsetL = bitCount(actives&gl_SubgroupLtMask).x+bitCount(actives&gl_SubgroupLtMask).y;//offset withing subgroup
uint ii = 0;
if(Pass)
for(; ii<gl_SubgroupID; ++ii)
offsetG+= offsetsM[ii];//offsets before subgroup
Instance[1].data[offsetG+offsetL] = Instance[0].data[posa];
for(; ii<gl_NumSubgroups; ++ii)
offsetG+= offsetsM[ii];//offsets after subgroup
else for(; ii<gl_NumSubgroups; ++ii)
offsetG+= offsetsM[ii];//same but no data copying
if(gID == 0)
Draw[1].data[0].instanceCount = offsetG;
对于计算后的渲染通道,我有依赖项:
//1
deps[1].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[1].dstSubpass = 0;
deps[1].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[1].dstStageMask = VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
deps[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[1].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
deps[1].dependencyFlags = 0;
//2
deps[2].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[2].dstSubpass = 0;
deps[2].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[2].dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
deps[2].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
deps[2].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
deps[2].dependencyFlags = 0;
命令缓冲区(按原样完全重复使用,交换链中的每个图像一个):
vkBeginCommandBuffer(cmd, &begInfo);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layoutsPipe[1],
0, 1, &descs[1], 0, 0);
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipes[1]);
vkCmdDispatch(cmd, 1, 1, 1);
VkBufferMemoryBarrier bufMemBar[2];
//mem bars
//0 indirect
bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
bufMemBar[0].buffer = bufferIndirect;
bufMemBar[0].offset = 0;
bufMemBar[0].size = -1;
//1 vertex instance
bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
bufMemBar[1].buffer = bufferInstance;
bufMemBar[1].offset = 0;
bufMemBar[1].size = -1;
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, 0, 0, 0, 1, &bufMemBar[0], 0, 0);
vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT , 0, 0, 0, 1, &bufMemBar[1], 0, 0);
VkRenderPassBeginInfo passBegInfo;
passBegInfo.renderPass = pass;
passBegInfo.framebuffer = chain.frames[i];
passBegInfo.renderArea = 0, 0, chain.dim;
VkClearValue clears[2]0,0;
passBegInfo.clearValueCount = 2;
passBegInfo.pClearValues = clears;
vkCmdBeginRenderPass(cmd, &passBegInfo, VK_SUBPASS_CONTENTS_INLINE);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layoutsPipe[0], 0, 1, &descs[0], 0, 0);
vkCmdBindPipeline (cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipes[0]);
VkBuffer buffersVertex[2]bufferVertexProto, bufferInstance;
VkDeviceSize offsetsVertex[2]0, 0;
vkCmdBindVertexBuffers(cmd, 0, 2, buffersVertex, offsetsVertex);
vkCmdBindIndexBuffer (cmd, bufferIndex, 0, VK_INDEX_TYPE_UINT32);
vkCmdDrawIndexedIndirectCount(cmd, bufferIndirect, 0+4,
bufferIndirect, 0,
count.maxDraws, sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);
vkEndCommandBuffer(cmd);
渲染和呈现与两个信号量同步 - imageAvailable 和 renderFinished。截锥体计算在 CPU 上的顺序正确。验证层已启用。
【问题讨论】:
【参考方案1】:问题是我缺少主机同步。事实上,即使在同一个命令缓冲区中,也没有主机同步保证(这是有道理的,因为它使我们能够使用事件)。
【讨论】:
以上是关于奇怪的计算着色器延迟的主要内容,如果未能解决你的问题,请参考以下文章
使用制服时 Xamarin OpenGL 片段着色器的奇怪行为