为啥这个微不足道的 opengl 程序使用 100% cpu?
Posted
技术标签:
【中文标题】为啥这个微不足道的 opengl 程序使用 100% cpu?【英文标题】:Why this trivial opengl program uses 100% cpu?为什么这个微不足道的 opengl 程序使用 100% cpu? 【发布时间】:2021-04-21 13:41:51 【问题描述】:以下 opengl 代码专门设计为非常依赖 GPU,这会迫使 CPU 在 GPU 完成工作时等待一段时间。特别是,它在glFinish()
调用中执行此操作,在该调用中,CPU 等待每帧时间的测量时间为 99.87%。该程序在我的系统(Windows 10,gtx1070)上以 10 fps 的速度运行,并且禁用了 Vsync。这一切都在意料之中,如果不是因为 CPU 应该等待的时候,它莫名其妙地使用了 100% 的 CPU 时间,导致过热。
在 6 个使用 intel gpus、4 个使用 amd gpus 和 5 个使用 nvidia gpus 的系统上测试后,只有那些使用 nvidia gpus 的系统有问题。到目前为止,我所能得出的结论是,这个问题是特定于 nvidia 和特定于 opengl 的。 Directx 应用程序没有显示该问题,事实上,可以通过在禁用 ANGLE 的情况下运行 gpu-maxing webgl 页面在 Firefox 上重现此问题(启用角度时不会发生)。
我编译如下:
C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0\main.c -o %~dp0\main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin\ -LC:\mingw64\lib\ -IC:\mingw64\include\ -lgdi32 -lopengl32
最少的代码(我建议调整片段着色器,使您也达到 10 fps 左右,使问题更加明显):
#include <windows.h>
#include <GL/gl.h>
typedef signed long long int GLsizeiptr;
typedef char GLchar;
#define GL_ARRAY_BUFFER 0x8892
#define GL_DYNAMIC_DRAW 0x88E8
#define GL_FRAGMENT_SHADER 0x8B30
#define GL_VERTEX_SHADER 0x8B31
/* Minimal window procedure: forwards every message to the default handler.
 * NOTE(review): the scraped listing had its braces stripped; restored here. */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
{
    return DefWindowProc(hwnd, msg, wParam, lParam);
}
/*
 * Minimal Win32 + OpenGL demo: draws one fullscreen quad per frame with a
 * deliberately expensive fragment shader, then blocks in glFinish(). On the
 * author's NVIDIA system the blocking wait busy-spins, pinning a CPU core.
 *
 * NOTE(review): the scraped listing had every '{'/'}' stripped (including
 * inside the GLSL string literals); braces restored so the code compiles.
 * Error checking is intentionally absent — this is a minimal repro case.
 */
int main()
{
    HDC hdc;
    WNDCLASS wc;
    memset(&wc, 0, sizeof(wc));
    wc.style = CS_OWNDC;          /* private DC per window — required for OpenGL */
    wc.lpfnWndProc = WndProc;
    wc.lpszClassName = "gldemo";
    if (!RegisterClass(&wc)) return 0;
    HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
    hdc = GetDC(hwnd);
    /* Only dwFlags is filled in; everything else is zeroed. ChoosePixelFormat
     * still returns a usable format on the tested drivers. */
    const PIXELFORMATDESCRIPTOR pfd = {0,0, PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
    SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
    wglMakeCurrent(hdc, wglCreateContext(hdc));
    ShowWindow(hwnd, SW_SHOW);

    /* Load the modern-GL entry points by hand (no loader library used). */
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLuint) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLuint, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    GLuint (*glGetAttribLocation)(GLuint, GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)() = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");

    /* Pass-through vertex shader. */
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() { gl_Position = vec4(vertexPosition.xyz, 1.); }";

    /* GPU-heavy fragment shader: a long iterated hash keeps the GPU busy so
     * the CPU should have nothing to do but wait. */
    const char *g_fragCode =
        "#version 420\n"
        "void main() {"
        "float res = 0.5;"
        "for (int t=0;t<58000;t++) {" // tweak this to make sure you're outputting ~10 fps. 58000 is ok for a gtx 1070.
        "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
        "}"
        "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "}";

    GLuint prog = glCreateProgram();
    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);
    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);
    glLinkProgram(prog);
    glUseProgram(prog);
    GLuint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");

    /* Fullscreen quad as a 4-vertex triangle strip in clip space. */
    float verts[4*3] = {1.0f, 1.0f, 0.0f, -1.0f, 1.0f, 0.0f, 1.0f, -1.0f, 0.0f, -1.0f, -1.0f, 0.0f};
    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, 4*3*4, verts, GL_DYNAMIC_DRAW);
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 12, (void*)0);

    for (;;)
    {
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        //__asm__("int $3");
        glFinish(); // Halts here with 100% cpu on my system.
        SwapBuffers(hdc);

        /* Drain the message queue so the window stays responsive. */
        MSG msg; char done = 0;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE))
        {
            if (msg.message == WM_QUIT) done = 1;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (done) break;
    }
    return 0;
}
【问题讨论】:
尝试使用Sleep()
禁用垂直同步并限制FPS。
您确定 VSYNC 处于活动状态吗?某些驱动程序可以配置为覆盖应用程序的 VSYNC 设置。使用我的 Nvidia 驱动程序,在 Nvidia 控制面板的“管理 3D 设置”下,您可以将“垂直同步”设置为“使用 3D 应用程序设置”或强制将其设置为“开”或“关”,这将覆盖应用程序的设置。
@AndreasWenzel 我可以用微星加力来判断。 gpu 可以以 400 fps 的速度运行程序,所以如果 afterburner 说它只有 120,等于我的显示器的刷新率,那么它就处于活动状态。我完全使用了您描述的 nvidia 驱动程序方法来在测试时打开/关闭 vsync,因为它运行良好。
@tuket OP 似乎没有使用 GLFW。
@user2464424 我的印象是,某些驱动程序上的SwapBuffers()
使用繁忙的循环来等待正确的交换时机。您可以尝试估计它要等待的时间,然后在调用它之前睡眠稍微少一点(例如少 1 毫秒)。
【参考方案1】:
这不是答案,所以我不会将其标记为答案,但我现在可以提供一种解决方法。
这个想法是 glFlush
函数保证 gpu 已经开始处理,所以我们可以在纸面上,在 glFlush
和 glFinish
之间插入等待,以减少 glFinish 花在煎炸 CPU 上的时间。
这只有在等待函数的分辨率最差为 1ms 时才可行。如果您使用Sleep
,或Windows API 中具有可调整超时的任何其他函数,则可以通过在程序开始时调用timeBeginPeriod(1)
来设置。
为确保等待不会过冲,这会导致 gpu 空闲,用于在下一帧等待的公式是等待刚刚花费的时间加上在glFinish
上花费的剩余时间减去 0.5 毫秒。
所有这一切的结果是,根据任务管理器,gpu 保持在 100% 的使用率,但 cpu 使用率只有在 vanilla glFinish
花费的时间低于 4 毫秒时才开始显着增加;如果它低于大约 1.5 毫秒,该机制就会完全不再起作用。
在实际实现中,可能会考虑在定时区域内插入SwapBuffers
调用,以便在计算中包含可能存在的任何帧限制系统,并且可能使用Waitable Timers
而不是Sleep
。
编译:
C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0\main.c -o %~dp0\main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin\ -LC:\mingw64\lib\ -IC:\mingw64\include\ -lgdi32 -lopengl32 -lwinmm
更新代码:
#include <windows.h>
#include <stdio.h>
#include <GL/gl.h>
typedef signed long long int GLsizeiptr;
typedef char GLchar;
#define GL_ARRAY_BUFFER 0x8892
#define GL_DYNAMIC_DRAW 0x88E8
#define GL_FRAGMENT_SHADER 0x8B30
#define GL_VERTEX_SHADER 0x8B31
/* QueryPerformanceCounter frequency (ticks/second), set once in main(). */
LARGE_INTEGER Frequency;

/* Microseconds elapsed since StartingTime, measured with the high-resolution
 * performance counter.
 * NOTE(review): braces were stripped by the scraper; restored here. */
unsigned long long int elapsedMicroseconds(LARGE_INTEGER StartingTime)
{
    LARGE_INTEGER EndingTime;
    QueryPerformanceCounter(&EndingTime);
    /* Multiply before dividing to keep microsecond precision. */
    return (1000000*(EndingTime.QuadPart - StartingTime.QuadPart))/Frequency.QuadPart;
}
/* Minimal window procedure: forwards every message to the default handler.
 * NOTE(review): the scraped listing had its braces stripped; restored here. */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
{
    return DefWindowProc(hwnd, msg, wParam, lParam);
}
/*
 * Workaround version of the demo: instead of letting glFinish() busy-wait
 * the whole frame, it calls glFlush() (so the GPU has started working),
 * Sleep()s for an estimate of the frame's GPU time, and only then calls
 * glFinish() for the short remainder. timeBeginPeriod(1) gives Sleep() a
 * ~1 ms resolution; the estimate is backed off by 0.5 ms so the sleep
 * never overshoots and starves the GPU.
 *
 * NOTE(review): the scraped listing had every '{'/'}' stripped (including
 * inside the GLSL string literals); braces restored so the code compiles.
 */
int main()
{
    QueryPerformanceFrequency(&Frequency);
    timeBeginPeriod(1);           /* raise Sleep() resolution to ~1 ms */
    HDC hdc;
    WNDCLASS wc;
    memset(&wc, 0, sizeof(wc));
    wc.style = CS_OWNDC;          /* private DC per window — required for OpenGL */
    wc.lpfnWndProc = WndProc;
    wc.lpszClassName = "gldemo";
    if (!RegisterClass(&wc)) return 0;
    HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
    hdc = GetDC(hwnd);
    /* Only dwFlags is filled in; everything else is zeroed. */
    const PIXELFORMATDESCRIPTOR pfd = {0,0, PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
    SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
    wglMakeCurrent(hdc, wglCreateContext(hdc));
    ShowWindow(hwnd, SW_SHOW);

    /* Load the modern-GL entry points by hand (no loader library used). */
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLuint) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLuint, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    GLuint (*glGetAttribLocation)(GLuint, GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)() = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");

    /* Pass-through vertex shader. */
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() { gl_Position = vec4(vertexPosition.xyz, 1.); }";

    /* GPU-heavy fragment shader: a long iterated hash keeps the GPU busy. */
    const char *g_fragCode =
        "#version 420\n"
        "void main() {"
        "float res = 0.5;"
        "for (int t=0;t<58000;t++) {" // tweak this to make sure you're outputting ~10 fps. 58000 is ok for a gtx 1070.
        "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
        "}"
        "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "}";

    GLuint prog = glCreateProgram();
    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);
    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);
    glLinkProgram(prog);
    glUseProgram(prog);
    GLuint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");

    /* Fullscreen quad as a 4-vertex triangle strip in clip space. */
    float verts[4*3] = {1.0f, 1.0f, 0.0f, -1.0f, 1.0f, 0.0f, 1.0f, -1.0f, 0.0f, -1.0f, -1.0f, 0.0f};
    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, 4*3*4, verts, GL_DYNAMIC_DRAW);
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 12, (void*)0);

    LARGE_INTEGER syncer;
    long long int waitfor = 0;    /* microseconds to sleep next frame; <=0 disables the sleep */
    for (;;)
    {
        //__asm__("int $3");
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        glFlush();                /* guarantees the GPU has started before we sleep */
        QueryPerformanceCounter(&syncer);
        if (waitfor>0) Sleep(waitfor/1000);
        glFinish();               /* only the short remainder is busy-waited now */
        /* Next frame's sleep = (this frame's sleep + glFinish time) - 0.5 ms,
         * so the estimate tracks the GPU time without overshooting. */
        waitfor = elapsedMicroseconds(syncer)-500;
        SwapBuffers(hdc);

        /* Drain the message queue so the window stays responsive. */
        MSG msg; char done = FALSE;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE))
        {
            if (msg.message == WM_QUIT) done = TRUE;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (done) break;
    }
    return 0;
}
【讨论】:
以上是关于为啥这个微不足道的 opengl 程序使用 100% cpu?的主要内容,如果未能解决你的问题,请参考以下文章
为啥调用 glCreateShader 时这个 OpenGL 着色器分段错误?