为啥这个微不足道的 opengl 程序使用 100% cpu?

Posted

技术标签:

【中文标题】为啥这个微不足道的 opengl 程序使用 100% cpu?【英文标题】:Why this trivial opengl program uses 100% cpu?为什么这个微不足道的 opengl 程序使用 100% cpu? 【发布时间】:2021-04-21 13:41:51 【问题描述】:

以下 OpenGL 代码被特意设计成极度依赖 GPU,这会迫使 CPU 在 GPU 完成工作期间等待一段时间。具体来说,等待发生在 glFinish() 调用中:经测量,CPU 在该调用里等待的时间占每帧时间的 99.87%。该程序在我的系统(Windows 10,GTX 1070)上以 10 fps 的速度运行,并且禁用了垂直同步(Vsync)。这一切本来都在意料之中——只是 CPU 在本应等待的时候,却莫名其妙地占用了 100% 的 CPU 时间,导致过热。

在 6 台使用 Intel GPU、4 台使用 AMD GPU 和 5 台使用 NVIDIA GPU 的系统上测试后,只有使用 NVIDIA GPU 的系统存在该问题。到目前为止,我所能得出的结论是,这个问题是 NVIDIA 特有且 OpenGL 特有的。DirectX 应用程序不会出现该问题;事实上,在 Firefox 中禁用 ANGLE 后运行一个占满 GPU 的 WebGL 页面即可重现此问题(启用 ANGLE 时不会发生)。

我编译如下:

C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0\main.c -o %~dp0\main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin\ -LC:\mingw64\lib\ -IC:\mingw64\include\ -lgdi32 -lopengl32

最少的代码(我建议调整片段着色器,使您也达到 10 fps 左右,使问题更加明显):

#include <windows.h>
#include <GL/gl.h>

/* OpenGL types declared by hand; they are used in the function-pointer
 * prototypes in main() (GLsizeiptr for glBufferData, GLchar for
 * glGetAttribLocation). */
typedef signed long long int GLsizeiptr;
typedef char GLchar;

/* Enum values passed to glBindBuffer/glBufferData and glCreateShader in main(). */
#define GL_ARRAY_BUFFER     0x8892
#define GL_DYNAMIC_DRAW     0x88E8
#define GL_FRAGMENT_SHADER  0x8B30
#define GL_VERTEX_SHADER    0x8B31

/*
 * Window procedure for the demo window.
 *
 * Posts WM_QUIT when the window is destroyed so that the PeekMessage loop in
 * main(), which exits on WM_QUIT, can actually terminate. Every other message
 * is forwarded to DefWindowProc.
 */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam) {
    if (msg == WM_DESTROY) {
        PostQuitMessage(0);
        return 0;
    }
    return DefWindowProc(hwnd, msg, wParam, lParam);
}

int main() 
    HDC hdc;
    
        WNDCLASS wc;
        memset(&wc, 0, sizeof(wc));
        wc.style         = CS_OWNDC;
        wc.lpfnWndProc   = WndProc;
        wc.lpszClassName = "gldemo";
        if (!RegisterClass(&wc)) return 0;

        HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
        hdc = GetDC(hwnd);
        
        const PIXELFORMATDESCRIPTOR pfd = 0,0, PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
        SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
        wglMakeCurrent(hdc, wglCreateContext(hdc));
        
        ShowWindow(hwnd, SW_SHOW);
    
    
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLuint) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLuint, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    GLuint (*glGetAttribLocation)(GLuint, GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)() = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");
    
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() gl_Position = vec4(vertexPosition.xyz, 1.);";
        
    const char *g_fragCode =
        "#version 420\n"
        "void main() "
            "float res = 0.5;"
            "for (int t=0;t<58000;t++) " // tweak this to make sure you're outputting ~10 fps. 58000 is ok for a gtx 1070. 
                "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
            ""
            "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "";
    
    GLuint prog = glCreateProgram();
    
    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);
    
    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);
    
    glLinkProgram(prog);
    glUseProgram(prog);
    
    GLuint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");
    
    float verts[4*3] = 1.0f, 1.0f, 0.0f,  -1.0f, 1.0f, 0.0f,  1.0f, -1.0f, 0.0f,  -1.0f, -1.0f, 0.0f;
    
    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, 4*3*4, verts, GL_DYNAMIC_DRAW);
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 12, (void*)0);
    
    for (;;) 
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        //__asm__("int $3");
        glFinish(); // Halts here with 100% cpu on my system.
        SwapBuffers(hdc);
        
        MSG msg; char done = 0;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) 
            if (msg.message == WM_QUIT) done = 1;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        
        if (done) break;
    
    
    return 0;

【问题讨论】:

尝试禁用垂直同步,并使用 Sleep() 限制 FPS。 您确定 VSYNC 处于活动状态吗?某些驱动程序可以配置为覆盖应用程序的 VSYNC 设置。使用我的 Nvidia 驱动程序,在 Nvidia 控制面板的“管理 3D 设置”下,您可以将“垂直同步”设置为“使用 3D 应用程序设置”,或强制将其设置为“开”或“关”,后者会覆盖应用程序的设置。 @AndreasWenzel 我可以用 MSI Afterburner 来判断。GPU 可以以 400 fps 的速度运行程序,所以如果 Afterburner 显示只有 120 fps(等于我显示器的刷新率),那么垂直同步就处于活动状态。我在测试时正是用您描述的 Nvidia 驱动程序方法来打开/关闭 vsync 的,它运行良好。 @tuket OP 似乎没有使用 GLFW。 @user2464424 我的印象是,某些驱动程序上的 SwapBuffers() 使用忙等待循环来等待正确的交换时机。您可以尝试估计它要等待的时间,然后在调用它之前睡眠稍微短一点的时间(例如少 1 毫秒)。 【参考方案1】:

这不是答案,所以我不会将其标记为答案,但我现在可以提供一种解决方法。

思路是:glFlush 函数保证 GPU 已经开始处理,因此理论上我们可以在 glFlush 和 glFinish 之间插入一段等待,以减少 glFinish 花在空耗 CPU 上的时间。 只有当等待函数的精度至少达到 1 毫秒时,这才可行。如果您使用 Sleep,或 Windows API 中其他带可调超时的函数,可以通过在程序开始时调用 timeBeginPeriod(1) 来设置该精度。 为确保等待不会过头(那会导致 GPU 空闲),下一帧的等待时长按如下公式计算:本帧刚刚等待的时间,加上随后在 glFinish 上花费的剩余时间,再减去 0.5 毫秒。

所有这一切的结果是:根据任务管理器,GPU 使用率保持在 100%,而 CPU 使用率只有在原始(未加等待的)glFinish 耗时低于 4 毫秒时才开始显著上升;如果它低于大约 1.5 毫秒,该机制就会完全失效。 在实际实现中,可以考虑把 SwapBuffers 调用放进计时区域内,以便将可能存在的帧率限制机制纳入计算,并且可以用可等待计时器(Waitable Timers)代替 Sleep。

编译:

C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0\main.c -o %~dp0\main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin\ -LC:\mingw64\lib\ -IC:\mingw64\include\ -lgdi32 -lopengl32 -lwinmm

更新代码:

#include <windows.h>
#include <stdio.h>
#include <GL/gl.h>

/* OpenGL types declared by hand; they are used in the function-pointer
 * prototypes in main() (GLsizeiptr for glBufferData, GLchar for
 * glGetAttribLocation). */
typedef signed long long int GLsizeiptr;
typedef char GLchar;

/* Enum values passed to glBindBuffer/glBufferData and glCreateShader in main(). */
#define GL_ARRAY_BUFFER     0x8892
#define GL_DYNAMIC_DRAW     0x88E8
#define GL_FRAGMENT_SHADER  0x8B30
#define GL_VERTEX_SHADER    0x8B31

/* Performance-counter frequency (ticks per second); filled in by main()
 * via QueryPerformanceFrequency before the first call below. */
LARGE_INTEGER Frequency;

/*
 * Returns the time elapsed since StartingTime, in microseconds.
 *
 * StartingTime must be a value previously obtained from
 * QueryPerformanceCounter. The tick delta is scaled to microseconds before
 * dividing by Frequency so that sub-second precision is not lost.
 */
unsigned long long int elapsedMicroseconds(LARGE_INTEGER StartingTime) {
    LARGE_INTEGER EndingTime;
    QueryPerformanceCounter(&EndingTime);
    return (1000000*(EndingTime.QuadPart - StartingTime.QuadPart))/Frequency.QuadPart;
}

/*
 * Window procedure for the demo window.
 *
 * Posts WM_QUIT when the window is destroyed so that the PeekMessage loop in
 * main(), which exits on WM_QUIT, can actually terminate. Every other message
 * is forwarded to DefWindowProc.
 */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam) {
    if (msg == WM_DESTROY) {
        PostQuitMessage(0);
        return 0;
    }
    return DefWindowProc(hwnd, msg, wParam, lParam);
}

int main() 
    QueryPerformanceFrequency(&Frequency);
    timeBeginPeriod(1);
    
    HDC hdc;
    
        WNDCLASS wc;
        memset(&wc, 0, sizeof(wc));
        wc.style         = CS_OWNDC;
        wc.lpfnWndProc   = WndProc;
        wc.lpszClassName = "gldemo";
        if (!RegisterClass(&wc)) return 0;

        HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
        hdc = GetDC(hwnd);
        
        const PIXELFORMATDESCRIPTOR pfd = 0,0, PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;
        SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
        wglMakeCurrent(hdc, wglCreateContext(hdc));
        
        ShowWindow(hwnd, SW_SHOW);
    
    
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLuint) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLuint, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    GLuint (*glGetAttribLocation)(GLuint, GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)() = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");
    
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() gl_Position = vec4(vertexPosition.xyz, 1.);";
        
    const char *g_fragCode =
        "#version 420\n"
        "void main() "
            "float res = 0.5;"
            "for (int t=0;t<58000;t++) " // tweak this to make sure you're outputting ~10 fps. 58000 is ok for a gtx 1070. 
                "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
            ""
            "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "";
    
    GLuint prog = glCreateProgram();
    
    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);
    
    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);
    
    glLinkProgram(prog);
    glUseProgram(prog);
    
    GLuint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");
    
    float verts[4*3] = 1.0f, 1.0f, 0.0f,  -1.0f, 1.0f, 0.0f,  1.0f, -1.0f, 0.0f,  -1.0f, -1.0f, 0.0f;
    
    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, 4*3*4, verts, GL_DYNAMIC_DRAW);
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 12, (void*)0);
    
    LARGE_INTEGER syncer;
    long long int waitfor = 0;
    
    for (;;) 
        //__asm__("int $3");
        
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        glFlush();
        
        QueryPerformanceCounter(&syncer);
            if (waitfor>0) Sleep(waitfor/1000);
            glFinish();
        waitfor = elapsedMicroseconds(syncer)-500;
        
        SwapBuffers(hdc);
        
        MSG msg; char done = FALSE;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) 
            if (msg.message == WM_QUIT) done = TRUE;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        
        if (done) break;
    
    
    return 0;

【讨论】:

以上是关于为啥这个微不足道的 opengl 程序使用 100% cpu?的主要内容,如果未能解决你的问题,请参考以下文章

为啥这个 OpenGL ES 代码不会创建纹理?

为啥调用 glCreateShader 时这个 OpenGL 着色器分段错误?

超过 128MB 的纹理时出现 OpenGL“内存不足”错误

为啥这个在opengl中绘制三角形的着色器不会多次运行

确保释放 OpenGL 纹理内存

为啥这个 OpenGL ES 代码在 iPhone 上运行缓慢?