当我尝试增加矩阵大小时，在 AMD openCL/C 中实现矩阵向量乘法会导致系统死机

Posted 2023-02-16

技术标签:

【中文标题】当我尝试增加矩阵大小时，在 AMD openCL/C 中实现矩阵向量乘法会导致系统死机【英文标题】：Implementing matrix-vector multiplication in AMD openCL/C produces system freezes when I try to increase matrix size 【发布时间】：2012-11-13 14:51:43 【问题描述】：

对于大学的一个项目，我正在使用 AMD OpenCL 实现矩阵向量乘法。我使用的机器是运行 Ubuntu 12.04 的全新台式机，配备 Radeon HD 7970 和 AMD FX-4100 四核处理器。我 AMD APP 1.2 和 Radeon 的最新 ATI Catalyst 驱动程序。这是我正在尝试使用的内核。

__kernel void mvKernel(__global float* a, const __global float* x, __global float* y, int m, int n)

float sum = 0.0f;
 __global float* A;
int i;
int j = 0;
int indx = get_global_id(0);
__local float xs[2048];
for(i = get_local_id(0); i < n; i+= get_local_size(0)) 
    xs[i] = x[i];
 
mem_fence(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);
A = &a[indx];
for(i = 0; i < n; i++) 
    sum += xs[i] * A[j];
    j += m;

y[indx] = sum;

在 GPU 上针对 256 x 256 的矩阵大小运行此程序时，生成的结果是正确的，不会出现任何问题。但是，当我尝试增加作为命令行参数给出的矩阵大小时，系统将挂起，需要重新启动。但是，当我使用 AMD 的 CodeXL 调试器/分析器运行代码时，代码大部分时间都会运行，没有错误。这是我运行的主机代码

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>
#include <string.h>

char* readSource(const char* sourceFilename);

void randomInit(float* data, int size)

int i =0;
for(i; i < size; i++)
    data[i] = (rand()/(float)RAND_MAX) * 10;


void cpuMV (float* y, float* A, float* X, int M, int N)

for(int i = 0; i< M; i++) 
    double sum = 0;
    y[i] = 0;
    for(int k = 0; k < N; k++) 
        double a = A[i + k* M];
        double x = X[k];
        sum += a * x;
    
    y[i] = (float) sum;
 


int main( int argc, char ** argv) 
int M = atoi(argv[1]);//1024;
int N = atoi(argv[2]);//1024;
float *A, *x;
float *y;
A = (float *)malloc(sizeof(float) * M * N);
x = (float *)malloc(sizeof(float) * N);
y = (float *)malloc(sizeof(float) * M);
randomInit(A, M * N);
randomInit(x, N);
int wrong;
wrong = 0;  
cl_int err;
cl_uint numPlatforms;
cl_platform_id *platforms;

err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS) 
    printf("clGetPlatformIDs failed\n");
    exit(-1);


if(numPlatforms == 0) 
    printf("No platforms detected.\n");
    exit(-1);   

platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

clGetPlatformIDs(numPlatforms, platforms, NULL);

printf("%u platforms found\n", numPlatforms);
for(int i =0; i < numPlatforms; i++) 
    char buff[100];
    printf("Platform %u:\n", i);
    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buff), buff, NULL);
    printf("\tVendor: %s\n", buff);
    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(buff), buff, NULL);
    printf("\tName: %s\n", buff);
    if (err != CL_SUCCESS) 
        printf("clGetPlatformInfo failed\n");
        exit(-1);
    

printf("\n");

cl_uint numDevices = 0;
cl_device_id *devices;
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if(err != CL_SUCCESS) 
    printf("clGetDeviceIDs failed\n");
    exit(-1);

if (numDevices == 0)
    printf("No devices found\n");
    exit(-1);

devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
printf("%u devices found\n", numDevices);
for(int i =0; i < numDevices; i++) 
    char buff[100];
    printf("Device %u:\n", i);
    err = clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buff), buff, NULL);
    printf("\tVendor: %s\n", buff);
    err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buff), buff, NULL);
    printf("\tName: %s\n", buff);
    if (err != CL_SUCCESS) 
        printf("clGetDeviceInfo failed\n");
        exit(-1);
    

cl_context context;
context = clCreateContext(NULL, numDevices,devices, NULL, NULL, &err);
if(err != CL_SUCCESS)
    printf("clCreateContext failed\n");
    exit(-1);


cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &err);
if(err != CL_SUCCESS)  
    printf("clCreateCommandQueue failed\n");
    exit(-1);


cl_mem d_A, d_x;
cl_mem d_y;
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, M * N * sizeof(float), A, &err);
if (err != CL_SUCCESS) 
    printf("clCreateBuffer for A failed\n");
    exit(-1);

d_x = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,  N * sizeof(float), x, &err);
if (err != CL_SUCCESS) 
    printf("clCreateBuffer for x failed\n");
    exit(-1);

d_y = clCreateBuffer(context, CL_MEM_READ_WRITE, M * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) 
    printf("clCreateBuffer for y failed\n");
    exit(-1);

cl_program program;
char* source;
const char *sourceFile = "MVM_Kernel2.cl";
source = readSource(sourceFile);
program = clCreateProgramWithSource(context, 1, (const char**) &source, NULL, &err);
if (err != CL_SUCCESS) 
    printf("clCreateProgramFailed");
    exit(-1);

cl_int buildErr;
buildErr = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if (buildErr != CL_SUCCESS) 
    printf("Program failed to build,\n");
    cl_build_status buildStatus;
    for(int i = 0; i < numDevices; i++) 
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
        if(buildStatus == CL_SUCCESS) 
            continue;
        
        char *buildLog;
        size_t buildLogSize;
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize);
        buildLog = (char *)malloc(buildLogSize);
        clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,buildLogSize, buildLog, NULL);
        buildLog[buildLogSize -1] = '\0';
        printf("Device %u Build Log:\n%s\n", i, buildLog);
        free(buildLog);
    
    exit(0);

else 
    printf("No build errors\n");


cl_kernel kernel;
kernel = clCreateKernel(program, "mvKernel", &err);
if(err != CL_SUCCESS) 
    printf("clCreateKernel failed\n");
    exit(-1);

err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_A);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_x);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_y);
err |= clSetKernelArg(kernel, 3, sizeof(int), &M);
err |= clSetKernelArg(kernel, 4, sizeof(int), &N);

size_t globalWorkSize[1];
globalWorkSize[0] = M * N;
size_t localWorkSize[1];
localWorkSize[0] = 256;

err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clEnqueueReadBuffer(cmdQueue, d_y, CL_TRUE, 0, M * sizeof(float), y, 0, NULL, NULL);
clFlush(cmdQueue);
err = clFinish(cmdQueue);
if(err != CL_SUCCESS) 
    printf("ERROR!!");
    exit(-1);

clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(d_A);
clReleaseMemObject(d_x);
clReleaseMemObject(d_y);
clReleaseContext(context);
for(int i=0; i < (M <10 ? M : 10); i++)
    printf("vector y = %f\n", y[i]);
float* refY;
refY = (float*)malloc(M*sizeof(float));
cpuMV(refY, A, x, M, N);
for (int i = 0; i < M; ++i) 
    float diff = refY[i] - y[i];
    if (fabsf(diff)/ refY[i] > 1e-4)
        wrong++;

printf("There were %d errors!!\n", wrong);
free(A);
free(y);
free(x);
free(source);
free(platforms);    
free(devices);


char* readSource(const char *sourceFilename) 
FILE *fp;
int errs;
int size;
char *source;
fp = fopen(sourceFilename, "rb");
errs = fseek(fp, 0, SEEK_END);
if(errs != 0) 
    printf("Error seeking to end of file");
    exit(-1);

size = ftell(fp);
if(size<0) 
    printf("Errror getting file position");
    exit(-1);

errs = fseek(fp, 0, SEEK_SET);
if(errs != 0)
    printf("Error seeking to start of file\n");
    exit(-1);

source = (char*)malloc(size +1);
errs = fread(source, 1, size, fp);
if(errs != size) 
    printf("only read %d bytes\n", errs);
    exit(0);

source[size]= '\0';
return source;

最终这需要处理约 10000 阶的矩阵编辑我还在我的笔记本电脑上尝试了相同的代码，它有一个 Nvidia GT525m，该程序运行良好，适用于高达 352 * 352 的矩阵，任何更大的矩阵都将是零，但它不会崩溃。

【问题讨论】：

那么，超过 256*256 就崩溃了？还有 CPU 吗？超过 256*256 从命令行运行时会崩溃，但在 codeXL 调试器中通常可以正常工作。我还没有尝试在 CPU 上运行它。我想我已经解决了这个问题，我的 globalWorkSize 太大了，它应该是 M 而不是 M*N 因此我认为它崩溃了 【参考方案1】：

问题在于 globalWorkSize 太大 (M * N)，而它本应只有 M。这一定是 GPU 过载并导致系统冻结。我现在可以在 Nvidia 和 AMD GPU 以及 AMD CPU 上可靠运行代码

【讨论】：

以上是关于当我尝试增加矩阵大小时，在 AMD openCL/C 中实现矩阵向量乘法会导致系统死机的主要内容，如果未能解决你的问题，请参考以下文章