OpenCL 计算与顺序算法的输出不匹配

Posted 2023-02-22

技术标签:

【中文标题】OpenCL 计算与顺序算法的输出不匹配【英文标题】：OpenCL computation does not match output of sequential algorithm 【发布时间】：2014-10-27 22:40:40 【问题描述】：

我正在尝试在 OpenCL 中实现 LU 分解的简单版本。首先，我在 C++ 中实现了一个顺序版本，并构造了方法来验证我的结果（即乘法方法）。接下来，我在内核中实现了我的算法，并使用手动验证的输入（即 5x5 矩阵）对其进行了测试。这很好用。

但是，当我在大于 5x5 的随机生成矩阵上运行我的算法时，我得到了奇怪的结果。我已经清理了我的代码，手动检查了计算，但我无法弄清楚我的内核哪里出了问题。我开始认为这可能与floats 和计算的稳定性有关。我的意思是误差范围被传播并变得越来越大。我很清楚我可以交换行以获得最大的枢轴值等，但有时误差幅度会很大。在任何情况下，我都希望结果 - 尽管是错误的 - 与顺序算法相同。我需要一些帮助来确定我可能做错了什么。

我使用的是一维数组，因此使用二维矩阵寻址的过程如下：

A(row, col) = A[row * matrix_width + col].

关于结果，我可能会补充一点，我决定将 L 和 U 矩阵合并为一个。所以给定 L 和 U：

L:     U:
1 0 0  A B C
X 1 0  0 D E
Y Z 1  0 0 F

我将它们显示为：

A:
A B C
X D E
Y Z F

内核如下：

参数source是我要分解的原始矩阵。参数destin 是目的地。 matrix_size 是矩阵的总大小（对于 3x3 矩阵来说是 9），matrix_width 是宽度（对于 3x3 矩阵来说是 3）。

__kernel void matrix(
    __global float * source,
    __global float * destin,
    unsigned int   matrix_size,
    unsigned int   matrix_width
    )

    unsigned int index = get_global_id(0);
    int col_idx = index % matrix_width;
    int row_idx = index / matrix_width;

    if (index >= matrix_size)
        return;

    // First of all, copy our value to the destination.
    destin[index] = source[index];
    // Iterate over all the pivots.
    for(int piv_idx = 0; piv_idx < matrix_width; piv_idx++)
    
        // We have to be the row below the pivot row
        // And we have to be the column of the pivot
        // or right of that column.
        if(col_idx < piv_idx || row_idx <= piv_idx)
            return;
        // Calculate the divisor.
        float pivot_value       = destin[(piv_idx * matrix_width) + piv_idx];
        float below_pivot_value = destin[(row_idx * matrix_width) + piv_idx];
        float divisor           = below_pivot_value/ pivot_value;

        // Get the value in the pivot row on this column.
        float pivot_row_value = destin[(piv_idx * matrix_width) + col_idx];
        float current_value   = destin[index];
        destin[index]         = current_value - (pivot_row_value * divisor);


        // Write the divisor to the memory (we won't use these values anymore!)
        // if we are the value under the pivot.
        barrier(CLK_GLOBAL_MEM_FENCE);
        if(col_idx == piv_idx)
        
            int divisor_location = (row_idx * matrix_width) + piv_idx;
            destin[divisor_location] = divisor;
        
        barrier(CLK_GLOBAL_MEM_FENCE);

这是顺序版本：

// Decomposes a matrix into L and U but in the same matrix.
float * decompose(float* A, int matrix_width)

    int total_length = matrix_width*matrix_width;
    float *U = new float[total_length];

    for (int i = 0; i < total_length; i++)
    
        U[i] = A[i];
    
    for (int row = 0; row < matrix_width; row++)
    
        int pivot_idx = row;
        float pivot_val = U[pivot_idx * matrix_width + pivot_idx];

        for (int r = row + 1; r < matrix_width; r++)
        
            float below_pivot = U[r*matrix_width + pivot_idx];
            float divisor = below_pivot / pivot_val;

            for (int row_idx = pivot_idx; row_idx < matrix_width; row_idx++)
            
                float value = U[row * matrix_width + row_idx];
                U[r*matrix_width + row_idx] = U[r*matrix_width + row_idx] - (value * divisor);
            
            U[r * matrix_width + pivot_idx] = divisor;
        
    
    return U;

我得到的示例输出如下：

Workgroup size: 1
Array dimension: 6
Original unfactorized:
|     176.000000 |     133.000000 |     431.000000 |     839.000000 |     739.000000 |     450.000000 |
|     507.000000 |     718.000000 |     670.000000 |     753.000000 |     122.000000 |     941.000000 |
|     597.000000 |     449.000000 |     596.000000 |     742.000000 |     491.000000 |     212.000000 |
|     159.000000 |     944.000000 |     797.000000 |     717.000000 |     822.000000 |     219.000000 |
|     266.000000 |     755.000000 |      33.000000 |     231.000000 |     824.000000 |     785.000000 |
|     724.000000 |     408.000000 |     652.000000 |     863.000000 |     663.000000 |     113.000000 |
Sequential:
|     176.000000 |     133.000000 |     431.000000 |     839.000000 |     739.000000 |     450.000000 |
|       2.880682 |     334.869324 |    -571.573853 |   -1663.892090 |   -2006.823730 |    -355.306763 |
|       3.392045 |      -0.006397 |    -869.627747 |   -2114.569580 |   -2028.558716 |   -1316.693359 |
|       0.903409 |       2.460203 |      -2.085742 |    -357.893066 |     860.526367 |   -2059.689209 |
|       1.511364 |       1.654343 |      -0.376231 |      -2.570729 |    4476.049805 |   -5097.599121 |
|       4.113636 |      -0.415427 |       1.562076 |      -0.065806 |       0.003290 |      52.263515 |
Sequential multiplied matching with original?:
1
GPU:
|     176.000000 |     133.000000 |     431.000000 |     839.000000 |     739.000000 |     450.000000 |
|       2.880682 |     334.869293 |    -571.573914 |   -1663.892212 |   -2006.823975 |    -355.306885 |
|       3.392045 |      -0.006397 |    -869.627808 |   -2114.569580 |   -2028.558716 |   -1316.693359 |
|       0.903409 |       2.460203 |      -2.085742 |    -357.892578 |    5091.575684 |   -2059.688965 |
|       1.511364 |       1.654343 |      -0.376232 |      -2.570732 |   16116.155273 |   -5097.604980 |
|       4.113636 |      -0.415427 |      -0.737347 |       2.005755 |      -3.655331 |    -237.480438 |
GPU multiplied matching with original?:
Values differ: 5053.05 -- 822
0
Values differ: 5091.58 -- 860.526
Correct solution? 0

编辑

好的，我想我明白为什么它以前不起作用了。原因是我只在每个工作组上同步。当我使用等于矩阵中项目数的工作组大小来调用我的内核时，它总是正确的，因为这样障碍就会正常工作。但是，我决定采用 cmets 中提到的方法。将多个内核排入队列并等待每个内核完成，然后再启动下一个内核。然后，这将映射到矩阵每一行的迭代，并将其与枢轴元素相乘。这确保我不会修改或读取当时内核正在修改的元素。

同样，这适用于小矩阵。所以我认为我假设它只是同步是错误的。根据 Baiz 的要求，我在此处发布了调用内核的整个 main：

int main(int argc, char *argv[])

    try 
        if (argc != 5) 
            std::ostringstream oss;
            oss << "Usage: " << argv[0] << " <kernel_file> <kernel_name> <workgroup_size> <array width>";
            throw std::runtime_error(oss.str());
        
        // Read in arguments.
        std::string kernel_file(argv[1]);
        std::string kernel_name(argv[2]);
        unsigned int workgroup_size = atoi(argv[3]);
        unsigned int array_dimension = atoi(argv[4]);
        int total_matrix_length = array_dimension * array_dimension;
        // Print parameters
        std::cout << "Workgroup size: " << workgroup_size << std::endl;
        std::cout << "Array dimension: " << array_dimension << std::endl;

        // Create matrix to work on.
        // Create a random array.
        int matrix_width = sqrt(total_matrix_length);
        float* input_matrix = new float[total_matrix_length];
        input_matrix = randomMatrix(total_matrix_length);

        /// Debugging
        //float* input_matrix = new float[9];
        //int matrix_width = 3;
        //total_matrix_length = matrix_width * matrix_width;
        //input_matrix[0] = 10; input_matrix[1] = -7; input_matrix[2] = 0;
        //input_matrix[3] = -3; input_matrix[4] =  2; input_matrix[5] = 6;
        //input_matrix[6] =  5; input_matrix[7] = -1; input_matrix[8] = 5;

        // Allocate memory on the host and populate source
        float *gpu_result = new float[total_matrix_length];


        // OpenCL initialization
        std::vector<cl::Platform> platforms;
        std::vector<cl::Device> devices;
        cl::Platform::get(&platforms);
        platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
        cl::Context context(devices);
        cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE);

        // Load the kernel source.
        std::string file_text;
        std::ifstream file_stream(kernel_file.c_str());
        if (!file_stream) 
            std::ostringstream oss;
            oss << "There is no file called " << kernel_file;
            throw std::runtime_error(oss.str());
        
        file_text.assign(std::istreambuf_iterator<char>(file_stream), std::istreambuf_iterator<char>());

        // Compile the kernel source.
        std::string source_code = file_text;
        std::pair<const char *, size_t> source(source_code.c_str(), source_code.size());
        cl::Program::Sources sources;
        sources.push_back(source);
        cl::Program program(context, sources);
        try 
            program.build(devices);
        
        catch (cl::Error& e) 
            std::string msg;
            program.getBuildInfo<std::string>(devices[0], CL_PROGRAM_BUILD_LOG, &msg);
            std::cerr << "Your kernel failed to compile" << std::endl;
            std::cerr << "-----------------------------" << std::endl;
            std::cerr << msg;
            throw(e);
        

        // Allocate memory on the device
        cl::Buffer source_buf(context, CL_MEM_READ_ONLY, total_matrix_length*sizeof(float));
        cl::Buffer dest_buf(context, CL_MEM_WRITE_ONLY, total_matrix_length*sizeof(float));

        // Create the actual kernel.
        cl::Kernel kernel(program, kernel_name.c_str());

        // transfer source data from the host to the device
        queue.enqueueWriteBuffer(source_buf, CL_TRUE, 0, total_matrix_length*sizeof(float), input_matrix);

        for (int pivot_idx = 0; pivot_idx < matrix_width; pivot_idx++)
        
            // set the kernel arguments
            kernel.setArg<cl::Memory>(0, source_buf);
            kernel.setArg<cl::Memory>(1, dest_buf);
            kernel.setArg<cl_uint>(2, total_matrix_length);
            kernel.setArg<cl_uint>(3, matrix_width);
            kernel.setArg<cl_int>(4, pivot_idx);

            // execute the code on the device
            std::cout << "Enqueueing new kernel for " << pivot_idx << std::endl;
            cl::Event evt;
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(total_matrix_length), cl::NDRange(workgroup_size), 0, &evt);
            evt.wait();
            std::cout << "Iteration " << pivot_idx << " done" << std::endl;
        

        // transfer destination data from the device to the host
        queue.enqueueReadBuffer(dest_buf, CL_TRUE, 0, total_matrix_length*sizeof(float), gpu_result);

        // Calculate sequentially.
        float* sequential = decompose(input_matrix, matrix_width);

        // Print out the results.
        std::cout << "Sequential:\n";
        printMatrix(total_matrix_length, sequential);

        // Print out the results.
        std::cout << "GPU:\n";
        printMatrix(total_matrix_length, gpu_result);

        std::cout << "Correct solution? " << equalMatrices(gpu_result, sequential, total_matrix_length);


        // compute the data throughput in GB/s
        //float throughput = (2.0*total_matrix_length*sizeof(float)) / t; // t is in nano seconds
        //std::cout << "Achieved throughput: " << throughput << std::endl;

        // Cleanup
        // Deallocate memory
        delete[] gpu_result;
        delete[] input_matrix;
        delete[] sequential;
        return 0;
    
    catch (cl::Error& e) 
        std::cerr << e.what() << ": " << jc::readable_status(e.err());
        return 3;
    
    catch (std::exception& e) 
        std::cerr << e.what() << std::endl;
        return 2;
    
    catch (...) 
        std::cerr << "Unexpected error. Aborting!\n" << std::endl;
        return 1;

【问题讨论】：

我添加了一个可能的解决方案。如果可行，请告诉我。 【参考方案1】：

正如 maZZZu 所述，由于工作项的并行执行，您无法确定数组中的元素是否已被读取/写入。这可以使用

来确保

CLK_LOCAL_MEM_FENCE/CLK_GLOBAL_MEM_FENCE

但是，这些机制仅适用于同一工作组内的线程。无法同步来自不同工作组的工作项。

您的问题很可能是：

您将多个工作组用于最可能只能由单个工作组执行的算法你没有使用足够的障碍

如果您已经只使用一个工作组，请尝试添加一个

屏障（CLK_GLOBAL_MEM_FENCE）；

到您从/到目的地读/写的所有部分。

你应该重构你的算法：

只有一个工作组在您的矩阵上执行算法使用本地内存以获得更好的性能（因为您重复访问元素）在各处使用屏障。如果算法有效，您可以在解决后开始删除它们，哪些是您不需要的。

您能否发布您的内核调用和工作大小？

编辑：

根据您的算法，我想出了这段代码。我还没有测试过它，我怀疑它会立即起作用。但它应该可以帮助您理解如何并行化顺序算法。它将仅通过一次内核启动来分解矩阵。

一些限制：

此代码仅适用于单个工作组。它仅适用于大小不超过您的最大本地工作组大小（可能在 256 和 1024 之间）的矩阵。如果你想改变它，你应该重构算法，只使用与矩阵宽度一样多的工作项。

只需将它们调整为您的 kernel.setArg(...) 代码

int nbElements = width*height;
clSetKernelArg (kernel, 0, sizeof(A), &A);
clSetKernelArg (kernel, 1, sizeof(U), &U);
clSetKernelArg (kernel, 2, sizeof(float) * widthMat * heightMat, NULL); // Local memory
clSetKernelArg (kernel, 3, sizeof(int), &width);
clSetKernelArg (kernel, 4, sizeof(int), &height);
clSetKernelArg (kernel, 5, sizeof(int), &nbElements);

内核代码：

inline int indexFrom2d(const int u, const int v, const int width)

    return width*v + u;


kernel void decompose(global float* A, 
                      global float* U,
                      local float* localBuffer, 
                      const int widthMat,
                      const int heightMat,
                      const int nbElements)

    int gidx = get_global_id(0);
    int col = gidx%widthMat;
    int row = gidx/widthMat;

    if(gidx >= nbElements)
        return;

    // Copy from global to local memory
    localBuffer[gidx] = A[gidx];

    // Sync copy process
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int rowOuter = 0; rowOuter < widthMat; ++rowOuter)
    
        int pivotIdx = rowOuter;

        float pivotValue = localBuffer[indexFrom2d(pivotIdx, pivotIdx, widthMat)];

        // Data for all work items in the row
        float belowPrivot = localBuffer[indexFrom2d(pivotIdx, row, widthMat)];          
        float divisor = belowPrivot / pivotValue;

        float value = localBuffer[indexFrom2d(col, rowOuter, widthMat)];

        // Only work items below pivot and from pivot to the right
        if( widthMat > col >= pivotIdx &&
            heightMat > row >= pivotIdx + 1)
        
            localBuffer[indexFrom2d(col, row, widthMat)] = localBuffer[indexFrom2d(col, row, widthMat)] - (value * divisor);

            if(col == pivotIdx)
                localBuffer[indexFrom2d(pivotIdx, row, widthMat)] = divisor;
        
        barrier(CLK_LOCAL_MEM_FENCE);
    

    // Write back to global memory
    U[gidx] = localBuffer[gidx];

【讨论】：

要全局同步（跨所有工作项）使用多个内核。【参考方案2】：

错误太大了，不可能由浮点运算引起。

如果对您的算法没有更深入的了解，我会说问题在于您使用的是目标缓冲区中的值。使用顺序代码这很好，因为您知道那里有什么值。但是对于 OpenCL，内核是并行执行的。所以你无法判断另一个内核是否已经将它的值存储到目标缓冲区。

【讨论】：

以上是关于OpenCL 计算与顺序算法的输出不匹配的主要内容，如果未能解决你的问题，请参考以下文章