Implementing Matrix Addition and Multiplication in SYCL
Posted: 2019-09-10 11:08:14

Question: I am trying to implement matrix addition and multiplication in SYCL in a single program, but the addition part fails with the error "no viable overloaded operator [] for type 'const ...". I do not know what causes this error; it would be a great help if someone could tell me the reason. Thanks.
I have implemented the addition and the multiplication separately and each works on its own. I suspect the cause is that I use a template for the multiplication, and that is causing problems for the addition part.
#include <CL/sycl.hpp>
#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>
using namespace cl::sycl;
class mxm_kernel;
void display_matrix(float* m, int matSize) {
  if (matSize > 16) {
    return;
  }
  std::cout << "=======" << std::endl;
  for (int i = 0; i < matSize; i++) {
    for (int j = 0; j < matSize; j++) {
      std::cout << m[i * matSize + j] << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "=======" << std::endl;
}
inline int prevPowerOfTwo(int x) {
  if (x < 0) {
    return 0;
  }
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x - (x >> 1);
}
inline bool isPowerOfTwo(int x) {
  return (x & (x - 1)) == 0;
}
template <typename T>
bool local_mxm(cl::sycl::queue& q, T* MA, T* MB, T* MC, T* MD, int matSize) {
  // Make sure it is power of two before running
  if (!isPowerOfTwo(matSize)) {
    std::cout << " This example only works with power of two sizes "
              << std::endl;
    return true;
  }
  auto device = q.get_device();
  auto maxBlockSize =
      device.get_info<cl::sycl::info::device::max_work_group_size>();
  auto blockSize = prevPowerOfTwo(std::sqrt(maxBlockSize));
  std::cout << " The Device Max Work Group Size is : " << maxBlockSize
            << std::endl;
  std::cout << " The order is : " << matSize << std::endl;
  std::cout << " The blockSize is : " << blockSize << std::endl;
  // Make sure the block size is not larger than the mat size
  blockSize = std::min(matSize, blockSize);

  range<1> dimensions(matSize * matSize);
  const property_list props = {property::buffer::use_host_ptr()};
  buffer<T> bA(MA, dimensions, props);
  buffer<T> bB(MB, dimensions, props);
  buffer<T> bC(MC, dimensions, props);
  buffer<T> bD(MD, dimensions, props);
  q.submit([&](handler& cgh) {
    auto pA = bA.template get_access<access::mode::read>(cgh);
    auto pB = bB.template get_access<access::mode::read>(cgh);
    auto pC = bC.template get_access<access::mode::write>(cgh);
    auto pD = bD.template get_access<access::mode::write>(cgh);
    auto localRange = range<1>(blockSize * blockSize);

    accessor<T, 1, access::mode::read_write, access::target::local> pBA(
        localRange, cgh);
    accessor<T, 1, access::mode::read_write, access::target::local> pBB(
        localRange, cgh);

    cgh.parallel_for<class matrix_add>(
        range<2>{matSize, matSize},
        [=](id<2> it) { pD[it] = pA[it] + pB[it]; });

    cgh.parallel_for<mxm_kernel>(
        nd_range<2>{range<2>(matSize, matSize),
                    range<2>(blockSize, blockSize)},
        [=](nd_item<2> it) {
          // Current block
          int blockX = it.get_group(0);
          int blockY = it.get_group(1);
          // Current local item
          int localX = it.get_local_id(0);
          int localY = it.get_local_id(1);
          // Start in the A matrix
          int a_start = matSize * blockSize * blockY;
          // End in the b matrix
          int a_end = a_start + matSize - 1;
          // Start in the b matrix
          int b_start = blockSize * blockX;
          // Result for the current C(i,j) element
          T tmp = 0.0f;
          // We go through all a, b blocks
          for (int a = a_start, b = b_start; a <= a_end;
               a += blockSize, b += (blockSize * matSize)) {
            // Copy the values in shared memory collectively
            pBA[localY * blockSize + localX] =
                pA[a + matSize * localY + localX];
            // Note the swap of X/Y to maintain contiguous access
            pBB[localX * blockSize + localY] =
                pB[b + matSize * localY + localX];
            it.barrier(access::fence_space::local_space);
            // Now each thread adds the value of its sum
            for (int k = 0; k < blockSize; k++) {
              tmp +=
                  pBA[localY * blockSize + k] * pBB[localX * blockSize + k];
            }
            // The barrier ensures that all threads have written to local
            // memory before continuing
            it.barrier(access::fence_space::local_space);
          }
          auto elemIndex = it.get_global_id(1) * it.get_global_range()[0] +
                           it.get_global_id(0);
          // Each thread updates its position
          pC[elemIndex] = tmp;
        });
  });
  return false;
}
int main(int argc, char* argv[]) {
  float* MA;
  float* MB;
  float* MC;
  float* MD;
  bool sycl = true;
  bool error = false;
  int matSize = 4;

  MA = new float[matSize * matSize];
  MB = new float[matSize * matSize];
  MC = new float[matSize * matSize];
  MD = new float[matSize * matSize];

  std::cout << " Input matrix " << std::endl;
  display_matrix(MA, matSize);
  display_matrix(MB, matSize);
  display_matrix(MC, matSize);
  display_matrix(MD, matSize);

  if (sycl) {
    std::cout << " ***** SYCL " << std::endl;
    // MatrixC initialization
    std::cout << "MATRIX D" << std::endl;
    for (int i = 0; i < matSize; i++) {
      for (int j = 0; j < matSize; j++) {
        MD[i * matSize + j] = 0.0f;  // i * matSize + j;
        std::cout << MD[i * matSize + j] << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "=======" << std::endl;
    // MatrixC initialization
    std::cout << "MATRIX C" << std::endl;
    for (int i = 0; i < matSize; i++) {
      for (int j = 0; j < matSize; j++) {
        MC[i * matSize + j] = 0.0f;  // i * matSize + j;
        std::cout << MC[i * matSize + j] << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "=======" << std::endl;
    // MatrixA initialization
    std::cout << "MATRIX A" << std::endl;
    for (int i = 0; i < matSize; i++) {
      for (int j = 0; j < matSize; j++) {
        MA[i * matSize + j] = 0.0f + j;  // i * matSize + j;
        std::cout << MA[i * matSize + j] << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "=======" << std::endl;
    // MatrixB initialization
    std::cout << "MATRIX B" << std::endl;
    for (int i = 0; i < matSize; i++) {
      for (int j = 0; j < matSize; j++) {
        MB[i * matSize + j] = 0.0f + j;  // i * matSize + j;
        std::cout << MB[i * matSize + j] << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "=======" << std::endl;

    cpu_selector device_selector;
    queue q(device_selector);

    auto start = std::chrono::steady_clock::now();
    error = local_mxm(q, MA, MB, MC, MD, matSize);
    q.wait_and_throw();
    auto end = std::chrono::steady_clock::now();
    auto time =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
            .count();
    std::cout << "SYCL: ";
    std::cout << "Time: " << time << std::endl;
    float flops =
        (2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
    std::cout << "GFLOPs: " << flops << std::endl;
    std::cout << " Output " << std::endl;
    display_matrix(MC, matSize);
    display_matrix(MD, matSize);
  }

  delete[] MA;
  delete[] MB;
  delete[] MC;
  return error ? 1 : 0;
}
Answer 1:

The problem here is that both of your parallel_for invocations in the "combined" function live inside a single command group submission, which SYCL does not allow. A command group scope (queue::submit(command_group_handler)) handles only one kernel in its body. You therefore have to split your parallel_fors into two separate queue submissions. And to make sure what I am saying is clear and matches your intent: yes, you can keep both of them in the same function scope, i.e.:
template <typename T>
bool local_mxm(cl::sycl::queue& q, T* MA, T* MB, T* MC, T* MD, int matSize) {
  ...
  // Command group submission for the matrix addition kernel
  ...
  // Command group submission for the matrix multiplication kernel
  ...
}
Matrix addition
Here is how the command group submission for your matrix_add kernel should look:
q.submit([&](handler& cgh) {
  auto pA = bA.template get_access<access::mode::read>(cgh);
  auto pB = bB.template get_access<access::mode::read>(cgh);
  // You don't need an accessor (pC) to the buffer (bC) anymore
  auto pD = bD.template get_access<access::mode::write>(cgh);

  cgh.parallel_for<matrix_add>(
      range<1>{static_cast<size_t>(matSize * matSize)},
      [=](id<1> it) { pD[it] = pA[it] + pB[it]; });
});
Now, notice how I dropped the class keyword in front of matrix_add when passing it as the template argument to parallel_for. This is because (as of SYCL 1.2.1) you need to forward-declare your kernel classes, or define them as function objects if you prefer, rather than declaring them on the fly at the call site. The compiler also requires kernel names to be unique, so if you are going to invoke these kernels more than once in your program, the latter approach is recommended.
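For illustration, a minimal sketch (not part of the original code) of what the forward declarations could look like at namespace scope; the names are assumed to match the kernels used in the submissions above:

// Forward-declare the kernel names once, at namespace scope, so they can be
// passed to parallel_for without the `class` keyword at the point of use.
class matrix_add;
class mxm_kernel;

// ... later, inside a command group:
// cgh.parallel_for<matrix_add>(...);
// cgh.parallel_for<mxm_kernel>(...);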
Next up is the mixed-up range, or index space (in OpenCL, the NDRange refers to the index space of the input data of a data-parallel application).
In the matrix multiplication code (which presumably comes from our sample in the computecpp-sdk), the reason the index space uses nd_range<2> and nd_item<2> (which carry information about work-item/work-group ids and sizes) is that this enables the blocked (or tiled) matrix multiplication algorithm, even though all of the accessors are one-dimensional. The correct locations are obtained through some tricky pointer arithmetic, but in any case you should never do the following:
// !kinda pseudo-code, so don't take it literally
accessor<T, 1> acc(...);
parallel_for<kernel>(range<2>{size, size}, [=](id<2> i) {
  acc[i] = i;
});
What you would normally do instead is the following:
// !kinda pseudo-code, so don't take it literally
accessor<T, 1> acc(...);
parallel_for<kernel>(range<1>{size * size}, [=](id<1> i) {
  acc[i] = i;
});
This is because, in the matrix addition case, you want to address the underlying pointer directly with a linear id. If your input arrays were two-dimensional, you would do the opposite: range<2>{size, size}, [=](id<2> i).
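If you do want to launch a two-dimensional range over a one-dimensional accessor, one option (a sketch of mine in the same pseudo-code style, with assumed names acc and size) is to linearize the index inside the kernel:

// !kinda pseudo-code again, added for illustration
accessor<T, 1> acc(...);
parallel_for<kernel>(range<2>{size, size}, [=](id<2> i) {
  // Assumes row-major storage: the first dimension indexes the row.
  acc[i[0] * size + i[1]] = i[0] * size + i[1];
});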
Matrix multiplication
The command group submission for the matrix multiplication kernel stays the same as in your original code.
q.submit([&](handler& cgh) {
  auto pA = bA.template get_access<access::mode::read>(cgh);
  auto pB = bB.template get_access<access::mode::read>(cgh);
  auto pC = bC.template get_access<access::mode::write>(cgh);
  auto pD = bD.template get_access<access::mode::write>(cgh);
  auto localRange = range<1>(blockSize * blockSize);

  accessor<T, 1, access::mode::read_write, access::target::local> pBA(
      localRange, cgh);
  accessor<T, 1, access::mode::read_write, access::target::local> pBB(
      localRange, cgh);

  cgh.parallel_for<mxm_kernel>(
      nd_range<2>{range<2>(static_cast<size_t>(matSize),
                           static_cast<size_t>(matSize)),
                  range<2>(blockSize, blockSize)},
      [=](nd_item<2> it) {
        ...
      });
});
Try reading through the blocked matrix multiplication code, and maybe look up some resources that explain the algorithm. It is a fairly tricky one, but I imagine there are some decent explanations out there (probably in CUDA learning material).
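While studying the tiled kernel it can also help to compare its output against a plain reference multiplication on the host. This is just a sketch I am adding (not part of the original answer), using the same row-major layout as the code above:

// Naive O(n^3) host reference: C = A * B, all matrices matSize x matSize, row-major.
void reference_mxm(const float* A, const float* B, float* C, int matSize) {
  for (int i = 0; i < matSize; i++) {
    for (int j = 0; j < matSize; j++) {
      float sum = 0.0f;
      for (int k = 0; k < matSize; k++) {
        sum += A[i * matSize + k] * B[k * matSize + j];
      }
      C[i * matSize + j] = sum;
    }
  }
}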
Hope this helps!
Discussion:
Thank you so much for explaining it in such detail, it really helps. I had already found the problem of using a single command group for both kernels and corrected it, but I messed up their ranges and missed the point about the class keyword that you mentioned. I will make these changes. Thanks.
Instead of low-level constructs such as barriers, you could use hierarchical parallelism.
@RonanKeryell is right here. Hierarchical parallelism is possible in SYCL and it offers a cleaner way of expressing the kernel structure that is easier to understand. Synchronization primitives such as work-group barriers are placed implicitly for you.
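To illustrate that last comment, here is a minimal sketch (my own, not from the thread) of what the simple addition could look like with the SYCL 1.2.1 hierarchical API; the buffer names, kernel name, and the choice of one work-group per row are assumptions made for the example:

// Hierarchical parallelism: parallel_for_work_group / parallel_for_work_item.
// The runtime inserts any required work-group synchronization implicitly.
q.submit([&](handler& cgh) {
  auto pA = bA.template get_access<access::mode::read>(cgh);
  auto pB = bB.template get_access<access::mode::read>(cgh);
  auto pD = bD.template get_access<access::mode::write>(cgh);
  const size_t n = static_cast<size_t>(matSize);
  cgh.parallel_for_work_group<class hier_add>(
      range<1>{n}, range<1>{n}, [=](group<1> grp) {
        grp.parallel_for_work_item([&](h_item<1> item) {
          // Each logical work-item adds one element of the n*n arrays.
          size_t idx = item.get_global_id(0);
          pD[idx] = pA[idx] + pB[idx];
        });
      });
});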