SYCL 内核中的分段错误

Posted 2023-03-10

技术标签:

【中文标题】SYCL 内核中的分段错误【英文标题】：Segmentation Fault in a SYCL Kernel 【发布时间】：2021-06-25 08:20:36 【问题描述】：

我一直在尝试在 SYCL 中实现简单的矩阵乘法，但是一旦内核启动，我总是会遇到分段错误。我的代码如下-

class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;
typedef cl::sycl::buffer<float, 1> sycl_buffer;


void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
                               queue deviceQueue, int numThreads)

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout<<"Starting Matrix Multiplication"<<std::endl;
    nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(M / numThreads + 1, K / numThreads + 1),
                                           cl::sycl::range<2>(numThreads, numThreads));

    deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh)

        auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
        auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
        auto result_accessor = result.get_access<access::mode::read_write>(cgh);


        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem)

            auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
            auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);

            if(row_index < M && column_index < K)
                float sum = 0.0f;
                for (int i = 0; i < N; i++) 
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[ i * N + column_index];
                
                result_accessor[K * row_index + column_index] = sum;
            
        );
    );
    deviceQueue.wait();
    std::cout<<"Done with Matmul"<<std::endl;

 

int main() 

    size_t M  = 512;
    size_t N = 512;
    size_t K = 512;

    auto matA = (float*) malloc(M * N * sizeof(float ));
    auto matB = (float*) malloc(N * K * sizeof(float ));
    auto result =  (float*) malloc(M * K * sizeof(float ));

    for (int i=0; i< M*N; i++)
         matA[i] = 2.0f;
    for (int i=0; i< N*K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M*K; ++i)
        result[i] = 69.0f;

    queue Queue;

    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
    std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;
    auto thread_max  = int(std::sqrt(max_work_group_size));
    std::cout<<thread_max<<std::endl;


    buffer<float, 1> mata_buffer(matA, range<1>(M * N * sizeof(float )));
    buffer<float, 1> matb_buffer(matB, range<1>(N * K * sizeof(float )));
    buffer<float, 1> result_buffer(result, range<1>(M * K * sizeof(float )));

    auto mata_shared = std::make_shared<buffer<float, 1>>(mata_buffer);
    auto matb_shared = std::make_shared<buffer<float, 1>>(matb_buffer);
    auto result_shared = std::make_shared<buffer<float, 1>>(result_buffer);

    naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);

    Queue.submit([result_shared, result](handler& cgh)
       auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
       cgh.copy(resultAccessor, result);
    );
    Queue.wait();

    std::cout<<"Here";

    for(int i=0; i<100; i++)
        std::cout<<result[i]<<"  ";
    std::cout<<std::endl;

输出如下-

Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
Segmentation fault (core dumped)

我无法弄清楚分段错误的根源。任何帮助表示赞赏。

提前致谢

编辑-传递-g作为编译器标志来获取调试符号，输出如下-

Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Aborted (core dumped)

并在 GDB 下运行它 - 这是输出

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./computecpp_test...
(gdb) r
Starting program: /home/atharva/CLionProjects/computecpp_test/computecpp_test 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7066700 (LWP 18128)]
[New Thread 0x7ffff62e5700 (LWP 18133)]
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'

Thread 1 "computecpp_test" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb)

这是我的 CMake，仅供参考，以便您了解正在传递的编译器标志

cmake_minimum_required(VERSION 3.17)
project(computecpp_test)

set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_CXX_FLAGS -g)

set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
#include(FindComputeCpp)
find_package(ComputeCpp)

include_directories($(COMPUTECPP_INCLUDE_DIRECTORY))

add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)

更新 - 在调试期间，我将所有索引都更改为 0，但仍然抛出分段错误（如果使用 -g 编译器标志，则出现无效对象错误），这让我相信数据访问不是问题但还有别的。

回溯如下-

#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007ffff73c8859 in __GI_abort () at abort.c:79
#2  0x00007ffff779d911 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3  0x00007ffff77a938c in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00007ffff77a93f7 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#5  0x00007ffff77a96a9 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#6  0x00007ffff7c63d61 in void cl::sycl::detail::handle_sycl_log<cl::sycl::invalid_object_error>(std::unique_ptr<cl::sycl::detail::sycl_log, std::default_delete<cl::sycl::detail::sycl_log> >&&) ()
   from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#7  0x00007ffff7c5d0bd in cl::sycl::detail::trigger_sycl_log(cl::sycl::log_type, char const*, int, int, cl::sycl::detail::cpp_error_code, cl::sycl::detail::context const*, char const*) ()
   from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#8  0x000000000040ab25 in cl::sycl::program::create_program_for_kernel<naive_MatMul_kernel> (c=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/program.h:510
#9  0x000000000040552b in cl::sycl::handler::parallel_for_impl<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1>(cl::sycl::detail::nd_range_base const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1 const&, int) (this=0x6b1d40, ndRange=..., functor=..., dimensions=2)
    at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:423
#10 0x0000000000405485 in cl::sycl::handler::parallel_for<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1, 2>(cl::sycl::nd_range<2> const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1 const&) (this=0x6b1d40, ndRange=..., functor=...)
    at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:471
#11 0x000000000040536e in naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const (
    this=0x7fffffffd500, cgh=...) at main.cpp:49
#12 0x000000000040518f in cl::sycl::detail::command_group::submit_handler<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0, std::shared_ptr<cl::sycl::detail::queue> const&, cl::sycl::detail::standard_handler_tag) (this=0x7fffffffd738, cgf=..., fallbackQueue=std::shared_ptr<class cl::sycl::detail::queue> (empty) = ...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/command_group.h:179
#13 0x000000000040391f in cl::sycl::queue::submit<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0) (this=0x7fffffffdaa8, cgf=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/queue.h:519
#14 0x00000000004037bb in naiveMatrixMultiplication (MatA=..., MatB=..., result=..., M=512, N=512, K=512, deviceQueue=..., numThreads=16) at main.cpp:42
#15 0x0000000000404adb in main () at main.cpp:220

基本上它在 program.h 文件中的此代码处停止

      COMPUTECPP_CL_ERROR_CODE_MSG(
          CL_SUCCESS, detail::cpp_error_code::KERNEL_NOT_FOUND_ERROR,
          c.get_impl().get(),
          "Unable to retrieve kernel function, is integration header included?")

显然，它无法检索内核函数。

【问题讨论】：

我建议使用 gdb 调试器运行程序并获取回溯。这应该会给你一些关于分段错误发生在哪里的提示。你能从中添加信息吗？感谢您的回复@RodBurns。我确实在我这边的 gdb 下运行了该程序，但它并没有真正的帮助。我正在使用 Codeplay 的编译器，您能否推荐正确的标志以在 gdb 下运行它，我怀疑这就是为什么 GDB 在第一次尝试中并没有真正帮助的原因我已按照建议添加了 GDB 输出。有趣的是，在将 -g 作为编译器标志传递后，错误从分段错误变为无效对象错误，并且 gdb 表示缺少某些文件可能有助于使用 bt 命令获取回溯。另外你的环境是什么？例如。 Linux 或 Windows。处于调试模式的 Windows 目前使用 ComputeCpp 失败 【参考方案1】：

以下是我在您的代码中发现的一些问题：

以下内容：

 Queue.submit([result_shared, result](handler& cgh)
   auto resultAccessor = result_shared->get_access<access::mode::read(cgh);
   cgh.copy(resultAccessor, result);
 );
 Queue.wait();

没用，因为sycl::buffers 旨在为您进行同步。一旦缓冲区被销毁，您就可以保证将内存复制回主机（否则我相信它处于未定义状态）。

buffer<float, 1>

只写：

buffer<float, 1> mata_buffer(matA, range<1>(M * N));
buffer<float, 1> matb_buffer(matB, range<1>(N * K));
buffer<float, 1> result_buffer(result, range<1>(M * K));

原来您从queue Queue; 获得的默认队列不一定是主机设备。在某些实现中，此行为允许使用环境变量更改您正在运行的设备。在我的实现中，queue Queue; 返回给我一个 GPU，而您的原始代码失败（因为它需要执行上述复制）。但是，当使用queue Queuehost_selector; 在主机设备上运行时，由于我正在运行的 SYCL 实现无法执行从主机到主机的 memcpy，因此无法正常工作。

您正在使用max_work_group_size，就好像您认为它是真正的工作组规模一样。不是，它只是一个提示，实际上可以是从 0 到 2**64-1 的任何值。考虑做一些边界检查。

您混淆了nd_range<2> 中的参数。签名是：

sycl::nd_range<2>(sycl::range<2> globalSize, sycl::range<2> localSize);

globalSize 的每个维度都应该是localSize 中每个维度的倍数。

所以你应该这样做

auto local_range = sycl::range<2>(numThreads, numThreads);
auto global_range = sycl::range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
sycl::nd_range<2> launchParams = nd_range<2>(global_range, local_range);

nd_range 乘法的目的是获得您的设备将处理的“真实”全局范围，因为它可能比您预期的要大一些。

最后的评论：我不太确定你为什么将缓冲区包装在共享指针中。首先，它们不是“重型结构”，它是一个不保存内存的包装器。您可能已经注意到它甚至不需要设备。此外，从不同的地方访问单个缓冲区（我猜是共享指针的目的）可能会导致 UB。

最后你不需要手动计算偏移量，你可以直接使用

row_index = ndItem.get_global_id(0);

根据这些建议，您的代码是：


void naiveMatrixMultiplication(float* MatA, float* MatB, float* result, size_t M, size_t N, size_t K, queue deviceQueue, size_t numThreads) 

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout << "Starting Matrix Multiplication" << std::endl;

    buffer<float, 1> mata_buffer(MatA, range<1>(M * N));
    buffer<float, 1> matb_buffer(MatB, range<1>(N * K));
    buffer<float, 1> result_buffer(result, range<1>(M * K));

    auto local_range = range<2>(numThreads, numThreads);
    auto global_range = range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
    auto launchParams = nd_range<2>(global_range, local_range);

    deviceQueue.submit([&, M, N, K, launchParams](handler &cgh) 
        auto MatA_accessor = mata_buffer.get_access<access::mode::read>(cgh);
        auto MatB_accessor = matb_buffer.get_access<access::mode::read>(cgh);
        auto result_accessor = result_buffer.get_access<access::mode::write>(cgh);
        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem) 

            auto column_index = ndItem.get_global_id(1);
            auto row_index = ndItem.get_global_id(0);

            if (row_index < M && column_index < K) 
                float sum = 0.0f;
                for (int i = 0; i < N; i++) 
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[i * N + column_index];
                
                result_accessor[K * row_index + column_index] = sum;
            
        );
    );
    deviceQueue.wait();
    std::cout << "Done with Matmul" << std::endl;



int main() 
    size_t M = 512;
    size_t N = 512;
    size_t K = 512;
    auto matA = (float *) malloc(M * N * sizeof(float));
    auto matB = (float *) malloc(N * K * sizeof(float));
    auto result = (float *) malloc(M * K * sizeof(float));

    for (int i = 0; i < M * N; i++)
        matA[i] = 2.0f;
    for (int i = 0; i < N * K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M * K; ++i)
        result[i] = 69.0f;

    queue Queuegpu_selector;

    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<info::device::max_work_group_size>();
    std::cout << device.get_info<info::device::name>() << std::endl;
    auto thread_max = std::sqrt(max_work_group_size);
    std::cout << thread_max << std::endl;

    naiveMatrixMultiplication(matA, matB, result, M, N, K, Queue, thread_max);
    std::cout << "Here";

    for (int i = 0; i < 100; i++)
        std::cout << result[i] << "  ";
    std::cout << std::endl;

编辑：我要补充一点，computecpp-sdk 存储库中有一个用 SYCL 编写的 matrix multiplication sample（以获得更多灵感）。

【讨论】：

以上是关于SYCL 内核中的分段错误的主要内容，如果未能解决你的问题，请参考以下文章