Segmentation Fault in a SYCL Kernel
Posted: 2021-06-25 08:20:36

I have been trying to implement a simple matrix multiplication in SYCL, but I keep getting a segmentation fault as soon as the kernel launches. My code is as follows -
#include <CL/sycl.hpp>
#include <iostream>
#include <cmath>

using namespace cl::sycl;

class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;

typedef cl::sycl::buffer<float, 1> sycl_buffer;

void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
                               queue deviceQueue, int numThreads)
{
    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */
    std::cout<<"Starting Matrix Multiplication"<<std::endl;

    nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(M / numThreads + 1, K / numThreads + 1),
                                           cl::sycl::range<2>(numThreads, numThreads));

    deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh)
    {
        auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
        auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
        auto result_accessor = result.get_access<access::mode::read_write>(cgh);

        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem)
        {
            auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
            auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);

            if(row_index < M && column_index < K)
            {
                float sum = 0.0f;
                for (int i = 0; i < N; i++)
                {
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[i * N + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout<<"Done with Matmul"<<std::endl;
}
int main()
{
    size_t M = 512;
    size_t N = 512;
    size_t K = 512;

    auto matA = (float*) malloc(M * N * sizeof(float));
    auto matB = (float*) malloc(N * K * sizeof(float));
    auto result = (float*) malloc(M * K * sizeof(float));

    for (int i = 0; i < M*N; i++)
        matA[i] = 2.0f;
    for (int i = 0; i < N*K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M*K; ++i)
        result[i] = 69.0f;

    queue Queue;
    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
    std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;

    auto thread_max = int(std::sqrt(max_work_group_size));
    std::cout<<thread_max<<std::endl;

    buffer<float, 1> mata_buffer(matA, range<1>(M * N * sizeof(float)));
    buffer<float, 1> matb_buffer(matB, range<1>(N * K * sizeof(float)));
    buffer<float, 1> result_buffer(result, range<1>(M * K * sizeof(float)));

    auto mata_shared = std::make_shared<buffer<float, 1>>(mata_buffer);
    auto matb_shared = std::make_shared<buffer<float, 1>>(matb_buffer);
    auto result_shared = std::make_shared<buffer<float, 1>>(result_buffer);

    naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);

    Queue.submit([result_shared, result](handler& cgh)
    {
        auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
        cgh.copy(resultAccessor, result);
    });
    Queue.wait();

    std::cout<<"Here";
    for(int i = 0; i < 100; i++)
        std::cout<<result[i]<<" ";
    std::cout<<std::endl;
}
The output is as follows -
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
Segmentation fault (core dumped)
I cannot figure out the source of the segmentation fault. Any help is appreciated.
Thanks in advance.

Edit - passing -g as a compiler flag to get debug symbols, the output is now:
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Aborted (core dumped)
And running it under GDB - this is the output:
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./computecpp_test...
(gdb) r
Starting program: /home/atharva/CLionProjects/computecpp_test/computecpp_test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7066700 (LWP 18128)]
[New Thread 0x7ffff62e5700 (LWP 18133)]
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Thread 1 "computecpp_test" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb)
Here is my CMake, for reference, so you can see which compiler flags are being passed:
cmake_minimum_required(VERSION 3.17)
project(computecpp_test)
set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_CXX_FLAGS -g)
set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
#include(FindComputeCpp)
find_package(ComputeCpp)
include_directories($(COMPUTECPP_INCLUDE_DIRECTORY))
add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)
Update - while debugging I changed all the indices to 0, and it still throws a segmentation fault (or an invalid object error when compiled with -g), which leads me to believe the problem is not the data accesses but something else.
The backtrace is as follows -
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007ffff73c8859 in __GI_abort () at abort.c:79
#2 0x00007ffff779d911 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3 0x00007ffff77a938c in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#4 0x00007ffff77a93f7 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#5 0x00007ffff77a96a9 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#6 0x00007ffff7c63d61 in void cl::sycl::detail::handle_sycl_log<cl::sycl::invalid_object_error>(std::unique_ptr<cl::sycl::detail::sycl_log, std::default_delete<cl::sycl::detail::sycl_log> >&&) ()
from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#7 0x00007ffff7c5d0bd in cl::sycl::detail::trigger_sycl_log(cl::sycl::log_type, char const*, int, int, cl::sycl::detail::cpp_error_code, cl::sycl::detail::context const*, char const*) ()
from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#8 0x000000000040ab25 in cl::sycl::program::create_program_for_kernel<naive_MatMul_kernel> (c=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/program.h:510
#9 0x000000000040552b in cl::sycl::handler::parallel_for_impl<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1>(cl::sycl::detail::nd_range_base const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1 const&, int) (this=0x6b1d40, ndRange=..., functor=..., dimensions=2)
at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:423
#10 0x0000000000405485 in cl::sycl::handler::parallel_for<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1, 2>(cl::sycl::nd_range<2> const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::lambda(cl::sycl::nd_item<2>)#1 const&) (this=0x6b1d40, ndRange=..., functor=...)
at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:471
#11 0x000000000040536e in naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const (
this=0x7fffffffd500, cgh=...) at main.cpp:49
#12 0x000000000040518f in cl::sycl::detail::command_group::submit_handler<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0, std::shared_ptr<cl::sycl::detail::queue> const&, cl::sycl::detail::standard_handler_tag) (this=0x7fffffffd738, cgf=..., fallbackQueue=std::shared_ptr<class cl::sycl::detail::queue> (empty) = ...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/command_group.h:179
#13 0x000000000040391f in cl::sycl::queue::submit<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0) (this=0x7fffffffdaa8, cgf=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/queue.h:519
#14 0x00000000004037bb in naiveMatrixMultiplication (MatA=..., MatB=..., result=..., M=512, N=512, K=512, deviceQueue=..., numThreads=16) at main.cpp:42
#15 0x0000000000404adb in main () at main.cpp:220
Basically it stops at this code in the program.h file:
COMPUTECPP_CL_ERROR_CODE_MSG(
CL_SUCCESS, detail::cpp_error_code::KERNEL_NOT_FOUND_ERROR,
c.get_impl().get(),
"Unable to retrieve kernel function, is integration header included?")
Apparently it is unable to retrieve the kernel function.

Question comments:

- I suggest running the program with the gdb debugger and getting a backtrace. That should give you some hints about where the segmentation fault is happening. Can you add information from that?
- Thanks for the reply @RodBurns. I did run the program under gdb on my end, but it didn't really help. I'm using Codeplay's compiler; could you suggest the correct flags to run it under gdb? I suspect that is why GDB wasn't much help on the first attempt.
- I have added the GDB output as suggested. Interestingly, after passing -g as a compiler flag, the error changed from a segmentation fault to an invalid object error, and gdb says some files are missing.
- It might help to get a backtrace with the bt command.
- Also, what is your environment? E.g. Linux or Windows. Windows in debug mode currently fails with ComputeCpp.

Answer 1:

Here are some issues I found in your code:
- The following:
Queue.submit([result_shared, result](handler& cgh)
{
    auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
    cgh.copy(resultAccessor, result);
});
Queue.wait();
is not needed, because sycl::buffers are designed to do the synchronization for you. Once the buffer is destroyed, you are guaranteed that the memory has been copied back to the host (before that, I believe its state is undefined).
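As a minimal sketch of relying on that behaviour (this assumes a SYCL 1.2.1-style implementation such as ComputeCpp; the kernel name and the use of your deviceQueue/result/M/K names are illustrative, not from your original code):

// Sketch only: the buffer lives in an inner scope, so its destructor waits
// for the kernel to finish and copies the data back into the host pointer `result`.
{
    buffer<float, 1> result_buffer(result, range<1>(M * K));
    deviceQueue.submit([&](handler& cgh)
    {
        auto acc = result_buffer.get_access<access::mode::write>(cgh);
        cgh.parallel_for<class zero_result_kernel>(range<1>(M * K),
                                                   [=](id<1> idx) { acc[idx] = 0.0f; });
    });
}   // result_buffer destroyed here; `result` now holds the device data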
- You have declared your buffers as buffer<float, 1>, which means the SYCL buffer already knows the type of the underlying data. When constructing a buffer you only need to pass the number of elements, not the size in bytes. This is why your code crashes when the kernel is submitted (which is where the implicit copy to the device happens).
Just write:
buffer<float, 1> mata_buffer(matA, range<1>(M * N));
buffer<float, 1> matb_buffer(matB, range<1>(N * K));
buffer<float, 1> result_buffer(result, range<1>(M * K));
- It turns out that the default queue you get from queue Queue; is not necessarily the host device. In some implementations this behaviour lets you change the device you run on with an environment variable. With my implementation, queue Queue; returned a GPU and your original code failed (because it had to perform the copy described above). However, when running on the host device with queue Queue{host_selector{}};, it also did not work, because the SYCL implementation I am running cannot perform a memcpy from host to host.
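If you want to be explicit about which device you get, a hedged sketch is to construct the queue from a selector yourself (the selector classes are standard SYCL 1.2.1; whether a GPU is actually available, and exactly which exception type the selector throws, depends on the implementation):

// Try a GPU first; fall back to the default device if the selector finds none.
queue Queue;
try
{
    Queue = queue(gpu_selector{});
}
catch (const cl::sycl::exception&)
{
    Queue = queue(default_selector{});   // no GPU found, use whatever the runtime prefers
}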
您正在使用max_work_group_size
,就好像您认为它是真正的工作组规模一样。不是,它只是一个提示,实际上可以是从 0 到 2**64-1 的任何值。考虑做一些边界检查。
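A rough sketch of such a bounds check, reusing the device variable from your main (the cap of 16 is an assumed fallback for this kernel, not something from your code):

// Clamp the work-group edge against what the device actually reports.
auto max_wg = device.get_info<info::device::max_work_group_size>();
size_t threads = static_cast<size_t>(std::sqrt(static_cast<double>(max_wg)));
if (threads == 0)
    threads = 1;                           // guard against a degenerate report
threads = std::min<size_t>(threads, 16);   // assumed upper cap, adjust as needed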
- You have mixed up the arguments to nd_range<2>. The signature is:

sycl::nd_range<2>(sycl::range<2> globalSize, sycl::range<2> localSize);

Each dimension of globalSize should be a multiple of the corresponding dimension of localSize.
So you should do:
auto local_range = sycl::range<2>(numThreads, numThreads);
auto global_range = sycl::range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
sycl::nd_range<2> launchParams = nd_range<2>(global_range, local_range);
The purpose of the range multiplication is to obtain the "real" global range your device will process, which may be somewhat larger than you expect.
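For instance, with M = K = 512 and numThreads = 16, 512 / 16 + 1 = 33 groups per dimension, so the global range becomes 33 * 16 = 528 work-items per dimension; the extra work-items are filtered out by the if (row_index < M && column_index < K) guard inside the kernel.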
Final remarks: I am not quite sure why you wrap the buffers in shared pointers. First, they are not heavyweight structures: a buffer is a wrapper that does not own the memory itself; as you may have noticed, it does not even need a device. Moreover, accessing a single buffer from different places (which I guess is the purpose of the shared pointers) can lead to UB.
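A small sketch of why the shared_ptr wrapper buys you nothing (this relies on the reference semantics the SYCL specification gives buffers; the variable names are illustrative):

// Copying a buffer copies a lightweight handle, not the data: both handles
// refer to the same underlying memory object, so no std::shared_ptr is needed.
buffer<float, 1> result_buf(result, range<1>(M * K));
buffer<float, 1> same_buf = result_buf;   // cheap handle copy, same storage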
Finally, you do not need to compute the offsets by hand; you can directly use:
row_index = ndItem.get_global_id(0);
With these suggestions applied, your code becomes:
void naiveMatrixMultiplication(float* MatA, float* MatB, float* result, size_t M, size_t N, size_t K, queue deviceQueue, size_t numThreads)
{
    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */
    std::cout << "Starting Matrix Multiplication" << std::endl;

    buffer<float, 1> mata_buffer(MatA, range<1>(M * N));
    buffer<float, 1> matb_buffer(MatB, range<1>(N * K));
    buffer<float, 1> result_buffer(result, range<1>(M * K));

    auto local_range = range<2>(numThreads, numThreads);
    auto global_range = range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
    auto launchParams = nd_range<2>(global_range, local_range);

    deviceQueue.submit([&, M, N, K, launchParams](handler &cgh)
    {
        auto MatA_accessor = mata_buffer.get_access<access::mode::read>(cgh);
        auto MatB_accessor = matb_buffer.get_access<access::mode::read>(cgh);
        auto result_accessor = result_buffer.get_access<access::mode::write>(cgh);

        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem)
        {
            auto column_index = ndItem.get_global_id(1);
            auto row_index = ndItem.get_global_id(0);

            if (row_index < M && column_index < K)
            {
                float sum = 0.0f;
                for (int i = 0; i < N; i++)
                {
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[i * N + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout << "Done with Matmul" << std::endl;
}

int main()
{
    size_t M = 512;
    size_t N = 512;
    size_t K = 512;

    auto matA = (float *) malloc(M * N * sizeof(float));
    auto matB = (float *) malloc(N * K * sizeof(float));
    auto result = (float *) malloc(M * K * sizeof(float));

    for (int i = 0; i < M * N; i++)
        matA[i] = 2.0f;
    for (int i = 0; i < N * K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M * K; ++i)
        result[i] = 69.0f;

    queue Queue{gpu_selector{}};
    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<info::device::max_work_group_size>();
    std::cout << device.get_info<info::device::name>() << std::endl;

    auto thread_max = std::sqrt(max_work_group_size);
    std::cout << thread_max << std::endl;

    naiveMatrixMultiplication(matA, matB, result, M, N, K, Queue, thread_max);

    std::cout << "Here";
    for (int i = 0; i < 100; i++)
        std::cout << result[i] << " ";
    std::cout << std::endl;
}
Edit: I will add that there is a matrix multiplication sample written in SYCL in the computecpp-sdk repository, for further inspiration.