Sycl 部分+ DPCPP 中的互相关和错误
Posted
技术标签:
【中文标题】SYCL 部分 + DPC++ 中的互相关与编译错误【英文标题】:Cross Correlation and Errors in Sycl part+ DPCPP 【发布时间】:2021-07-07 21:22:05 【问题描述】:我正在尝试编写一个互相关(cross-correlation)函数。在我的程序中,我实现了一个 Map 骨架(skeleton):它封装了 oneAPI 调用,并通过一个指定目标类型(CPU 或 GPU/加速器)的参数来隐藏硬件选择的细节。问题是,程序的 SYCL 部分出现了一些编译错误,我无法解决。我的代码如下:
<!-- language: c++ -->
/// Applies a filter to a matrix window: element-wise multiply-accumulate of
/// two matrices (the inner step of a cross-correlation).
///
/// @tparam T  Element type of both matrices.
/// @param f   First matrix; its shape drives the iteration.
/// @param g   Second matrix; every row of `g` must be at least as long as the
///            matching row of `f`, and `g` must have at least as many rows
///            (not checked here).
/// @return    Sum of f[i][j] * g[i][j] over every cell of `f`, accumulated in
///            double and converted to T. An empty `f` yields T(0).
template<class T>
T applyFilter(std::vector<std::vector<T>> f, std::vector<std::vector<T>> g)
{
    double sum = 0;
    // Use each row's own length instead of f[0].size() so that an empty
    // matrix never touches f[0].
    for (std::size_t i = 0; i < f.size(); ++i) {
        for (std::size_t j = 0; j < f[i].size(); ++j) {
            sum += f[i][j] * g[i][j];
        }
    }
    return static_cast<T>(sum);
}
/// Prints a matrix to std::cout, one row per line.
///
/// Every element is followed by a single space and every row is terminated
/// by a newline. An empty matrix prints nothing.
///
/// @tparam T      Element type; must be streamable with operator<<.
/// @param matrix  Matrix to print (taken by const reference, so it is not
///                copied just to be printed).
template<class T>
void print_matrix(const std::vector<std::vector<T>>& matrix)
{
    for (const auto& row : matrix) {
        for (const auto& cell : row) {
            std::cout << cell << ' ';
        }
        std::cout << "\n";
    }
}
/// Copies an r x c window out of a matrix.
///
/// @tparam T   Element type.
/// @param mat  Source matrix.
/// @param i    Row index of the window's top-left corner.
/// @param j    Column index of the window's top-left corner.
/// @param r    Number of rows in the window.
/// @param c    Number of columns in the window.
/// @return     A new r x c matrix with out[k][x] == mat[i + k][j + x].
///             A non-positive `r` yields an empty result.
///
/// No bounds checking is performed: the caller must make sure the window
/// [i, i + r) x [j, j + c) lies entirely inside `mat`.
template<class T>
std::vector<std::vector<T>> slice_matrix(const std::vector<std::vector<T>>& mat, int i,
        int j, int r, int c)
{
    std::vector<std::vector<T>> out;
    if (r > 0) {
        out.reserve(static_cast<std::size_t>(r));
    }
    for (int k = 0; k < r; ++k) {
        // Build each output row directly from the source range instead of
        // zero-filling it first and then overwriting it.
        const auto first = mat[i + k].begin() + j;
        out.emplace_back(first, first + c);
    }
    return out;
}
/// Fills one matrix row with pseudo-random values in the range [0, 100).
///
/// Values come from rand(), so the sequence depends on the seed set by the
/// caller (or on the default seed when srand() was never called).
///
/// @tparam T   Element type; each generated int is converted to T.
/// @param row  Row to overwrite in place; its size is left unchanged.
template<class T>
void rand_fill_row(std::vector<T> &row)
{
    std::generate(row.begin(), row.end(), []() {
        return static_cast<T>(rand() % 100);
    });
}
/// Fills every row of a matrix with pseudo-random values by calling
/// rand_fill_row on each row.
///
/// @tparam T   Element type.
/// @param mat  Matrix to overwrite in place; its shape is left unchanged.
template<class T>
void rand_fill_matrix(std::vector<std::vector<T>> &mat)
{
    // A plain range-for avoids depending on argument-dependent lookup to
    // find an unqualified for_each.
    for (auto &row : mat) {
        rand_fill_row(row);
    }
}
// Map skeleton: stores a callable `fun` and applies it to every window of an
// input matrix, either through tbb::parallel_for or through a SYCL queue.
// NOTE(review): this listing contains no braces at all; they appear to have
// been stripped when the code was extracted, so the block does not compile as
// shown. The code is left untouched below; only comments were added.
template<class Tin, class Tout, class Function>
class Map
private:
// Callable applied to each (window, kernel) pair.
Function fun;
public:
// Default constructor; `fun` is left default-initialised.
Map()
// Stores a copy of the callable `f`.
Map(Function f) :
fun(f)
// Call operator.
//   use_tbb - true selects the tbb::parallel_for branch, false the SYCL branch.
//   img     - input matrix.
//   ker     - filter matrix.
// Returns the local matrix `out`.
std::vector<std::vector<Tout>> operator()(bool use_tbb,
std::vector<std::vector<Tin>> &img,
std::vector<std::vector<Tin>> &ker)
// NOTE(review): size() returns an unsigned type; storing it in int is what
// later triggers the int -> size_t narrowing diagnostic when these values are
// used to build a sycl::range.
int img_row = img.size();
int img_col = img[0].size();
int filt_row = ker.size();
int filt_col = ker[0].size();
// NOTE(review): with this definition the last valid window position
// (img_row - filt_row, img_col - filt_col) is never visited - confirm whether
// "+ 1" was intended.
int out_row = img_row - filt_row;
int out_col = img_col - filt_col;
// NOTE(review): `out` is default-constructed (empty) and nothing visible in
// this block resizes it before out[n][m] is assigned below.
std::vector<std::vector<Tout>> out;
if (use_tbb)
// NOTE(review): the timer is allocated with new and later only has its
// destructor invoked explicitly; no delete is visible, so the storage is
// never released. uTimer is declared outside this listing.
uTimer *timer = new uTimer("Executing Code On CPU");
// Iterates over the 2D index space [0, out_row) x [0, out_col).
tbb::parallel_for(
tbb::blocked_range2d<int, int>(0, out_row, 0, out_col),
[&](tbb::blocked_range2d<int, int> &t)
for (int n = t.rows().begin(); n < t.rows().end();
++n)
for (int m = t.cols().begin(); m < t.cols().end();
++m)
// Applies `fun` to the filt_row x filt_col window whose top-left corner is
// (n, m) and to the filter, storing the result at out[n][m].
out[n][m] = fun(
slice_matrix(img, n, m, filt_row,
filt_col), ker);
);
timer->~uTimer();
return out;
else
/* A 2D std::vector<std::vector<T>>
* does not store its elements contiguously in memory.
* The author's stated intent is to use a flat vector<T> and operate on
* contiguous blocks instead. */
// NOTE(review): img.data() and ker.data() point to std::vector<Tin> objects,
// not to Tin values, and out.data() points to std::vector<Tout>; none of them
// matches a buffer of Tin. The flattening described above is not implemented
// in the visible code.
sycl::buffer<Tin, 1> img_buffer(img.data(), img.size());
sycl::buffer<Tin, 1> ker_buffer(ker.data(), ker.size());
sycl::buffer<Tin, 2> out_buffer(out.data(), out_row, out_col );
// GPU profiling
// Initialize the property list with profiling enabled
sycl::property_list propList
sycl::property::queue::enable_profiling() ;
// Build the command queue (constructed with the profiling property list)
sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),
propList);
// Print information about the device used for the kernel code
std::cout << "Device: "
<< gpuQueue.get_device().get_info<sycl::info::device::name>()
<< std::endl;
std::cout << "Compute Units: "
<< gpuQueue.get_device().get_info<
sycl::info::device::max_compute_units>()
<< std::endl;
// Host-side wall-clock start for the "overall" measurement printed below.
auto start_overall = std::chrono::system_clock::now();
auto event = gpuQueue.submit(
[&](sycl::handler &h)
// Local copy of fun, so the kernel lambda captures it by value
auto f = fun;
sycl::accessor img_accessor(img_buffer, h,
sycl::read_only);
sycl::accessor ker_accessor(ker_buffer, h,
sycl::read_only);
sycl::accessor out_accessor(out_buffer, h,
sycl::write_only);
// One work-item per output cell (out_row x out_col).
h.parallel_for(sycl::range<2> out_row, out_col ,
[=](sycl::id<2> index)
int row = index[0];
int col = index[1];
// NOTE(review): slice_matrix is declared to take
// std::vector<std::vector<T>>, but it is called here with an accessor, and
// `f` likewise receives ker_accessor instead of a vector.
out_accessor[row][col] = f(slice_matrix(img_accessor, row, col,filt_row, filt_col)
, ker_accessor);
);
);
// Block until the submitted command group has finished.
event.wait();
auto end_overall = std::chrono::system_clock::now();
// Query the submit / start / end timestamps recorded for the event.
cl_ulong submit_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_submit>();
cl_ulong start_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_start>();
cl_ulong end_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_end>();
// The differences are divided by 1e6 and printed as "ms", which presumably
// assumes the timestamps are in nanoseconds - confirm against the SYCL spec.
auto submission_time = (start_time - submit_time) / 1000000.0f;
std::cout << "Submit Time: " << submission_time << " ms"
<< std::endl;
auto execution_time = (end_time - start_time) / 1000000.0f;
std::cout << "Execution Time: " << execution_time << " ms"
<< std::endl;
// Wall-clock time between the two system_clock samples, in milliseconds.
auto execution_overall = std::chrono::duration_cast<
std::chrono::milliseconds>(end_overall - start_overall);
std::cout << "Overall Execution Time: " << execution_overall.count()
<< " ms" << std::endl;
;
return out;
;
/// Factory helper that builds a Map around a callable.
///
/// Tin and Tout must be given explicitly; Function is deduced from `f`.
/// The name of a function template (e.g. `applyFilter`) is not a deducible
/// argument: pass one specialisation such as `applyFilter<double>`.
///
/// @tparam Tin       Element type of the input matrices.
/// @tparam Tout      Element type of the output matrix.
/// @tparam Function  Type of the callable stored in the Map.
/// @param f          Callable to store.
/// @return           A Map<Tin, Tout, Function> constructed from `f`.
template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f)
{
    return Map<Tin, Tout, Function>(f);
}
int main(int argc, char *argv[])
std::cout << "The Exutable File! " << argv[0] << std::endl;
std::cout << "The Device Is! " << argv[1] << std::endl;
std::cout << "The Fist Vector Size! " << argv[2] << std::endl;
std::cout << "The Second Vector Size! " << argv[3] << std::endl;
//The Device
std::string device = argv[1];
// Image's row count
int m = std::stoi(argv[2]);
// Image's col count
int n = std::stoi(argv[3]);
std::vector<std::vector<double>> img(m, std::vector<double>(n, 0));
// Filter's row count
int k = std::stoi(argv[4]);
// Filter's row count
int l = std::stoi(argv[5]);
std::vector<std::vector<double>> ker(k, std::vector<double>(l, 0));
//std::vector<std::vector<T>> out(r, std::vector<T>(c, 0));
rand_fill_matrix(img);
rand_fill_matrix(ker);
/*Error is : no matching function for call to 'make_map'*/
<!-- language: lang-js -->
auto m1 = make_map<double, double>(applyFilter);
<!-- language: lang-js -->
std::vector<std::vector<double>> r = m1(true, img, ker);
//print the result
//for (auto &e : r)
//std::cout << e << " ";
//
return 0;
错误是:
no matching constructor for initialization of 'sycl::buffer'(sycl::buffer 的初始化没有匹配的构造函数):
//Define Buffer for
sycl::buffer<Tin, 1> img_buffer(&img[0], img.size());
sycl::buffer<Tin, 1> ker_buffer(&ker[0], ker.size());
sycl::buffer<Tin, 2> out_buffer(out.data(), sycl::range<2> out_row, out_col );
================================================ =======
non-constant-expression cannot be narrowed from type 'int' to 'size_t' (aka 'unsigned long') in initializer list [-Wc++11-narrowing]
h.parallel_for(sycl::range<2> out_row, out_col ,
[=](sycl::id<2> index)
int row = index[0];
int col = index[1];
===============================================
Invalid arguments '
Candidates are:
std::vector<std::vector<#0,std::allocator<#0>>,std::allocator<std::vector<#0,std::allocator<#0>>>> slice_matrix(std::vector<std::vector<#0,std::allocator<#0>>,std::allocator<std::vector<#0,std::allocator<#0>>>>, int, int, int, int)
'
out_accessor[row][col] = f(slice_matrix(img_accessor, row, col,
filt_row, filt_col),
ker_accessor); ); );
================================================ ====
no matching function for call to 'make_map'
auto m1 = make_map<double, double>(applyFilter);
【问题讨论】:
这可能与问题无关,但你使用 `uTimer` 的方式确实很奇怪。为什么要包含 `uTimer.cpp` 而不是 `uTimer.hpp`?为什么在 `uTimer *timer = new uTimer("Executing Code On CPU");` 中使用手动内存管理?为什么在 `timer->~uTimer();` 中手动调用析构函数,却没有对 `timer` 执行 `delete`?使用自动存储期(例如 `uTimer timer = "Executing Code On CPU";`)就足够了,而且不需要任何清理。
您介意展示或突出显示编译器错误来自哪些行吗?
为了增加获得问题答案的可能性,我建议提供最少的代码集来演示您遇到的问题。您只是将所有代码复制并粘贴到问题中,而没有突出显示错误被标记的位置。我给出这个建议是为了让您了解什么可以帮助您获得更好的答案。为此,如果可以,请发布您可以编译以显示问题的最少代码量。
关于 uTimer 这是我的错误。谢谢指出
关于手动内存管理,我不是很了解。如果可以的话,请与我分享您的信息
【参考方案1】:
第一个错误可以通过让类型匹配来修复:只需把 img_row、img_col、filt_row、filt_col、out_row 和 out_col 的声明从 int 改为 size_t。
对于第二个错误 - 编译器是否也发出了有关问题的提示?我不得不根据你的片段做出一些假设,但我最终得到了:
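error: no matching function for call to 'make_map'(没有与 'make_map' 调用匹配的函数);note: candidate template ignored: couldn't infer template argument 'Function'(候选模板被忽略:无法推断模板参数 'Function')
这告诉我,除了 Tin 和 Tout 之外,我们还需要显式提供第三个模板参数,例如:
auto m1 = make_map<double, double, double(std::vector<std::vector<double>>, std::vector<std::vector<double>>)>(applyFilter);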
但这在我对您的代码的模拟中并不完全正确。 你应该尝试类似的方法。
如果您仍有问题 - 请提供完整的代码示例,我们可以尝试编译。 如果你修复它 - 请将你发现的内容发回这里,以便我们一起学习。
【讨论】:
以上是关于Sycl 部分+ DPCPP 中的互相关和错误的主要内容,如果未能解决你的问题,请参考以下文章