带有 cuBLAS 的 cudaMallocManaged(统一内存)
Posted
技术标签:
【中文标题】带有 cuBLAS 的 cudaMallocManaged(统一内存)【英文标题】:cudaMallocManaged (unified memory) with cuBLAS 【发布时间】:2021-04-06 15:17:27 【问题描述】:我正在尝试通过 cudaMallocManaged() 将统一内存与 cuBLAS 库一起使用。作为一个简单的示例,我执行了一次矩阵与向量的乘法,并将结果存储在数组 results 中。然而,当打印 results 数组时,我得到的全是 0,而不是矩阵 mat 乘以向量 vec 的结果。
我正在使用的流程是:
- 使用 cudaMallocManaged() 分配内存
- 用数据初始化数组
- 创建 cuBLAS 句柄
- 调用 cublasDgemv 执行乘法运算,并将结果存储在 results 中
当使用 new 分配主机内存,然后通过 cublasSetMatrix() 或 cublasSetVector() 把数据拷贝到设备时,一切工作正常。
如何在 cuBLAS 中使用统一内存?
以下是最小可复现示例:
统一内存尝试(results 中返回的全是 0):
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
/*
 * Wrap a CUDA runtime call: any result other than cudaSuccess is reported
 * together with the call site and, by default, terminates the process.
 * The do/while(0) wrapper makes the macro expand to exactly one statement,
 * so `cudaErrChk(x);` stays correct inside an unbraced if/else.
 */
#define cudaErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Report a failed CUDA runtime call; does nothing when `code` is cudaSuccess.
 *
 * @param code  status returned by the CUDA runtime call
 * @param file  source file of the call site (supplied by cudaErrChk)
 * @param line  source line of the call site (supplied by cudaErrChk)
 * @param abort when true (the default), exit with `code` as the process
 *              status after printing the message
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        // The abort check must stay inside the error branch; outside it the
        // program would exit after every successful call as well.
        if (abort) exit(code);
    }
}
/**
 * Translate a cuBLAS status code into the name of its enumerator.
 *
 * This function only produces a string. It neither prints nor aborts, so a
 * caller that wants error checking must compare the status against
 * CUBLAS_STATUS_SUCCESS itself.
 *
 * @param error status returned by a cuBLAS call
 * @return a static string naming the status, or "<unknown>" for a value
 *         that is not listed below
 */
static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        // Previously unhandled statuses that used to be reported as "<unknown>".
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
/* Exit with a readable message when a cuBLAS call reports anything but success. */
static void requireCublasOk(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuBLAS failure in %s: %s\n", what, cublasErrChk(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Unified-memory attempt from the question: allocate managed buffers, fill
 * them on the host, run results = mat * vec with cublasDgemv and print the
 * result. The synchronization bug the question asks about is kept (see the
 * NOTE below); the unrelated indexing, beta and status-handling bugs are fixed.
 */
int main()
{
    const size_t dims = 4;
    const int n = static_cast<int>(dims);   // cuBLAS takes int dimensions

    double *vec, *mat, *results;
    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    // Valid indices are 0 .. dims-1; the values stay 0.5, 1.0, 1.5, ...
    printf("Vector:\n");
    for (size_t i = 0; i < dims; i++)
    {
        vec[i] = 0.5 * (i + 1);
        printf("%.2lf ", vec[i]);
    }

    // cuBLAS reads `mat` as column-major, so the matrix it multiplies is the
    // transpose of the row-wise layout printed here.
    printf("\n\nMatrix:\n");
    for (size_t i = 0; i < dims * dims; i++)
    {
        mat[i] = 1.0 * (i + 1);
        printf("%.2lf ", mat[i]);
        if ((i + 1) % dims == 0)
            printf("\n");
    }
    printf("\n");

    cublasHandle_t handle;
    requireCublasOk( cublasCreate(&handle), "cublasCreate" );

    // beta = 0: `results` is output only, so its uninitialised contents must
    // not be accumulated into the product.
    const double alpha = 1.0, beta = 0.0;

    // multiply mat by vec to get results
    requireCublasOk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            n, n,
            &alpha,
            mat, n,
            vec, 1,
            &beta,
            results, 1
        ),
        "cublasDgemv"
    );

    // NOTE: cublasDgemv only enqueues the GPU work and nothing here waits for
    // it, so the host reads `results` while the kernel may still be running.
    // This is the defect the question is about; the fix is
    //     cudaErrChk( cudaDeviceSynchronize() );
    // at this point.
    for (size_t i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    requireCublasOk( cublasDestroy(handle), "cublasDestroy" );
    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );
    return 0;
}
常规 malloc/setMatrix() 尝试:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
/*
 * Wrap a CUDA runtime call: any result other than cudaSuccess is reported
 * together with the call site and, by default, terminates the process.
 * The do/while(0) wrapper makes the macro expand to exactly one statement,
 * so `cudaErrChk(x);` stays correct inside an unbraced if/else.
 */
#define cudaErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Report a failed CUDA runtime call; does nothing when `code` is cudaSuccess.
 *
 * @param code  status returned by the CUDA runtime call
 * @param file  source file of the call site (supplied by cudaErrChk)
 * @param line  source line of the call site (supplied by cudaErrChk)
 * @param abort when true (the default), exit with `code` as the process
 *              status after printing the message
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        // The abort check must stay inside the error branch; outside it the
        // program would exit after every successful call as well.
        if (abort) exit(code);
    }
}
/**
 * Translate a cuBLAS status code into the name of its enumerator.
 *
 * This function only produces a string. It neither prints nor aborts, so a
 * caller that wants error checking must compare the status against
 * CUBLAS_STATUS_SUCCESS itself.
 *
 * @param error status returned by a cuBLAS call
 * @return a static string naming the status, or "<unknown>" for a value
 *         that is not listed below
 */
static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        // Previously unhandled statuses that used to be reported as "<unknown>".
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
/* Exit with a readable message when a cuBLAS call reports anything but success. */
static void requireCublasOk(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuBLAS failure in %s: %s\n", what, cublasErrChk(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Explicit-copy variant: build the inputs in host memory, copy them to
 * device buffers with cublasSetVector/cublasSetMatrix, run
 * d_results = d_mat * d_vec with cublasDgemv, copy the result back with
 * cublasGetVector and print it.
 */
int main()
{
    const size_t dims = 4;
    const int n = static_cast<int>(dims);   // cuBLAS takes int dimensions

    double *h_vec = new double[dims];
    double *h_mat = new double[dims * dims];
    double *h_results = new double[dims];

    // Valid indices are 0 .. dims-1; the values stay 0.5, 1.0, 1.5, ...
    printf("Vector:\n");
    for (size_t i = 0; i < dims; i++)
    {
        h_vec[i] = 0.5 * (i + 1);
        printf("%.2lf ", h_vec[i]);
    }

    // cuBLAS reads the matrix as column-major, so the matrix it multiplies is
    // the transpose of the row-wise layout printed here.
    printf("\n\nMatrix:\n");
    for (size_t i = 0; i < dims * dims; i++)
    {
        h_mat[i] = 1.0 * (i + 1);
        printf("%.2lf ", h_mat[i]);
        if ((i + 1) % dims == 0)
            printf("\n");
    }
    printf("\n");

    double *d_vec, *d_mat, *d_results;
    cudaErrChk( cudaMalloc(&d_vec, dims * sizeof(double)) );
    cudaErrChk( cudaMalloc(&d_mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMalloc(&d_results, dims * sizeof(double)) );

    cublasHandle_t handle;
    requireCublasOk( cublasCreate(&handle), "cublasCreate" );

    // copy the data manually to the GPU
    requireCublasOk( cublasSetVector(n, sizeof(*d_vec), h_vec, 1, d_vec, 1), "cublasSetVector" );
    requireCublasOk( cublasSetMatrix(n, n, sizeof(double), h_mat, n, d_mat, n), "cublasSetMatrix" );

    // beta = 0: d_results is output only, so its uninitialised contents must
    // not be accumulated into the product.
    const double alpha = 1.0, beta = 0.0;

    // multiply mat by vec to get results
    requireCublasOk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            n, n,
            &alpha,
            d_mat, n,
            d_vec, 1,
            &beta,
            d_results, 1
        ),
        "cublasDgemv"
    );

    // Copy the result back to the host before printing it.
    requireCublasOk( cublasGetVector(n, sizeof(*h_results), d_results, 1, h_results, 1), "cublasGetVector" );

    for (size_t i = 0; i < dims; i++)
        printf("%.2lf ", h_results[i]);
    printf("\n");

    requireCublasOk( cublasDestroy(handle), "cublasDestroy" );
    cudaErrChk( cudaFree(d_vec) );
    cudaErrChk( cudaFree(d_mat) );
    cudaErrChk( cudaFree(d_results) );
    delete [] h_vec;
    delete [] h_mat;
    delete [] h_results;
    return 0;
}
编译命令:nvcc -o main main.cu -lcublas
【问题讨论】:
像 GEMV 这样的 cuBLAS 调用是异步的。在尝试打印结果之前,您需要在 GEMV 调用之后进行一次同步调用;否则您是在 GEMV 完成之前打印结果。
请将您的解决方案添加为简短的答案,方便未来的访问者。稍后您将能够接受自己的答案并将其标记为已解决。
【参考方案1】:正如 @talonmies 指出的那样,问题在于 cuBLAS 调用是异步的,而我在读取结果之前没有等待它完成。在 cublasDgemv() 调用之后添加 cudaDeviceSynchronize() 即可解决:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
/*
 * Wrap a CUDA runtime call: any result other than cudaSuccess is reported
 * together with the call site and, by default, terminates the process.
 * The do/while(0) wrapper makes the macro expand to exactly one statement,
 * so `cudaErrChk(x);` stays correct inside an unbraced if/else.
 */
#define cudaErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Report a failed CUDA runtime call; does nothing when `code` is cudaSuccess.
 *
 * @param code  status returned by the CUDA runtime call
 * @param file  source file of the call site (supplied by cudaErrChk)
 * @param line  source line of the call site (supplied by cudaErrChk)
 * @param abort when true (the default), exit with `code` as the process
 *              status after printing the message
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        // The abort check must stay inside the error branch; outside it the
        // program would exit after every successful call as well.
        if (abort) exit(code);
    }
}
/**
 * Translate a cuBLAS status code into the name of its enumerator.
 *
 * This function only produces a string. It neither prints nor aborts, so a
 * caller that wants error checking must compare the status against
 * CUBLAS_STATUS_SUCCESS itself.
 *
 * @param error status returned by a cuBLAS call
 * @return a static string naming the status, or "<unknown>" for a value
 *         that is not listed below
 */
static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        // Previously unhandled statuses that used to be reported as "<unknown>".
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
/* Exit with a readable message when a cuBLAS call reports anything but success. */
static void requireCublasOk(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuBLAS failure in %s: %s\n", what, cublasErrChk(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Unified-memory version with the fix applied: allocate managed buffers,
 * fill them on the host, run results = mat * vec with cublasDgemv, wait for
 * the GPU to finish, then print the result from the host.
 */
int main()
{
    const size_t dims = 4;
    const int n = static_cast<int>(dims);   // cuBLAS takes int dimensions

    double *vec, *mat, *results;
    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    // Valid indices are 0 .. dims-1; the values stay 0.5, 1.0, 1.5, ...
    printf("Vector:\n");
    for (size_t i = 0; i < dims; i++)
    {
        vec[i] = 0.5 * (i + 1);
        printf("%.2lf ", vec[i]);
    }

    // cuBLAS reads `mat` as column-major, so the matrix it multiplies is the
    // transpose of the row-wise layout printed here.
    printf("\n\nMatrix:\n");
    for (size_t i = 0; i < dims * dims; i++)
    {
        mat[i] = 1.0 * (i + 1);
        printf("%.2lf ", mat[i]);
        if ((i + 1) % dims == 0)
            printf("\n");
    }
    printf("\n");

    cublasHandle_t handle;
    requireCublasOk( cublasCreate(&handle), "cublasCreate" );

    // beta = 0: `results` is output only, so its uninitialised contents must
    // not be accumulated into the product.
    const double alpha = 1.0, beta = 0.0;

    // multiply mat by vec to get results
    requireCublasOk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            n, n,
            &alpha,
            mat, n,
            vec, 1,
            &beta,
            results, 1
        ),
        "cublasDgemv"
    );

    // cublasDgemv returns before the GPU work is done; wait for it (and pick
    // up any execution error) before the host touches `results`.
    cudaErrChk( cudaDeviceSynchronize() );

    for (size_t i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    requireCublasOk( cublasDestroy(handle), "cublasDestroy" );
    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );
    return 0;
}
【讨论】:
感谢您抽出宝贵时间!
以上是关于带有 cuBLAS 的 cudaMallocManaged(统一内存)的主要内容,如果未能解决你的问题,请参考以下文章: