CUDA中使用多维数组

Posted jourluohua

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CUDA中使用多维数组相关的知识,希望对你有一定的参考价值。

今天想起一个问题:看到的绝大多数CUDA代码使用的都是一维数组,那么是否可以在CUDA中使用多维数组呢?围绕这个问题我做了各种尝试,被错误状态码77和段错误反复折磨,最后发现有一个cudaMallocManaged函数,它可以很好地以多重指针的形式组织多维数组。后来发现,这个问题之前在Stack Overflow上就已经有了很好的解答。先贴一下我自己的代码实现:

技术图片
  1 #include "cuda_runtime.h"  
  2 #include "device_launch_parameters.h"  
  3   
  4 #include <stdio.h>  
  5 const int arraySize = 5; 
  6 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
  7   
  // Element-wise a + b, written into both rows of the two-row pointer table c.
  //
  // Launch layout: a single 1-D block; only threadIdx.x is used, so blockIdx is
  // ignored. Threads [0, arraySize) fill c[0], threads [arraySize, 2*arraySize)
  // fill c[1]; any further threads do nothing.
  //
  // c : table of two row pointers, each row holding at least arraySize ints
  // a : first input, at least arraySize ints
  // b : second input, at least arraySize ints
  __global__ void addKernel(int **c, const int *a, const int *b)
  {
      const int tid = threadIdx.x;
      const int row = tid / arraySize;   // 0 -> c[0], 1 -> c[1]
      const int col = tid % arraySize;   // element index inside the row

      // Guard the upper bound as well: without it, a launch with more than
      // 2 * arraySize threads would write past the end of row 1.
      if (row < 2)
          c[row][col] = a[col] + b[col];
  }
 16 
 // Host driver: adds two fixed 5-element vectors on the GPU and prints the sum.
 // Returns 0 on success, 1 if the addition or the final device reset fails.
 int main()
 {
     const int a[arraySize] = { 1, 2, 3, 4, 5 };
     const int b[arraySize] = { 10, 20, 30, 40, 50 };
     int c[arraySize] = { 0 };

     // Add vectors in parallel.
     cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }

     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
         c[0], c[1], c[2], c[3], c[4]);

     // cudaDeviceReset is the supported replacement for the deprecated
     // cudaThreadExit. It must be called before exiting in order for profiling
     // and tracing tools such as Nsight and Visual Profiler to show complete
     // traces.
     cudaStatus = cudaDeviceReset();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     return 0;
 }
 44   
 // Helper function for using CUDA to add vectors in parallel.
 //
 // Copies a and b to the device, runs addKernel with 2 * size threads so that
 // both rows of a two-row pointer table receive a + b, then copies row 1 back
 // into c.
 //
 // c    : host output buffer, size ints
 // a, b : host input buffers, size ints each
 // size : element count; must equal arraySize because addKernel indexes with
 //        that compile-time constant
 //
 // Returns cudaSuccess, or the first CUDA error encountered
 // (cudaErrorInvalidValue when size != arraySize). Every device allocation
 // made here is released on all paths.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
 {
     // Everything is declared and initialised before the first goto, so the
     // shared cleanup code never jumps over an initialisation and can call
     // cudaFree on pointers that were never allocated (cudaFree(0) is a no-op).
     int *dev_a = 0;
     int *dev_b = 0;
     int *dev_c0 = 0;    // flat backing store holding both output rows
     int **dev_c = 0;    // managed row-pointer table, written on the host and read by the kernel
     const size_t rowBytes = size * sizeof(int);
     cudaError_t cudaStatus;

     // addKernel splits its threads at arraySize; any other size would make it
     // index outside the buffers allocated below.
     if (size != (size_t)arraySize) {
         fprintf(stderr, "addWithCuda: size must equal arraySize!\n");
         return cudaErrorInvalidValue;
     }

     // Choose which GPU to run on, change this on a multi-GPU system.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // Managed allocation for the table of two row pointers.
     cudaStatus = cudaMallocManaged(&dev_c, 2 * sizeof(int*));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMallocManaged failed!");
         goto Error;
     }

     // One device buffer large enough for both output rows.
     cudaStatus = cudaMalloc((void**)&dev_c0, 2 * rowBytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Point each table entry at its row inside the flat buffer.
     dev_c[0] = dev_c0;
     dev_c[1] = dev_c0 + size;

     // Device buffers for the two input vectors.
     cudaStatus = cudaMalloc((void**)&dev_a, rowBytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_b, rowBytes);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, rowBytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     cudaStatus = cudaMemcpy(dev_b, b, rowBytes, cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Launch a kernel on the GPU with one thread for each output element
     // (two rows of size elements).
     addKernel<<<1, (unsigned int)(size * 2)>>>(dev_c, dev_a, dev_b);

     // A bad launch configuration is only reported through cudaGetLastError.
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // cudaDeviceSynchronize (replacement for the deprecated
     // cudaThreadSynchronize) waits for the kernel to finish and returns any
     // error raised while it was running.
     cudaStatus = cudaDeviceSynchronize();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
         goto Error;
     }

     // Copy output row 1 from the GPU buffer to host memory. Reading dev_c[1]
     // on the host is valid here because the table is managed memory and the
     // device has been synchronised.
     cudaStatus = cudaMemcpy(c, dev_c[1], rowBytes, cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     // Release everything; the cudaFree results are deliberately ignored so the
     // status that brought us here is the one returned.
     cudaFree(dev_c);
     cudaFree(dev_c0);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }
View Code

在别人很好的代码面前我就不再露丑了,下面贴一下Stack Overflow上的代码,非常直观易懂:

 1 //https://stackoverflow.com/questions/40388242/multidimensional-array-allocation-with-cuda-unified-memory-on-power-8
 2  
 3 #include <iostream>
 4 #include <assert.h>
 5  
 // Builds an a x b x c x d array in CUDA managed memory that is indexed as
 // ary[i][j][k][l].
 //
 // Two managed allocations are made: one contiguous block of a*b*c*d elements,
 // and one pointer table holding, back to back, the a first-level entries, the
 // a*b second-level entries and the a*b*c third-level entries. ary[0][0][0] is
 // the start of the element block.
 //
 // Returns the top-level table, or 0 when a dimension is not positive or an
 // allocation fails (allocation failures also trip an assert in debug builds).
 template<typename T>
 T**** create_4d_flat(int a, int b, int c, int d) {
     if (a <= 0 || b <= 0 || c <= 0 || d <= 0)
         return 0;

     // Widen before multiplying so large shapes cannot overflow int.
     const size_t na = a, nb = b, nc = c, nd = d;

     T *base = 0;
     cudaError_t err = cudaMallocManaged(&base, na * nb * nc * nd * sizeof(T));
     assert(err == cudaSuccess);
     if (err != cudaSuccess)        // still checked when NDEBUG removes the assert
         return 0;

     T ****ary = 0;
     err = cudaMallocManaged(&ary, (na + na * nb + na * nb * nc) * sizeof(T*));
     assert(err == cudaSuccess);
     if (err != cudaSuccess) {
         cudaFree(base);            // do not leak the element block
         return 0;
     }

     T ***level2 = (T ***)(ary + na);             // a*b entries, right after the a top-level ones
     T **level3  = (T **)(ary + na + na * nb);    // a*b*c entries, after level 2

     for (size_t i = 0; i < na; i++) {
         ary[i] = level2 + i * nb;
         for (size_t j = 0; j < nb; j++) {
             ary[i][j] = level3 + (i * nb + j) * nc;
             for (size_t k = 0; k < nc; k++)
                 ary[i][j][k] = base + ((i * nb + j) * nc + k) * nd;
         }
     }
     return ary;
 }
24  
// Frees a 4-D array whose element storage starts at ary[0][0][0] and whose
// pointer table starts at ary. A null ary is accepted and ignored.
template<typename T>
void free_4d_flat(T**** ary) {
    // Test the table before dereferencing it; reading ary[0][0][0] through a
    // null table would fault.
    if (!ary) return;
    cudaFree(ary[0][0][0]);   // element storage (cudaFree ignores a null pointer)
    cudaFree(ary);            // pointer table
}
30  
31  
// Writes to every element data[i][j][k][l] its row-major flat index
// ((i*b + j)*c + k)*d + l, i.e. 0, 1, 2, ... in i, j, k, l nesting order.
//
// Launch layout: any 1-D grid of 1-D blocks. Elements are distributed over the
// launched threads with a grid-stride loop, so <<<1, 1>>> fills the array
// serially and larger launches split the work without changing the result.
// Does nothing when any dimension is not positive.
template<typename T>
__global__ void fill(T**** data, int a, int b, int c, int d) {
    if (a <= 0 || b <= 0 || c <= 0 || d <= 0) return;

    const unsigned long long int total  = (unsigned long long int)a * b * c * d;
    const unsigned long long int stride = (unsigned long long int)gridDim.x * blockDim.x;

    for (unsigned long long int idx = (unsigned long long int)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total; idx += stride) {
        // Peel the flat index apart, innermost dimension first.
        const int l = (int)(idx % d);
        const int k = (int)((idx / d) % c);
        const int j = (int)((idx / ((unsigned long long int)d * c)) % b);
        const int i = (int)(idx / ((unsigned long long int)d * c * b));
        data[i][j][k][l] = idx;
    }
}
41  
// Prints the free and total byte counts reported by cudaMemGetInfo to stdout.
// If the query fails, the CUDA error text goes to stderr and no figures are
// printed.
void report_gpu_mem()
{
    size_t freeBytes = 0, totalBytes = 0;
    const cudaError_t err = cudaMemGetInfo(&freeBytes, &totalBytes);
    if (err != cudaSuccess) {
        std::cerr << "cudaMemGetInfo failed: " << cudaGetErrorString(err) << std::endl;
        return;
    }
    std::cout << "Free = " << freeBytes << " Total = " << totalBytes << std::endl;
}
48  
// Demo driver: allocates a 64 x 63 x 62 x 5 managed array, fills it on the
// device with a single thread, then checks on the host that element i of the
// flat storage holds the value i.
// Returns 0 when every element matches, -1 on allocation failure, CUDA error
// or the first mismatch. The array is freed on every path after allocation.
int main() {
    const int dimA = 64, dimB = 63, dimC = 62, dimD = 5;

    report_gpu_mem();

    std::cout << "allocating..." << std::endl;
    unsigned long long int ****data2 =
        create_4d_flat<unsigned long long int>(dimA, dimB, dimC, dimD);
    if (!data2) {
        std::cerr << "allocation failed" << std::endl;
        return -1;
    }

    report_gpu_mem();

    fill<<<1, 1>>>(data2, dimA, dimB, dimC, dimD);
    // Launch-configuration errors surface through cudaGetLastError; faults
    // inside the kernel surface at the synchronisation.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    assert(err == cudaSuccess);
    if (err != cudaSuccess) {       // still checked when NDEBUG removes the assert
        std::cerr << "fill failed: " << cudaGetErrorString(err) << std::endl;
        free_4d_flat(data2);
        return -1;
    }

    std::cout << "validating..." << std::endl;
    const unsigned long long int *flat = data2[0][0][0];   // start of the contiguous elements
    const unsigned long long int count =
        (unsigned long long int)dimA * dimB * dimC * dimD;
    int status = 0;
    for (unsigned long long int i = 0; i < count; i++) {
        if (flat[i] != i) {
            std::cout << "mismatch at " << i << " was " << flat[i] << std::endl;
            status = -1;
            break;
        }
    }

    free_4d_flat(data2);
    return status;
}

 

以上是关于CUDA中使用多维数组的主要内容,如果未能解决你的问题,请参考以下文章

在 VS2010 中使用 Nvidia NSight 进行 CUDA 性能分析 - 时间线上的片段

RuntimeError: ‘lengths’ argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor(代码片段

使用 ReactJS 显示多维数组

循环多维数组以生成谷歌图表的多维数组

在循环内将数组添加到多维数组

使用foreach打印多维数组[重复]