CUDA中使用多维数组
Posted jourluohua
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CUDA中使用多维数组相关的知识,希望对你有一定的参考价值。
今天想起一个问题,看到的绝大多数CUDA代码都是使用的一维数组,那么是否可以在CUDA中使用多维数组呢?为此想了各种办法,被各种77的错误状态码和段错误折磨,最后发现有一个cudaMallocManaged函数,这个函数可以很好地组织多维数组的多重指针的形式。后来发现,这个问题之前在Stack Overflow中就有很好的解决。先贴一下我自己的代码实现:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Number of elements in each input vector and in each row of the 2-row output.
const int arraySize = 5;

cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);

// Element-wise vector add that writes the same sums into both rows of a
// two-row pointer table.
//
// Launch layout: one block of 2 * arraySize threads, indexed by threadIdx.x.
//   threads [0, arraySize)               -> c[0][i]             = a[i] + b[i]
//   threads [arraySize, 2 * arraySize)   -> c[1][i - arraySize] = same sums
// Threads beyond 2 * arraySize do nothing.
//
// c must be readable from device code and hold two device-accessible row
// pointers, each with room for arraySize ints; a and b must hold arraySize ints.
__global__ void addKernel(int **c, const int *a, const int *b)
{
    int i = threadIdx.x;
    if (i < arraySize)
        c[0][i] = a[i] + b[i];
    else if (i < 2 * arraySize)  // guard against an oversized launch
        c[1][i - arraySize] = a[i - arraySize] + b[i - arraySize];
}

int main()
{
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset (the replacement for the deprecated cudaThreadExit) must
    // be called before exiting in order for profiling and tracing tools such as
    // Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Copies a and b (size ints each) to the device, runs addKernel with a
// two-row pointer table as output, and copies row 1 of that table back into c.
//
// size must equal arraySize because addKernel hard-codes arraySize as the row
// length; any other value is rejected with cudaErrorInvalidValue.
//
// Returns cudaSuccess, or the first CUDA error encountered. Every device
// allocation made here is released before returning, on success and on error.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
    // All pointers start at 0 so the cleanup path can free them unconditionally
    // (cudaFree on a null pointer is a no-op).
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c0 = 0;   // contiguous storage backing both output rows
    int **dev_c = 0;   // managed table of two row pointers into dev_c0
    cudaError_t cudaStatus;

    if (size != (size_t)arraySize) {
        fprintf(stderr, "addWithCuda: size must equal arraySize!");
        cudaStatus = cudaErrorInvalidValue;
        goto Error;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // The row-pointer table lives in managed memory so the host can fill it in
    // directly and the kernel can dereference it.
    cudaStatus = cudaMallocManaged(&dev_c, 2 * sizeof(int*));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed!");
        goto Error;
    }

    // One device buffer holds both output rows back to back.
    cudaStatus = cudaMalloc((void**)&dev_c0, size * sizeof(int) * 2);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    dev_c[0] = dev_c0;
    dev_c[1] = dev_c0 + size;

    // Allocate GPU buffers for the two input vectors.
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each output element
    // (two rows of size elements).
    addKernel<<<1, (unsigned int)(size * 2)>>>(dev_c, dev_a, dev_b);

    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution. It also has to complete before
    // the host reads the managed table below.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the second output row from the GPU buffer to host memory; the row
    // pointer is read back from the managed table on the host.
    cudaStatus = cudaMemcpy(c, dev_c[1], size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c0);
    cudaFree(dev_c);

    return cudaStatus;
}
在别人很好的代码面前就不露丑了,下面贴一下Stack Overflow上的代码,非常直观易懂:
// https://stackoverflow.com/questions/40388242/multidimensional-array-allocation-with-cuda-unified-memory-on-power-8

#include <iostream>
#include <assert.h>
#include <cstdlib>

// Prints a diagnostic and aborts when a CUDA runtime call did not succeed.
// Unlike assert(), this check is not compiled out when NDEBUG is defined.
static void check_cuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::abort();
    }
}

// Allocates an a x b x c x d array of T that can be indexed as ary[i][j][k][l].
//
// Two managed allocations are made:
//   * base: one flat buffer of a*b*c*d elements holding the data;
//   * ary:  one pointer table with three levels laid out back to back:
//           a entries (T***), then a*b entries (T**), then a*b*c entries (T*).
// The innermost pointers point into base in row-major order, so
// ary[0][0][0] is the start of the flat data buffer.
//
// Aborts if either allocation fails. Release with free_4d_flat().
template<typename T>
T**** create_4d_flat(int a, int b, int c, int d) {
    // Do all size/offset arithmetic in size_t so the products cannot
    // overflow int for large extents.
    const size_t na = a, nb = b, nc = c, nd = d;

    T *base = 0;
    check_cuda(cudaMallocManaged(&base, na * nb * nc * nd * sizeof(T)),
               "cudaMallocManaged (data)");

    T ****ary = 0;
    check_cuda(cudaMallocManaged(&ary, (na + na * nb + na * nb * nc) * sizeof(T*)),
               "cudaMallocManaged (pointer table)");

    for (size_t i = 0; i < na; i++) {
        // Level-2 pointers start right after the a level-1 entries.
        ary[i] = (T ***)((ary + na) + i * nb);
        for (size_t j = 0; j < nb; j++) {
            // Level-3 pointers start after the a + a*b entries above them.
            ary[i][j] = (T **)((ary + na + na * nb) + i * nb * nc + j * nc);
            for (size_t k = 0; k < nc; k++)
                ary[i][j][k] = base + ((i * nb + j) * nc + k) * nd;
        }
    }
    return ary;
}

// Releases an array created by create_4d_flat(). A null ary is ignored.
template<typename T>
void free_4d_flat(T**** ary) {
    // Test ary before dereferencing it: ary[0][0][0] is the flat data buffer.
    if (!ary) return;
    cudaFree(ary[0][0][0]);
    cudaFree(ary);
}

// Fills data[i][j][k][l] with consecutive values 0, 1, 2, ... in row-major
// order. The loops are sequential in every thread, so this is meant to be
// launched as <<<1, 1>>>.
template<typename T>
__global__ void fill(T**** data, int a, int b, int c, int d) {
    unsigned long long int val = 0;
    for (int i = 0; i < a; i++)
        for (int j = 0; j < b; j++)
            for (int k = 0; k < c; k++)
                for (int l = 0; l < d; l++)
                    data[i][j][k][l] = val++;
}

// Prints the free and total device memory reported by cudaMemGetInfo.
void report_gpu_mem()
{
    size_t free_bytes = 0, total_bytes = 0;
    check_cuda(cudaMemGetInfo(&free_bytes, &total_bytes), "cudaMemGetInfo");
    std::cout << "Free = " << free_bytes << " Total = " << total_bytes << std::endl;
}

// Allocates a 64 x 63 x 62 x 5 array, fills it on the device and validates on
// the host that the flat buffer holds 0, 1, 2, ... Returns 0 on success and -1
// on the first mismatch.
int main() {
    const int A = 64, B = 63, C = 62, D = 5;

    report_gpu_mem();

    unsigned long long int ****data2;
    std::cout << "allocating..." << std::endl;
    data2 = create_4d_flat<unsigned long long int>(A, B, C, D);

    report_gpu_mem();

    fill<<<1, 1>>>(data2, A, B, C, D);
    check_cuda(cudaGetLastError(), "fill launch");
    // The host may only touch the managed buffers once the kernel has finished.
    check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    std::cout << "validating..." << std::endl;
    const unsigned long long int *flat = data2[0][0][0];
    const unsigned long long int total = (unsigned long long int)A * B * C * D;
    int status = 0;
    for (unsigned long long int i = 0; i < total; i++) {
        if (flat[i] != i) {
            std::cout << "mismatch at " << i << " was " << flat[i] << std::endl;
            status = -1;
            break;
        }
    }

    // Free on both the success and the mismatch path.
    free_4d_flat(data2);
    return status;
}
以上是关于CUDA中使用多维数组的主要内容,如果未能解决你的问题,请参考以下文章
在 VS2010 中使用 Nvidia NSight 进行 CUDA 性能分析 - 时间线上的片段
RuntimeError: ‘lengths’ argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor(代码片段