cuda并行计算的几种模式

Posted Tequila Sunrise

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了cuda并行计算的几种模式相关的知识,希望对你有一定的参考价值。

  1 #include "cuda_runtime.h"
  2 #include "device_launch_parameters.h"
  3 #include <stdio.h>
  4 #include <time.h>
  5 #include <stdlib.h>
  6 
  7 #define MAX 120
  8 #define MIN 0
  9 
 10 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
 11         float* etime);
 12 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
 13         float* etime, int type);
 14 __global__ void addKernel(int *c, const int *a, const int *b) {
 15     int i = blockIdx.x;
 16     c[i] = a[i] + b[i];
 17 }
 18 
 19 __global__ void addKernelThread(int *c, const int *a, const int *b) {
 20     int i = threadIdx.x;
 21     c[i] = a[i] + b[i];
 22 }
 23 int main() {
 24     const int arraySize = 800;
 25     srand((unsigned) time(NULL));
 26     int a[arraySize] = { 1, 2, 3, 4, 5 };
 27     int b[arraySize] = { 10, 20, 30, 40, 50 };
 28 
 29     for (int i = 0; i < arraySize; i++) {
 30         a[i] = rand() % (MAX + 1 - MIN) + MIN;
 31         b[i] = rand() % (MAX + 1 - MIN) + MIN;
 32     }
 33     int c[arraySize] = { 0 };
 34     // Add vectors in parallel.
 35     cudaError_t cudaStatus;
 36     int num = 0;
 37     cudaDeviceProp prop;
 38     cudaStatus = cudaGetDeviceCount(&num);
 39     for (int i = 0; i < num; i++) {
 40         cudaGetDeviceProperties(&prop, i);
 41     }
 42 
 43     float time;
 44     cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
 45     printf("Elasped time of stream is : %f \\n", time);
 46     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\\n",
 47             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
 48             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
 49             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
 50             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
 51             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
 52     if (cudaStatus != cudaSuccess) {
 53         fprintf(stderr, "addWithCudaStream failed!");
 54         return 1;
 55     }
 56     cudaStatus = addWithCuda(c, a, b, arraySize, &time, 0);
 57     printf("Elasped time of Block is : %f \\n", time);
 58     if (cudaStatus != cudaSuccess) {
 59         fprintf(stderr, "addWithCudaStream failed!");
 60         return 1;
 61     }
 62     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\\n",
 63             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
 64             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
 65             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
 66             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
 67             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
 68 
 69     cudaStatus = addWithCuda(c, a, b, arraySize, &time, 1);
 70     printf("Elasped time of thread is : %f \\n", time);
 71     if (cudaStatus != cudaSuccess) {
 72         fprintf(stderr, "addWithCudaStream failed!");
 73         return 1;
 74     }
 75     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\\n",
 76             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
 77             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
 78             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
 79             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
 80             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
 81 
 82     cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
 83     printf("Elasped time of stream is : %f \\n", time);
 84     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\\n",
 85             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2],
 86             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0],
 87             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3],
 88             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1],
 89             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]);
 90     if (cudaStatus != cudaSuccess) {
 91         fprintf(stderr, "addWithCudaStream failed!");
 92         return 1;
 93     }
 94     // cudaThreadExit must be called before exiting in order for profiling and
 95     // tracing tools such as Nsight and Visual Profiler to show complete traces.
 96     cudaStatus = cudaThreadExit();
 97     if (cudaStatus != cudaSuccess) {
 98         fprintf(stderr, "cudaThreadExit failed!");
 99         return 1;
100     }
101     return 0;
102 }
103 // Helper function for using CUDA to add vectors in parallel.
104 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
105         float* etime) {
106     int *dev_a = 0;
107     int *dev_b = 0;
108     int *dev_c = 0;
109     clock_t start, stop;
110     float time;
111     cudaError_t cudaStatus;
112 
113     // Choose which GPU to run on, change this on a multi-GPU system.
114     cudaStatus = cudaSetDevice(0);
115     if (cudaStatus != cudaSuccess) {
116         fprintf(stderr,
117                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
118         goto Error;
119     }
120     // Allocate GPU buffers for three vectors (two input, one output)    .
121     cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
122     if (cudaStatus != cudaSuccess) {
123         fprintf(stderr, "cudaMalloc failed!");
124         goto Error;
125     }
126     cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
127     if (cudaStatus != cudaSuccess) {
128         fprintf(stderr, "cudaMalloc failed!");
129         goto Error;
130     }
131     cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
132     if (cudaStatus != cudaSuccess) {
133         fprintf(stderr, "cudaMalloc failed!");
134         goto Error;
135     }
136     // Copy input vectors from host memory to GPU buffers.
137     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
138             cudaMemcpyHostToDevice);
139     if (cudaStatus != cudaSuccess) {
140         fprintf(stderr, "cudaMemcpy failed!");
141         goto Error;
142     }
143     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
144             cudaMemcpyHostToDevice);
145     if (cudaStatus != cudaSuccess) {
146         fprintf(stderr, "cudaMemcpy failed!");
147         goto Error;
148     }
149     cudaStream_t stream[5];
150     for (int i = 0; i < 5; i++) {
151         cudaStreamCreate(&stream[i]);   //创建流
152     }
153     // Launch a kernel on the GPU with one thread for each element.
154     for (int i = 0; i < 5; i++) {
155         addKernel<<<1, 1, 0, stream[i]>>>(dev_c + i, dev_a + i, dev_b + i); //执行流
156     }
157     start = clock();
158     cudaDeviceSynchronize();
159     stop = clock();
160     time = (float) (stop - start) / CLOCKS_PER_SEC;
161     *etime = time;
162     // cudaThreadSynchronize waits for the kernel to finish, and returns
163     // any errors encountered during the launch.
164     cudaStatus = cudaThreadSynchronize();
165     if (cudaStatus != cudaSuccess) {
166         fprintf(stderr,
167                 "cudaThreadSynchronize returned error code %d after launching addKernel!\\n",
168                 cudaStatus);
169         goto Error;
170     }
171     // Copy output vector from GPU buffer to host memory.
172     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
173             cudaMemcpyDeviceToHost);
174     if (cudaStatus != cudaSuccess) {
175         fprintf(stderr, "cudaMemcpy failed!");
176         goto Error;
177     }
178     Error: for (int i = 0; i < 5; i++) {
179         cudaStreamDestroy(stream[i]);   //销毁流
180     }
181     cudaFree(dev_c);
182     cudaFree(dev_a);
183     cudaFree(dev_b);
184     return cudaStatus;
185 }
186 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
187         float * etime, int type) {
188     int *dev_a = 0;
189     int *dev_b = 0;
190     int *dev_c = 0;
191     clock_t start, stop;
192     float time;
193     cudaError_t cudaStatus;
194 
195     // Choose which GPU to run on, change this on a multi-GPU system.
196     cudaStatus = cudaSetDevice(0);
197     if (cudaStatus != cudaSuccess) {
198         fprintf(stderr,
199                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
200         goto Error;
201     }
202     // Allocate GPU buffers for three vectors (two input, one output)    .
203     cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
204     if (cudaStatus != cudaSuccess) {
205         fprintf(stderr, "cudaMalloc failed!");
206         goto Error;
207     }
208     cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
209     if (cudaStatus != cudaSuccess) {
210         fprintf(stderr, "cudaMalloc failed!");
211         goto Error;
212     }
213     cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
214     if (cudaStatus != cudaSuccess) {
215         fprintf(stderr, "cudaMalloc failed!");
216         goto Error;
217     }
218     // Copy input vectors from host memory to GPU buffers.
219     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
220             cudaMemcpyHostToDevice);
221     if (cudaStatus != cudaSuccess) {
222         fprintf(stderr, "cudaMemcpy failed!");
223         goto Error;
224     }
225     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
226             cudaMemcpyHostToDevice);
227     if (cudaStatus != cudaSuccess) {
228         fprintf(stderr, "cudaMemcpy failed!");
229         goto Error;
230     }
231 
232     if (type == 0) {
233         start = clock();
234         addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);
235     } else {
236         start = clock();
237         addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);
238     }
239     stop = clock();
240     time = (float) (stop - start) / CLOCKS_PER_SEC;
241     *etime = time;
242     // cudaThreadSynchronize waits for the kernel to finish, and returns
243     // any errors encountered during the launch.
244     cudaStatus = cudaThreadSynchronize();
245     if (cudaStatus != cudaSuccess) {
246         fprintf(stderr,
247                 "cudaThreadSynchronize returned error code %d after launching addKernel!\\n",
248                 cudaStatus);
249         goto Error;
250     }
251     // Copy output vector from GPU buffer to host memory.
252     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
253             cudaMemcpyDeviceToHost);
254     if (cudaStatus != cudaSuccess) {
255         fprintf(stderr, "cudaMemcpy failed!");
256         goto Error;
257     }
258     Error: cudaFree(dev_c);
259     cudaFree(dev_a);
260     cudaFree(dev_b);
261     return cudaStatus;
262 }
View Code

如上文的实现程序,使用了thread并行,block并行,stream并行三种,使用三种方法法进行了五次计算,发现stream第一次计算时会出错,调用的子程序没有变化,没有搞懂?

Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}

以上是关于cuda并行计算的几种模式的主要内容,如果未能解决你的问题,请参考以下文章

CUDA中的几种并行计算模型

并行计算程序设计(CUDA C)

Flink设置并行度的几种方式

Laravel:如何在控制器的几种方法中重用代码片段

如何使用CUDA并行化嵌套for循环以在2D数组上执行计算

使用 OpenMP 在 C、C++ 中并行化嵌套 for 循环的几种方法之间的区别