0_Simple__cudaOpenMP

Posted 爨爨爨好

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了0_Simple__cudaOpenMP相关的知识,希望对你有一定的参考价值。

本文是 OpenMP 与 CUDA 配合使用的简单示例:在 OpenMP 多线程程序中,各 CPU 线程分别调用 CUDA 进行计算。

▶ 源代码:

 1 #include <omp.h>
 2 #include <stdio.h>
 3 #include "device_launch_parameters.h"
 4 #include <helper_cuda.h>
 5 
 6 using namespace std;
 7 
 // Adds the constant b to one element of g_a per thread.
 //
 // Each thread updates the element at its flat 1D global index
 // (blockIdx.x * blockDim.x + threadIdx.x). There is no bounds guard, so the
 // launch must not create more threads than g_a has elements.
 //
 //   g_a  device pointer to the int array that is updated in place
 //   b    value added to every addressed element
 __global__ void kernelAddConstant(int *g_a, const int b)
 {
     const int element = threadIdx.x + blockDim.x * blockIdx.x;
     g_a[element] = g_a[element] + b;
 }
13 
// Host-side verification of the GPU result.
//
// Checks that data[i] == i + b for every i in [0, n).
//
//   data  host array to verify
//   n     number of elements to check
//   b     constant that was expected to be added to each element
//
// Returns 1 when every checked element matches (including when n <= 0),
// and 0 at the first mismatch.
int correctResult(int *data, const int n, const int b)
{
    int pos = 0;
    // Advance while the elements match; stop early at the first mismatch.
    while (pos < n && data[pos] == pos + b)
        ++pos;
    return (pos < n) ? 0 : 1;
}
23 
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess and false otherwise, so calls can be
// chained with && and the chain stops at the first failure.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Entry point of the OpenMP + CUDA example.
//
// Lists the CUDA devices, fills a host array with a[i] = i, then starts one
// OpenMP thread per GPU. Each thread binds to a device, copies its contiguous
// slice of the array to that device, runs kernelAddConstant on it and copies
// the slice back. Finally the whole array is verified with correctResult.
//
// Returns EXIT_SUCCESS when every CUDA call succeeded and the result is
// correct, EXIT_FAILURE otherwise (1 when no device or no host memory is
// available).
int main(int argc, char *argv[])
{
    // Detect the available CUDA GPUs and print their names.
    int num_gpus = 0;
    printf("%s Starting...\n\n", argv[0]);
    if (!cudaCallOk(cudaGetDeviceCount(&num_gpus), "cudaGetDeviceCount") || num_gpus < 1)
    {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for (int i = 0; i < num_gpus; i++)
    {
        cudaDeviceProp dprop;
        if (cudaCallOk(cudaGetDeviceProperties(&dprop, i), "cudaGetDeviceProperties"))
            printf("   %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");

    // Set up the host data: 8192 elements per GPU, a[i] = i.
    const unsigned int n = num_gpus * 8192;
    const unsigned int nbytes = n * sizeof(int);
    const int b = 3;
    int *a = (int *)malloc(nbytes);
    if (0 == a)
    {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }
    for (unsigned int i = 0; i < n; i++)
        a[i] = i;

    // Request one CPU thread per GPU. More threads would also work (e.g.
    // 2 * num_gpus) because devices are assigned with a modulo below.
    omp_set_num_threads(num_gpus);

    // Number of threads that hit a CUDA error. The CUDA runtime tracks the
    // last error per host thread, so each thread must check its own calls;
    // the per-thread counts are summed when the parallel region ends.
    int failed_threads = 0;

    #pragma omp parallel reduction(+ : failed_threads)
    {
        const unsigned int cpu_thread_id = omp_get_thread_num();    // index of this thread
        const unsigned int num_cpu_threads = omp_get_num_threads(); // threads actually running

        // Split the n elements into num_cpu_threads contiguous slices. The
        // runtime may grant fewer threads than requested, so n need not be
        // evenly divisible: the first (n % num_cpu_threads) threads take one
        // extra element so that every element is covered exactly once.
        const unsigned int base = n / num_cpu_threads;
        const unsigned int extra = n % num_cpu_threads;
        const unsigned int count = base + (cpu_thread_id < extra ? 1 : 0);
        const unsigned int first = cpu_thread_id * base + (cpu_thread_id < extra ? cpu_thread_id : extra);
        int *sub_a = a + first;
        const size_t slice_bytes = (size_t)count * sizeof(int);

        // kernelAddConstant has no bounds guard, so the device buffer is
        // rounded up to a whole number of blocks; only the first `count`
        // elements are copied in and out.
        const dim3 gpu_threads(128);
        const dim3 gpu_blocks((count + gpu_threads.x - 1) / gpu_threads.x);
        const size_t device_bytes = (size_t)gpu_blocks.x * gpu_threads.x * sizeof(int);

        int gpu_id = -1;
        int *d_a = 0;

        // The modulo lets several CPU threads share one GPU.
        bool ok = cudaCallOk(cudaSetDevice(cpu_thread_id % num_gpus), "cudaSetDevice") &&
                  cudaCallOk(cudaGetDevice(&gpu_id), "cudaGetDevice");
        if (ok)
        {
            printf("CPU thread %u (of %u) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
            // The memset zeroes the padding elements so the kernel never
            // reads uninitialized device memory.
            ok = cudaCallOk(cudaMalloc((void **)&d_a, device_bytes), "cudaMalloc") &&
                 cudaCallOk(cudaMemset(d_a, 0, device_bytes), "cudaMemset") &&
                 cudaCallOk(cudaMemcpy(d_a, sub_a, slice_bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
        }
        if (ok)
        {
            kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
            // cudaGetLastError reports launch failures; the blocking copy
            // back then surfaces any error raised while the kernel ran.
            ok = cudaCallOk(cudaGetLastError(), "kernelAddConstant launch") &&
                 cudaCallOk(cudaMemcpy(sub_a, d_a, slice_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
        }
        if (d_a && !cudaCallOk(cudaFree(d_a), "cudaFree"))
            ok = false;

        if (!ok)
            failed_threads += 1;
    }
    printf("---------------------------\n");

    // Verify the result; any CUDA failure in any thread also fails the run.
    const bool bResult = (failed_threads == 0) && correctResult(a, n, b);
    free(a); // free CPU memory

    getchar(); // keep the console window open until a key is pressed
    return bResult ? EXIT_SUCCESS : EXIT_FAILURE;
}

 

▶ 输出结果:

D:\Code\CUDA\ProjectTemp\x64\Debug\ProjectTemp.exe Starting...

number of host CPUs:    8
number of CUDA devices: 1
   0: GeForce GTX 1070
---------------------------
CPU thread 0 (of 1) uses CUDA device 0
---------------------------

 

▶ 涨姿势

● 使用OpenMP的简单范例:

需要头文件 #include <omp.h> 

使用并行的代码块

1 omp_set_num_threads(threads);// set the number of threads
2 #pragma omp parallel
3 {
4     unsigned int cpu_thread_id = omp_get_thread_num();   // get the index of the current thread
5     unsigned int num_cpu_threads = omp_get_num_threads();// get the total number of threads
6     ...
7 }

涉及的函数:

1 _OMPIMP void _OMPAPI omp_set_num_threads(int _Num_threads); // set the number of threads
2 
3 _OMPIMP int _OMPAPI omp_get_thread_num(void);               // get the index of the current thread
4 
5 _OMPIMP int _OMPAPI omp_get_num_threads(void);              // get the total number of threads

 

以上是关于0_Simple__cudaOpenMP的主要内容,如果未能解决你的问题,请参考以下文章

0_Simple__asyncAPI

0_Simple__cppIntegration

0_Simple__cppOverload

0_Simple__simpleCallback

0_Simple__cdpSimpleQuicksort

0_Simple__clock