0_Simple__cppIntegration

Posted 爨爨爨好

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了0_Simple__cppIntegration相关的知识,希望对你有一定的参考价值。

将已经编写好的 .cu 和 .cpp 代码混合使用。main.cpp 调用 cppIntegration.cu 中的测试函数 runTest,该函数既在 GPU 上运行核函数,又调用 cppIntegration_gold.cpp 中的 CPU 版本作为对照。计算的内容是将加密过的字符串 "Hello World."(每个字符都加了 10)解密并进行显示。

源代码:

 1 /*cppIntegration.cu*/
 2 #include <stdlib.h>
 3 #include <stdio.h>
 4 #include <string.h>
 5 #include <math.h>
 6 #include <assert.h>
 7 #include <cuda_runtime.h>
 8 #include <helper_cuda.h>
 9 #include <helper_functions.h>
10 
11 #ifndef MAX
12 #define MAX(a,b) (a > b ? a : b)
13 #endif
14 
15 extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
16 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);
17 
18 // GPU上的运算
// Decrypts four packed characters per thread by subtracting 10 from each byte
// of one 32-bit word, in place. Every byte wraps modulo 256 on its own.
//
// Launch layout: the word index is threadIdx.x only, so the kernel expects a
// single 1-D block with one thread per word of g_data.
// Precondition: g_data holds at least blockDim.x ints; there is no bounds check.
__global__ void kernel(int *g_data)
{
    const unsigned int tid = threadIdx.x;

    // Work on the unsigned bit pattern. With a signed int, a byte outside
    // [10, 127] sign-extends (or goes negative after the subtraction) and the
    // stray high bits get OR-ed into the neighbouring bytes of the result.
    const unsigned int word = static_cast<unsigned int>(g_data[tid]);

    // Isolate each byte, subtract 10, and mask the difference back to 8 bits
    // so a borrow can never leak into another byte.
    const unsigned int b3 = (((word >> 24) & 0xFFu) - 10u) & 0xFFu;
    const unsigned int b2 = (((word >> 16) & 0xFFu) - 10u) & 0xFFu;
    const unsigned int b1 = (((word >>  8) & 0xFFu) - 10u) & 0xFFu;
    const unsigned int b0 = (( word        & 0xFFu) - 10u) & 0xFFu;

    // Reassemble the four decrypted bytes in their original positions.
    g_data[tid] = static_cast<int>((b3 << 24) | (b2 << 16) | (b1 << 8) | b0);
}
30 
// Subtracts each element's y offset from its x value, in place; y is left
// unchanged.
//
// Launch layout: the element index is threadIdx.x only, so the kernel expects
// a single 1-D block with one thread per int2 element. There is no bounds
// check, so g_data must hold at least blockDim.x elements.
__global__ void kernel2(int2 *g_data)
{
    int2 *const slot = g_data + threadIdx.x;
    const int2 pair = *slot;    // read both components once

    slot->x = pair.x - pair.y;
}
39 
// Decrypts both inputs on the GPU, recomputes them on the CPU and compares.
//
// argc, argv : command line, forwarded to findCudaDevice().
// data       : len encrypted chars; overwritten with the GPU result.
// data_int2  : len (value, offset) pairs; overwritten with the GPU result.
// len        : element count of both arrays. Must be a multiple of 4 because
//              kernel() handles four chars per thread. kernel2() is launched
//              with len threads in a single block, so len is also bounded by
//              the device's per-block thread limit.
//
// Returns true when every GPU result equals its CPU reference (trivially true
// for len == 0); false on a mismatch, a null input, a len that is not a
// multiple of 4, or a failed host allocation. CUDA failures are reported by
// checkCudaErrors()/getLastCudaError() from helper_cuda.h.
extern "C" bool runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
{
    findCudaDevice(argc, (const char **)argv);

    // kernel() consumes four chars per thread, so a ragged tail would be left
    // encrypted. The assert documents the contract; the explicit test keeps
    // it enforced when NDEBUG compiles the assert away.
    assert(0 == (len % 4));
    if (data == NULL || data_int2 == NULL || 0 != (len % 4))
        return false;

    // A zero-thread launch is an invalid configuration, and with nothing to
    // decrypt there is nothing that could mismatch.
    if (len == 0)
        return true;

    const unsigned int num_threads = len / 4;
    const size_t mem_size = sizeof(char) * len;
    const size_t mem_size_int2 = sizeof(int2) * len;

    // Compute the CPU references first, while data / data_int2 still hold the
    // encrypted input (the device-to-host copies below overwrite them).
    char *reference = (char *)malloc(mem_size);
    int2 *reference2 = (int2 *)malloc(mem_size_int2);
    if (reference == NULL || reference2 == NULL)
    {
        fprintf(stderr, "runTest: host allocation failed\n");
        free(reference);
        free(reference2);
        return false;
    }
    computeGold(reference, data, len);
    computeGold2(reference2, data_int2, len);

    // Stage both inputs on the device.
    char *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));
    int2 *d_data_int2 = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2, cudaMemcpyHostToDevice));

    // One block each: kernel() gets one thread per 4-char word, kernel2() one
    // thread per int2 element.
    const dim3 grid(1, 1, 1);
    const dim3 threads(num_threads, 1, 1);
    const dim3 threads2(len, 1, 1);
    kernel<<<grid, threads>>>((int *)d_data);
    kernel2<<<grid, threads2>>>(d_data_int2);

    // Kernel launches return no status; pick up launch errors here.
    getLastCudaError("Kernel execution failed");

    // Copy the GPU results back over the caller's buffers.
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2, cudaMemcpyDeviceToHost));

    // Compare GPU output against the CPU references; stop at the first mismatch.
    bool success = true;
    for (unsigned int i = 0; i < len && success; i++)
    {
        success = reference[i] == data[i] &&
                  reference2[i].x == data_int2[i].x &&
                  reference2[i].y == data_int2[i].y;
    }

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);
    return success;
}
 1 /*cppIntegration_gold.cpp*/
 2 #include <vector_types.h>
 3 
 4 extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
 5 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);
 6 
 7 // CPU上的运算
 8 void computeGold(char *reference, char *idata, const unsigned int len)
 9 {
10     for (unsigned int i = 0; i < len; ++i)
11         reference[i] = idata[i] - 10;
12 }
13 
14 void computeGold2(int2 *reference, int2 *idata, const unsigned int len)
15 {
16     for (unsigned int i = 0; i < len; ++i)
17     {
18         reference[i].x = idata[i].x - idata[i].y;
19         reference[i].y = idata[i].y;
20     }
21 }
 1 /*main.cpp*/
 2 #include <iostream>
 3 #include <stdlib.h>
 4 #include <cuda_runtime.h>
 5 #include <vector_types.h>
 6 #include <helper_cuda.h>
 7 
 8 extern "C" bool runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len);
 9 
10 int main(int argc, char **argv)
11 {
12 
13     int len = 16;
14     char str[] = { 82, 111, 118,118,121, 42, 97, 121, 
15                   124, 118, 110, 56, 10, 10, 10,  10};
16 
17     int2 i2[16];// 使用cuda内置的int2类型
18     for (int i = 0; i < len; i++)
19     {
20         i2[i].x = str[i];
21         i2[i].y = 10;
22     }
23 
24     // 使用不同的方法计算
25     bool bTestResult;
26     bTestResult = runTest(argc, (const char **)argv, str, i2, len);
27     
28     // 检查 str 和i2 两种输出中的结果
29     std::cout << str << std::endl;
30     char str_device[16];
31     for (int i = 0; i < len; i++)
32         str_device[i] = (char)(i2[i].x);
33     std::cout << str_device << std::endl;
34 
35     getchar();
36     exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
37 }

 

▶ 输出结果:

GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

Hello World.
Hello World.

 

▶ 涨姿势:

● cuda内置的int2类型,相当于一个整数的有序对。涉及的定义如下:

1 #define __cuda_builtin_vector_align8(tag, members) \
2     struct __device_builtin__ __align__(8) tag     \
3     {                                              \
4         members                                    \
5     }
6 
7 __cuda_builtin_vector_align8(int2, int x; int y;);
8 
9 typedef __device_builtin__ struct int2 int2;

 

● 警告函数和错误检查函数

 1 #define assert(expression) (void)                                                                    \
 2 (                                                                                                    \
 3 (!!(expression)) || (_wassert(_CRT_WIDE(#expression), _CRT_WIDE(__FILE__), (unsigned)(__LINE__)), 0) \
 4 )
 5 
 6 #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
 7 inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
 8 {
 9     cudaError_t err = cudaGetLastError();
10     if (cudaSuccess != err)
11     {
12         fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
13             file, line, errorMessage, (int)err, cudaGetErrorString(err));
14         DEVICE_RESET
15         exit(EXIT_FAILURE);
16     }
17 }

 

以上是关于0_Simple__cppIntegration的主要内容,如果未能解决你的问题,请参考以下文章

0_Simple__asyncAPI

0_Simple__cppOverload

0_Simple__cudaOpenMP

0_Simple__simpleCallback

0_Simple__cdpSimpleQuicksort

0_Simple__clock