0_Simple__cppIntegration
Posted 爨爨爨好
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了0_Simple__cppIntegration相关的知识,希望对你有一定的参考价值。
引用已经编好的 .cu 和 .cpp 代码来混合使用。在 main.cpp 中调用了使用GPU的 cppIntegration.cu (测试函数也在其中) 和使用CPU的 cppIntegration_gold.cpp 。计算的内容是将加密过的字符串 "hello world" 解密并进行显示。
源代码:
/* cppIntegration.cu */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
// Arguments and the whole expansion are parenthesized so the macro stays
// correct when used with compound expressions or inside a larger expression.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

// Subtracts 10 from each of the four bytes packed into `packed` and returns
// the re-packed word; every byte keeps its original position.
// The arithmetic is done on unsigned values and each result is masked to
// 8 bits, so a byte below 10 (borrow) or at/above 0x80 (sign bit) wraps
// modulo 256 on its own instead of spilling into the neighbouring bytes.
static __device__ __forceinline__ int subtractTenFromEachByte(int packed)
{
    const unsigned int bits = static_cast<unsigned int>(packed);
    unsigned int result = 0u;

    for (unsigned int shift = 0u; shift < 32u; shift += 8u)
    {
        // The low 8 bits of (x - 10) depend only on the low 8 bits of x.
        result |= (((bits >> shift) - 10u) & 0xffu) << shift;
    }

    return static_cast<int>(result);
}

// GPU computation on int-typed input.
// Launch layout: indexes with threadIdx.x only, so it expects a single 1-D
// block with one thread per int element of g_data. No shared memory is used.
// Each thread rewrites its own element in place.
__global__ void kernel(int *g_data)
{
    const unsigned int tid = threadIdx.x;
    g_data[tid] = subtractTenFromEachByte(g_data[tid]);
}

// GPU computation on int2-typed input.
// Launch layout: indexes with threadIdx.x only, so it expects a single 1-D
// block with one thread per int2 element of g_data. No shared memory is used.
// Each thread stores x - y into x; y is left untouched.
__global__ void kernel2(int2 *g_data)
{
    const unsigned int tid = threadIdx.x;
    const int2 data = g_data[tid];

    g_data[tid].x = data.x - data.y;
}

// Runs both kernels on the GPU, computes the same results on the CPU with
// computeGold / computeGold2, and compares them.
//
// argc, argv : forwarded to findCudaDevice.
// data       : len chars; overwritten with the GPU result of `kernel`.
// data_int2  : len int2 values; overwritten with the GPU result of `kernel2`.
// len        : element count of both arrays; must be a multiple of 4 because
//              `kernel` consumes the chars four at a time as ints.
//
// Returns true when every GPU element equals its CPU reference, false on a
// mismatch or when the host reference buffers cannot be allocated.
// CUDA runtime calls are wrapped in checkCudaErrors from helper_cuda.h, which
// is expected to report the failure and abort (confirm against that header).
extern "C" bool runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
{
    findCudaDevice(argc, argv);

    // Validate the precondition before deriving the launch size from it.
    assert(0 == (len % 4));
    const unsigned int num_threads = len / 4;
    const unsigned int mem_size = sizeof(char) * len;
    const unsigned int mem_size_int2 = sizeof(int2) * len;

    // Device copies of both inputs.
    char *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));

    int2 *d_data_int2 = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2, cudaMemcpyHostToDevice));

    // One block each: len / 4 threads for the int view, len threads for int2.
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);
    dim3 threads2(len, 1, 1);
    kernel<<<grid, threads>>>((int *)d_data);
    kernel2<<<grid, threads2>>>(d_data_int2);

    // Kernel launches return no status; query the runtime for launch errors.
    getLastCudaError("Kernel execution failed");

    // CPU reference results, computed from the still-unmodified host inputs.
    char *reference = (char *)malloc(mem_size);
    int2 *reference2 = (int2 *)malloc(mem_size_int2);
    if (reference == NULL || reference2 == NULL)
    {
        fprintf(stderr, "runTest: failed to allocate host reference buffers\n");
        free(reference);
        free(reference2);
        checkCudaErrors(cudaFree(d_data));
        checkCudaErrors(cudaFree(d_data_int2));
        return false;
    }
    computeGold(reference, data, len);
    computeGold2(reference2, data_int2, len);

    // Copy the GPU results back over the host inputs.
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2, cudaMemcpyDeviceToHost));

    // Compare GPU output against the CPU reference, element by element.
    bool success = true;
    for (unsigned int i = 0; i < len; i++)
    {
        if (reference[i] != data[i] ||
            reference2[i].x != data_int2[i].x ||
            reference2[i].y != data_int2[i].y)
        {
            success = false;
            break; // one mismatch already decides the result
        }
    }

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);
    return success;
}
1 /*cppIntegration_gold.cpp*/ 2 #include <vector_types.h> 3 4 extern "C" void computeGold(char *reference, char *idata, const unsigned int len); 5 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len); 6 7 // CPU上的运算 8 void computeGold(char *reference, char *idata, const unsigned int len) 9 { 10 for (unsigned int i = 0; i < len; ++i) 11 reference[i] = idata[i] - 10; 12 } 13 14 void computeGold2(int2 *reference, int2 *idata, const unsigned int len) 15 { 16 for (unsigned int i = 0; i < len; ++i) 17 { 18 reference[i].x = idata[i].x - idata[i].y; 19 reference[i].y = idata[i].y; 20 } 21 }
1 /*main.cpp*/ 2 #include <iostream> 3 #include <stdlib.h> 4 #include <cuda_runtime.h> 5 #include <vector_types.h> 6 #include <helper_cuda.h> 7 8 extern "C" bool runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len); 9 10 int main(int argc, char **argv) 11 { 12 13 int len = 16; 14 char str[] = { 82, 111, 118,118,121, 42, 97, 121, 15 124, 118, 110, 56, 10, 10, 10, 10}; 16 17 int2 i2[16];// 使用cuda内置的int2类型 18 for (int i = 0; i < len; i++) 19 { 20 i2[i].x = str[i]; 21 i2[i].y = 10; 22 } 23 24 // 使用不同的方法计算 25 bool bTestResult; 26 bTestResult = runTest(argc, (const char **)argv, str, i2, len); 27 28 // 检查 str 和i2 两种输出中的结果 29 std::cout << str << std::endl; 30 char str_device[16]; 31 for (int i = 0; i < len; i++) 32 str_device[i] = (char)(i2[i].x); 33 std::cout << str_device << std::endl; 34 35 getchar(); 36 exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); 37 }
▶ 输出结果:
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 Hello World. Hello World.
▶ 涨姿势:
● cuda内置的int2类型,相当于一个整数的有序对。涉及的定义如下:
1 #define __cuda_builtin_vector_align8(tag, members) 2 struct __device_builtin__ __align__(8) tag 3 { 4 members 5 } 6 7 __cuda_builtin_vector_align8(int2, int x; int y;); 8 9 typedef __device_builtin__ struct int2 int2;
● 警告函数和错误检查函数
1 #define assert(expression) (void) 2 ( 3 (!!(expression)) || (_wassert(_CRT_WIDE(#expression), _CRT_WIDE(__FILE__), (unsigned)(__LINE__)), 0) 4 ) 5 6 #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) 7 inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) 8 { 9 cudaError_t err = cudaGetLastError(); 10 if (cudaSuccess != err) 11 { 12 fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", 13 file, line, errorMessage, (int)err, cudaGetErrorString(err)); 14 DEVICE_RESET 15 exit(EXIT_FAILURE); 16 } 17 }
以上是关于0_Simple__cppIntegration的主要内容,如果未能解决你的问题,请参考以下文章