0_Simple__cppOverload
Posted by 爨爨爨好
This sample uses the CUDA built-in struct cudaFuncAttributes to inspect a kernel's shared-memory usage and register count.
▶ Source code:
/* cppOverload_kernel.cuh */
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    __shared__ int sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    __syncthreads();
    pOut[tid] = sData[threadIdx.x] * a + tid;
}

__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    __syncthreads();
    pOut[tid] = (sData[threadIdx.x].x + sData[threadIdx.x].y) * a + tid;
}

__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int sData1[THREAD_N];
    __shared__ int sData2[THREAD_N];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;

    sData1[threadIdx.x] = pIn1[tid];
    sData2[threadIdx.x] = pIn2[tid];
    __syncthreads();
    pOut[tid] = (sData1[threadIdx.x] + sData2[threadIdx.x]) * a + tid;
}
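In host code, the overload to launch is selected by the declared type of the function pointer that simple_kernel is assigned to (see cppOverload.cu below). For illustration only, and not part of the original sample, the same overload can also be picked explicitly with a static_cast:

// Illustrative alternative: resolve the overload explicitly in the cast.
void (*f)(const int *, int *, int) =
    static_cast<void (*)(const int *, int *, int)>(simple_kernel);
f<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);  // launches overload 1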
/* cppOverload.cu */
#include <stdio.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include <helper_math.h>

#define THREAD_N 256
#define N 1024
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))

// THREAD_N must be defined before the kernels are included
#include "cppOverload_kernel.cuh"

#define OUTPUT_ATTR(attr)                                            \
    printf("Shared Size: %d\n", (int)attr.sharedSizeBytes);          \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);         \
    printf("Local Size: %d\n", (int)attr.localSizeBytes);            \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);  \
    printf("Number of Registers: %d\n", attr.numRegs);               \
    printf("PTX Version: %d\n", attr.ptxVersion);                    \
    printf("Binary Version: %d\n", attr.binaryVersion);

const char *sampleName = "C++ Function Overloading";

// CPU reference check for overload 1
bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        int cpuRes = hInput[i] * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

// CPU reference check for overload 2
bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        int cpuRes = (hInput[i].x + hInput[i].y) * a + i;
        if (hOutput[i] != cpuRes)
            return false;
    }
    return true;
}

// CPU reference check for overload 3
bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i)
            return false;
    }
    return true;
}

int main(int argc, const char *argv[])
{
    int *hInput  = NULL;
    int *hOutput = NULL;
    int *dInput  = NULL;
    int *dOutput = NULL;

    printf("%s starting...\n", sampleName);

    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    printf("DevicecheckCudaErrors Count: %d\n", deviceCount);
    int deviceID = findCudaDevice(argc, argv);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, deviceID);
    if (prop.major < 2)
    {
        printf("ERROR: cppOverload requires GPU devices with compute SM 2.0 or higher.\n");
        printf("Current GPU device has compute SM%d.%d, Exiting...", prop.major, prop.minor);
        exit(EXIT_WAIVED);
    }
    cudaSetDevice(deviceID);

    // dInput holds 2*N ints so overload 3 can use two N-element input arrays
    cudaMalloc(&dInput , sizeof(int) * N * 2);
    cudaMalloc(&dOutput, sizeof(int) * N);
    cudaMallocHost(&hInput , sizeof(int) * N * 2);
    cudaMallocHost(&hOutput, sizeof(int) * N);
    for (int i = 0; i < N * 2; i++)
        hInput[i] = i;
    cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice);

    bool testResult = true;
    bool funcResult = true;
    int a = 1;

    // Function pointers whose declared signatures select among the three overloads
    void (*func1)(const int *, int *, int);
    void (*func2)(const int2 *, int *, int);
    void (*func3)(const int *, const int *, int *, int);
    struct cudaFuncAttributes attr;

    // overload function 1
    func1 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared);
    cudaFuncGetAttributes(&attr, *func1);
    OUTPUT_ATTR(attr);
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost);
    funcResult = check_func1(hInput, hOutput, a);
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    // overload function 2
    func2 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared);
    cudaFuncGetAttributes(&attr, *func2);
    OUTPUT_ATTR(attr);
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
    cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost);
    funcResult = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    // overload function 3
    func3 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared);
    cudaFuncGetAttributes(&attr, *func3);
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
    cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost);
    funcResult = check_func3(&hInput[0], &hInput[N], hOutput, a);
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    cudaFree(dInput);
    cudaFree(dOutput);
    cudaFreeHost(hOutput);
    cudaFreeHost(hInput);

    cudaDeviceSynchronize();

    getchar();
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
▶ Output:
C++ Function Overloading starting...
DevicecheckCudaErrors Count: 1
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

Shared Size: 1024
Constant Size: 0
Local Size: 0
Max Threads Per Block: 1024
Number of Registers: 13
PTX Version: 50
Binary Version: 61
simple_kernel(const int *pIn, int *pOut, int a) PASSED

Shared Size: 2048
Constant Size: 0
Local Size: 0
Max Threads Per Block: 1024
Number of Registers: 13
PTX Version: 50
Binary Version: 61
simple_kernel(const int2 *pIn, int *pOut, int a) PASSED

Shared Size: 2048
Constant Size: 0
Local Size: 0
Max Threads Per Block: 1024
Number of Registers: 15
PTX Version: 50
Binary Version: 61
simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED
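The shared-memory figures follow directly from the kernel declarations: overload 1 allocates THREAD_N * sizeof(int) = 256 * 4 = 1024 bytes, while overload 2 (256 * sizeof(int2) = 256 * 8) and overload 3 (2 * 256 * 4) each report 2048 bytes. Overload 3 also uses two more registers (15 vs. 13), presumably to handle the second input pointer.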
▶ Takeaways:
● Kernel code is kept in a header file with the .cuh extension.
● Definition of the CUDA built-in struct cudaFuncAttributes:
struct __device_builtin__ cudaFuncAttributes
{
    size_t sharedSizeBytes;    // shared memory size in bytes
    size_t constSizeBytes;     // constant memory size in bytes
    size_t localSizeBytes;     // local memory size in bytes
    int    maxThreadsPerBlock; // maximum number of threads per block
    int    numRegs;            // number of registers used
    int    ptxVersion;         // PTX version
    int    binaryVersion;      // binary (machine code) version
    int    cacheModeCA;        // whether compiled with -Xptxas --dlcm=ca
};
● Query a kernel's shared-memory and register usage through the CUDA built-in struct and runtime functions:
struct cudaFuncAttributes attr;
memset(&attr, 0, sizeof(attr));
cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);
cudaFuncGetAttributes(&attr, *function);
Functions involved:
extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

// Device-side weak stub from the CUDA headers; the actual query is done by the host runtime.
__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
    return cudaErrorUnknown;
}

#define OUTPUT_ATTR(attr)                                            \
    printf("Shared Size: %d\n", (int)attr.sharedSizeBytes);          \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);         \
    printf("Local Size: %d\n", (int)attr.localSizeBytes);            \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);  \
    printf("Number of Registers: %d\n", attr.numRegs);               \
    printf("PTX Version: %d\n", attr.ptxVersion);                    \
    printf("Binary Version: %d\n", attr.binaryVersion);
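For reference, a self-contained sketch that queries these attributes for a trivial kernel; the kernel name dummy_kernel and the figure in the comments are illustrative, not taken from the sample:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel(const int *pIn, int *pOut)
{
    __shared__ int sData[256];  // 256 * 4 = 1024 bytes of static shared memory
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    sData[threadIdx.x] = pIn[tid];
    __syncthreads();
    pOut[tid] = sData[threadIdx.x];
}

int main()
{
    cudaFuncAttributes attr;
    // cuda_runtime.h provides a templated overload that accepts the kernel directly.
    if (cudaFuncGetAttributes(&attr, dummy_kernel) != cudaSuccess) {
        fprintf(stderr, "cudaFuncGetAttributes failed\n");
        return 1;
    }
    printf("Shared Size: %d\n", (int)attr.sharedSizeBytes);  // expect 1024 here
    printf("Number of Registers: %d\n", attr.numRegs);
    return 0;
}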