0_Simple__cppOverload

Posted 爨爨爨好

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了0_Simple__cppOverload相关的知识,希望对你有一定的参考价值。

使用cuda内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量。

● 源代码:

 1 /*cppOverload_kernel.cuh*/
 2 __global__ void simple_kernel(const int *pIn, int *pOut, int a)
 3 {
 4     __shared__ int sData[THREAD_N];
 5     int tid = threadIdx.x + blockDim.x*blockIdx.x;
 6 
 7     sData[threadIdx.x] = pIn[tid];
 8     __syncthreads();
 9     pOut[tid] = sData[threadIdx.x]*a + tid;
10 }
11 
// Overload 2: input viewed as int2 pairs; pOut[i] = (pIn[i].x + pIn[i].y) * a + i.
// NOTE(review): same launch contract as overload 1 — no bounds guard,
// blockDim.x must equal THREAD_N.
__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    __shared__ int2 tile[THREAD_N];
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    tile[threadIdx.x] = pIn[gid];
    __syncthreads();
    const int2 pair = tile[threadIdx.x];
    pOut[gid] = a * (pair.x + pair.y) + gid;
}
21 
// Overload 3: two input arrays; pOut[i] = (pIn1[i] + pIn2[i]) * a + i.
// NOTE(review): same launch contract as the other overloads — no bounds
// guard, blockDim.x must equal THREAD_N.
__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int tileA[THREAD_N];
    __shared__ int tileB[THREAD_N];
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    tileA[threadIdx.x] = pIn1[gid];
    tileB[threadIdx.x] = pIn2[gid];
    __syncthreads();
    pOut[gid] = a * (tileA[threadIdx.x] + tileB[threadIdx.x]) + gid;
}
  1 /*cppOverload.cu*/
  2 #include <stdio.h>
  3 #include <helper_cuda.h>
  4 #include <helper_string.h>
  5 #include <helper_math.h>
  6 #include "cppOverload_kernel.cuh"
  7 
  8 #define THREAD_N 256
  9 #define N 1024
 10 #define DIV_UP(a, b) (((a) + (b) - 1) / (b))
 11 #define OUTPUT_ATTR(attr)                                            12     printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);        13     printf("Constant Size: %d\n", (int)attr.constSizeBytes);         14     printf("Local Size:    %d\n", (int)attr.localSizeBytes);         15     printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);  16     printf("Number of Registers: %d\n", attr.numRegs);               17     printf("PTX Version: %d\n", attr.ptxVersion);                    18     printf("Binary Version: %d\n", attr.binaryVersion);             
 19      
 20 const char *sampleName = "C++ Function Overloading";
 21 
 22 bool check_func1(int *hInput, int *hOutput, int a)
 23 {
 24     for (int i = 0; i < N; ++i)
 25     {
 26         int cpuRes = hInput[i]*a + i;
 27         if (hOutput[i] != cpuRes)
 28             return false;
 29     }
 30     return true;
 31 }
 32 
 33 bool check_func2(int2 *hInput, int *hOutput, int a)
 34 {
 35     for (int i = 0; i < N; i++)
 36     {
 37         int cpuRes = (hInput[i].x + hInput[i].y)*a + i;
 38         if (hOutput[i] != cpuRes)
 39             return false;
 40     }
 41     return true;
 42 }
 43 
 44 bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
 45 {
 46     for (int i = 0; i < N; i++)
 47     {
 48         if (hOutput[i] != (hInput1[i] + hInput2[i])*a + i)
 49             return false;
 50     }
 51     return true;
 52 }
 53 
 54 int main(int argc, const char *argv[])
 55 {
 56     int *hInput  = NULL;
 57     int *hOutput = NULL;
 58     int *dInput  = NULL;
 59     int *dOutput = NULL;
 60 
 61     printf("%s starting...\n", sampleName);
 62 
 63     int deviceCount;
 64     cudaGetDeviceCount(&deviceCount);
 65     printf("DevicecheckCudaErrors Count: %d\n", deviceCount);
 66     int deviceID = findCudaDevice(argc, argv);
 67     cudaDeviceProp prop;
 68     cudaGetDeviceProperties(&prop, deviceID);
 69     if (prop.major < 2)    
 70     {
 71         printf("ERROR: cppOverload requires GPU devices with compute SM 2.0 or higher.\n");
 72         printf("Current GPU device has compute SM%d.%d, Exiting...", prop.major, prop.minor);
 73         exit(EXIT_WAIVED);
 74     }
 75     cudaSetDevice(deviceID);
 76 
 77     cudaMalloc(&dInput , sizeof(int)*N*2);
 78     cudaMalloc(&dOutput, sizeof(int)*N);
 79     cudaMallocHost(&hInput , sizeof(int)*N*2);
 80     cudaMallocHost(&hOutput, sizeof(int)*N);
 81     for (int i = 0; i < N*2; i++)
 82         hInput[i] = i;
 83     cudaMemcpy(dInput, hInput, sizeof(int)*N*2, cudaMemcpyHostToDevice);
 84 
 85     bool testResult = true;
 86     bool funcResult = true;
 87     int a = 1;
 88 
 89     void (*func1)(const int *, int *, int);
 90     void (*func2)(const int2 *, int *, int);
 91     void (*func3)(const int *, const int *, int *, int);
 92     struct cudaFuncAttributes attr;
 93 
 94     // overload function 1
 95     func1 = simple_kernel;
 96     memset(&attr, 0, sizeof(attr));
 97     cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared);
 98     cudaFuncGetAttributes(&attr, *func1);
 99     OUTPUT_ATTR(attr);
100     (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
101     cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost);
102     funcResult = check_func1(hInput, hOutput, a);
103     printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
104     testResult &= funcResult;
105 
106     // overload function 2
107     func2 = simple_kernel;
108     memset(&attr, 0, sizeof(attr));
109     cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared);
110     cudaFuncGetAttributes(&attr, *func2);
111     OUTPUT_ATTR(attr);
112     (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
113     cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost);
114     funcResult = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
115     printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
116     testResult &= funcResult;
117 
118     // overload function 3
119     func3 = simple_kernel;
120     memset(&attr, 0, sizeof(attr));
121     cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared);
122     cudaFuncGetAttributes(&attr, *func3);
123     OUTPUT_ATTR(attr);
124     (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput+N, dOutput, a);
125     cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost);
126     funcResult = check_func3(&hInput[0], &hInput[N], hOutput, a);
127     printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", funcResult ? "PASSED" : "FAILED");
128     testResult &= funcResult;
129 
130     cudaFree(dInput);
131     cudaFree(dOutput);
132     cudaFreeHost(hOutput);
133     cudaFreeHost(hInput);
134 
135     cudaDeviceSynchronize();
136 
137     getchar();
138     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
139 }

 

● 输出结果:

C++ Function Overloading starting...
DevicecheckCudaErrors Count: 1
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1

Shared Size:   1024
Constant Size: 0
Local Size:    0
Max Threads Per Block: 1024
Number of Registers: 13
PTX Version: 50
Binary Version: 61
simple_kernel(const int *pIn, int *pOut, int a) PASSED

Shared Size:   2048
Constant Size: 0
Local Size:    0
Max Threads Per Block: 1024
Number of Registers: 13
PTX Version: 50
Binary Version: 61
simple_kernel(const int2 *pIn, int *pOut, int a) PASSED

Shared Size:   2048
Constant Size: 0
Local Size:    0
Max Threads Per Block: 1024
Number of Registers: 15
PTX Version: 50
Binary Version: 61
simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

 

● 涨姿势:

● 使用扩展名为 .cuh 的头文件

● cuda内置结构 cudaFuncAttributes 的定义:

 1 struct __device_builtin__ cudaFuncAttributes
 2 {
 3     size_t sharedSizeBytes; // 共享内存大小
 4     size_t constSizeBytees; // 常量内存大小
 5     size_t localSizeBytes;  // 局部内存大小
 6     int maxThreadsPerBlock; // 每线程块线最大程数量
 7     int numRegs;            // 寄存器数量
 8     int ptxVersion;         // PTX版本号
 9     int binaryVersion;      // 机器码版本号
10     int cacheModeCA;        // 是否使用编译指令 -Xptxas --dlcm=ca
11 };

 

● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量

1 struct cudaFuncAttributes attr;
2 memset(&attr, 0, sizeof(attr));
3 cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);
4 cudaFuncGetAttributes(&attr, *function);

涉及的函数

 1 extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
 2 
 3 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
 4 {
 5     return cudaErrorUnknown;
 6 }
 7 
 8 #define OUTPUT_ATTR(attr)                                            9     printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);       10     printf("Constant Size: %d\n", (int)attr.constSizeBytes);        11     printf("Local Size:    %d\n", (int)attr.localSizeBytes);        12     printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); 13     printf("Number of Registers: %d\n", attr.numRegs);              14     printf("PTX Version: %d\n", attr.ptxVersion);                   15     printf("Binary Version: %d\n", attr.binaryVersion);

 

以上是关于0_Simple__cppOverload的主要内容,如果未能解决你的问题,请参考以下文章

0_Simple__asyncAPI

0_Simple__cppIntegration

0_Simple__cudaOpenMP

0_Simple__simpleCallback

0_Simple__cdpSimpleQuicksort

0_Simple__clock