CUDA C Programming Guide 在线教程学习笔记 Part 2

Posted 2020-10-14 爨爨爨好

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了CUDA C Programming Guide 在线教程学习笔记 Part 2相关的知识，希望对你有一定的参考价值。

▶ 纹理内存使用

● 纹理内存使用有两套 API，称为 Object API 和 Reference API 。纹理对象（texture object）在运行时被 Object API 创建，同时指定了纹理单元。纹理引用（Tezture Reference）在编译时被 Reference API 创建，但是在运行时才指定纹理单元，并将纹理引用绑定到纹理单元上面去。

● 不同的纹理引用可能绑定到相同或内存上有重叠的的纹理单元上，纹理单元可能是 CUDA 线性内存或CUDA array 的任意部分。

● 可以定义一维到三维的数组作为纹理内存。数组中的元素简称 texel （texture element）。

● 纹理内存的数据类型可以是 char, int, long, long long, float, double，即所有基本的占用1、2、4字节的数据类型。
● 纹理内存的访问模式有 cudaReadModeNormalizedFloat 和 cudaReadModeElementType 两种。前者读取 4 字节整数时会除以 0x8fff（有符号整数）或 0xffff（无符号整数），从而把值线性映射到 [-1.0, 1.0] 区间（有符号整数）或 [0, 1] 区间（无符号整数），读取 2 字节整数时也会发生类似变换，除以 0x8f 或 0xff 。后者则不会发生这种转换。

● 纹理数组使用浮点坐标进行引用。长为 N 的一维数组，默认纹理坐标中，下标范围是 [0.0, N-1] 之间的浮点数；正规化纹理坐标中，下标范围是 [0.0, 1-1/N] 之间的浮点数。二维或三维数组每一维上的坐标也遵循这个原则。

● 寻址模式，可以对数组范围以外的坐标进行访问（越界访问），不同的寻址模式定义了这种操作的效果。默认寻址模式 cudaAddressModeClamp 下，越界访问取各维上的边界值；边界模式 cudaAddressModeBorder 下，越界访问会返回 0 。使用正规化坐标时，可以选用束模式和镜像模式，束模式 cudaAddressModeWrap（想象成左右边界相连）下，越界坐标做变换 x\' = x - floor(x) （教程上少了减号）；镜像模式 cudaAddressModeMirror（想象成左 ~ 右 → 右 ~ 左 → 左 ~ 右）下，越界坐标做变换 x\' = x（floor(x) 为偶数）或 x\' = 1 - x（floor(x) 为奇数）。

● 滤波模式，决定了如何把整数坐标的数组数据值转化为浮点坐标的引用值。最临近插值 cudaFilterModePoint 使用最接近访问坐标的整数坐标点数据，可以返回整数值（若纹理数组本身是整数型）；线性插值 cudaFilterModeLinear 使用每维度上最接近访问坐标的两个整数坐标点数据进行插值，可以单线性（一维，2 点）、双线性（二维，4 点）和三线性（三维，8 点），只能返回浮点数值。

● 使用 Texture Object API 。

■ 涉及的结构定义、接口函数。

  1 // texture_types.h
  2 struct __device_builtin__ cudaTextureDesc
  3 {
  4     enum cudaTextureAddressMode addressMode[3];     // 寻址模式，cudaResourceDesc::resType == cudaResourceTypeLinear 时无效
  5     enum cudaTextureFilterMode  filterMode;         // 滤波模式，cudaResourceDesc::resType == cudaResourceTypeLinear 时无效
  6     enum cudaTextureReadMode    readMode;           // 访问模式
  7     int                         sRGB;               // ？读取时将sRGB范围正规化
  8     float                       borderColor[4];     // ？文理边界颜色
  9     int                         normalizedCoords;   // 是否使用正规化坐标
 10     unsigned int                maxAnisotropy;      // ？
 11     enum cudaTextureFilterMode  mipmapFilterMode;   // ？
 12     float                       mipmapLevelBias;    // ？
 13     float                       minMipmapLevelClamp;// ？
 14     float                       maxMipmapLevelClamp;// ？
 15 };
 16 
 17 enum __device_builtin__ cudaTextureAddressMode
 18 {
 19     cudaAddressModeWrap = 0,
 20     cudaAddressModeClamp = 1,
 21     cudaAddressModeMirror = 2,
 22     cudaAddressModeBorder = 3
 23 };
 24 
 25 enum __device_builtin__ cudaTextureFilterMode
 26 {
 27     cudaFilterModePoint = 0,
 28     cudaFilterModeLinear = 1
 29 };
 30 
 31 enum __device_builtin__ cudaTextureReadMode
 32 {
 33     cudaReadModeElementType = 0,  
 34     cudaReadModeNormalizedFloat = 1
 35 };
 36 
 37 typedef __device_builtin__ unsigned long long cudaTextureObject_t;
 38 
 39 // driver_types.h
 40 enum __device_builtin__ cudaChannelFormatKind
 41 {
 42     cudaChannelFormatKindSigned = 0,    // 有符号整数模式
 43     cudaChannelFormatKindUnsigned = 1,  // 无符号整数模式
 44     cudaChannelFormatKindFloat = 2,     // 浮点模式
 45     cudaChannelFormatKindNone = 3       // 无通道模式
 46 };
 47 
 48 struct __device_builtin__ cudaChannelFormatDesc
 49 {
 50     int                        x;   // 通道 0 数据位深度
 51     int                        y;   // 通道 1 数据位深度
 52     int                        z;   // 通道 2 数据位深度
 53     int                        w;   // ？
 54     enum cudaChannelFormatKind f;   // 通道模式
 55 };
 56 
 57 typedef struct cudaArray *cudaArray_t;
 58 typedef struct cudaMipmappedArray *cudaMipmappedArray_t;
 59 
 60 enum __device_builtin__ cudaResourceType
 61 {
 62     cudaResourceTypeArray = 0x00,           // 数组资源
 63     cudaResourceTypeMipmappedArray = 0x01,  // 映射数组资源
 64     cudaResourceTypeLinear = 0x02,          // 线性资源
 65     cudaResourceTypePitch2D = 0x03          // 对齐二维资源
 66 };
 67 
 68 struct __device_builtin__ cudaResourceDesc
 69 {
 70     enum cudaResourceType resType;              // 资源类型
 71 
 72     union res
 73     {
 74         struct array                            // cuda数组
 75         {
 76             cudaArray_t array;                  
 77         };
 78         struct mipmap                           // mipmap 数组
 79         {
 80             cudaMipmappedArray_t mipmap;        
 81         };
 82         struct linear                           // 一维数组
 83         {
 84             void *devPtr;                       // 设备指针，符合 cudaDeviceProp::textureAlignment 的对齐要求
 85             struct cudaChannelFormatDesc desc;  // texel 的属性描述
 86             size_t sizeInBytes;                 // 数组字节数
 87         };
 88         struct pitch2D                          // 二位数组
 89         {
 90             void *devPtr;                       // 设备指针，符合 cudaDeviceProp::textureAlignment 的对齐要求
 91             struct cudaChannelFormatDesc desc;  // texel 的属性描述
 92             size_t width;                       // 数组列数
 93             size_t height;                      // 数组行数
 94             size_t pitchInBytes;                // 数组行字节数
 95         };
 96     };
 97 };
 98 
 99 // cuda_runtime_api.h
100 
101 extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
102 
103 extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
104 
105 extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
106 
107 extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
108 
109 extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);

■ 完整的应用样例代码。初始化一个 32×32 的矩阵，利用纹理对其进行平移和旋转，输出调整之后的矩阵。

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <malloc.h>
  4 #include <cuda_runtime_api.h>
  5 #include "device_launch_parameters.h"
  6 
  7 #define DEGRE_TO_RADIAN(x) ((x) * 3.1416f / 180)
  8 #define CEIL(x,y) (((x) + (y) - 1) / (y) + 1)
  9 
 10 // 简单的线性变换
 11 __global__ void transformKernel(float* output, cudaTextureObject_t texObj, int width, int height, float theta)
 12 {
 13     // 计算正规化纹理坐标
 14     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
 15     unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
 16     
 17     // 正规化和平移
 18     float u = idx / (float)width - 0.5f;
 19     float v = idy / (float)height - 0.5f;
 20     
 21     // 旋转
 22     float tu = u * __cosf(theta) - v * __sinf(theta) + 0.5f;
 23     float tv = v * __cosf(theta) + u * __sinf(theta) + 0.5f;
 24 
 25     //printf("\\n(%2d,%2d,%2d,%2d)->(%f,%f,%f)", 
 26     //    blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, tu, tv,tex2D<float>(texObj, tu, tv));
 27 
 28     // 纹理内存写入全局内存
 29     output[idy * width + idx] = tex2D<float>(texObj, tu, tv);
 30 }
 31 
 32 int main()
 33 {
 34     // 基本数据
 35     int i;
 36     float *h_data, *d_data;
 37     int width = 32;
 38     int height = 32;
 39     float angle = DEGRE_TO_RADIAN(30);
 40 
 41     int size = sizeof(float)*width*height;
 42     h_data = (float *)malloc(size);
 43     cudaMalloc((void **)&d_data, size);
 44     
 45     for (i = 0; i < width*height; i++)
 46         h_data[i] = (float)i;
 47 
 48     printf("\\n\\n");
 49     for (i = 0; i < width*height; i++)
 50     {
 51         printf("%6.1f ", h_data[i]);
 52         if ((i + 1) % width == 0)
 53             printf("\\n");
 54     }
 55 
 56     // 申请 cuda 数组并拷贝数据
 57     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);
 58     cudaArray* cuArray;
 59     cudaMallocArray(&cuArray, &channelDesc, width, height);
 60     cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);
 61 
 62     // 指定纹理资源
 63     struct cudaResourceDesc resDesc;
 64     memset(&resDesc, 0, sizeof(resDesc));
 65     resDesc.resType = cudaResourceTypeArray;
 66     resDesc.res.array.array = cuArray;
 67 
 68     // 指定纹理对象参数
 69     struct cudaTextureDesc texDesc;
 70     memset(&texDesc, 0, sizeof(texDesc));
 71     texDesc.addressMode[0] = cudaAddressModeWrap;
 72     texDesc.addressMode[1] = cudaAddressModeWrap;
 73     texDesc.filterMode = cudaFilterModeLinear;
 74     texDesc.readMode = cudaReadModeElementType;
 75     texDesc.normalizedCoords = 1;
 76 
 77     // 创建文理对象
 78     cudaTextureObject_t texObj = 0;
 79     cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
 80 
 81     // 运行核函数
 82     dim3 dimBlock(16, 16);
 83     dim3 dimGrid(CEIL(width, dimBlock.x), CEIL(height, dimBlock.y));
 84     transformKernel << <dimGrid, dimBlock >> > (d_data, texObj, width, height, angle); 
 85     cudaDeviceSynchronize();
 86 
 87     // 结果回收和检查结果
 88     cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
 89 
 90     printf("\\n\\n");
 91     for (i = 0; i < width*height; i++)
 92     {
 93         printf("%6.1f ", h_data[i]);
 94         if ((i + 1) % width == 0)
 95             printf("\\n");
 96     }
 97     
 98     // 回收工作
 99     cudaDestroyTextureObject(texObj);
100     cudaFreeArray(cuArray);
101     cudaFree(d_data);
102 
103     getchar();
104     return 0;
105 }

● 使用 Texture Reference API。

■ 纹理引用的一些只读属性需要在声明的时候指定，以便编译时提前确定，只能在全局作用域内静态指定，不能作为参数传递给函数。使用 texture 指定纹理引用属性，Datatype 为 texel 的数据类型，Type 为纹理引用类型，有 7 种，默认 cudaTextureType1D，ReadMode 为访问类型，默认 cudaReadModeElementType ，其他属性可以在主机运行时动态的修改。

 1 texture<DataType, Type, ReadMode> texRef;
 2 
 3 // cuda_texture_types.h
 4 template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
 5 struct __device_builtin_texture_type__ texture : public textureReference
 6 {
 7 #if !defined(__CUDACC_RTC__)
 8     __host__ texture(int norm = 0, enum cudaTextureFilterMode  fMode = cudaFilterModePoint, enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
 9     {
10         normalized = norm;
11         filterMode = fMode;
12         addressMode[0] = aMode;
13         addressMode[1] = aMode;
14         addressMode[2] = aMode;
15         channelDesc = cudaCreateChannelDesc<T>();
16         sRGB = 0;
17     }
18     __host__ texture(int norm, enum cudaTextureFilterMode   fMode, enum cudaTextureAddressMode  aMode, struct cudaChannelFormatDesc desc)
19     {
20         normalized = norm;
21         filterMode = fMode;
22         addressMode[0] = aMode;
23         addressMode[1] = aMode;
24         addressMode[2] = aMode;
25         channelDesc = desc;
26         sRGB = 0;
27     }
28 #endif
29 };
30 
31 //texture_types.h
32 #define cudaTextureType1D              0x01
33 #define cudaTextureType2D              0x02
34 #define cudaTextureType3D              0x03
35 #define cudaTextureTypeCubemap         0x0C
36 #define cudaTextureType1DLayered       0xF1
37 #define cudaTextureType2DLayered       0xF2
38 #define cudaTextureTypeCubemapLayered  0xFC

■ 涉及的结构定义、接口函数。纹理引用必须用函数 cudaBindTexture() 或 cudaBindTexture2D() 或 cudaBindTextureToArray() 绑定到相应维度的数组上才能使用，要求纹理引用的维度、数据类型与该数组匹配，否则操作时未定义的，使用完后还要用函数 cudaUnbindTexture() 解除绑定。

 1 // texture_types.h
 2 struct __device_builtin__ textureReference
 3 {
 4     int                          normalized;            // 是否使用正规化坐标
 5     enum cudaTextureFilterMode   filterMode;            // 滤波模式
 6     enum cudaTextureAddressMode  addressMode[3];        // 寻址模式
 7     struct cudaChannelFormatDesc channelDesc;           // texel 的格式，其元素数据类型与声明 texture 时的 Datatype 一致
 8     int                          sRGB;                  // ？读取时将sRGB范围正规化
 9     unsigned int                 maxAnisotropy;         // ？
10     enum cudaTextureFilterMode   mipmapFilterMode;      // ？
11     float                        mipmapLevelBias;       // ？
12     float                        minMipmapLevelClamp;   // ？
13     float                        maxMipmapLevelClamp;   // ？
14     int                          __cudaReserved[15];    // ？
15 };
16 
17 // cuda_runtime_api.h
18 extern __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));
19 
20 extern __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
21 
22 extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
23 
24 extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref);
25 
26 extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol);

■ 将 2D 纹理引用绑定到 2D 数组上的范例代码

 1 // 准备工作
 2 texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;
 3 
 4 ...
 5     
 6 int width, height;
 7 size_t pitch;
 8 float *d_data;
 9 cudaMallocPitch((void **)&d_data, &pitch, sizeof(float)*width, height);
10 
11 // 第一种方法，低层 API
12 textureReference* texRefPtr;
13 cudaGetTextureReference(&texRefPtr, &texRef);
14 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
15 size_t offset;
16 cudaBindTexture2D(&offset, texRefPtr, d_data, &channelDesc, width, height, pitch);
17 
18 // 第二种方法，高层 API
19 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
20 size_t offset;
21 cudaBindTexture2D(&offset, texRef, d_data, channelDesc, width, height, pitch);

■ 将 2D 纹理引用绑定到 cuda 数组上的范例代码

 1 // 准备工作
 2 texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;
 3 
 4 //...
 5 
 6 cudaArray* cuArray;
 7 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 8 cudaMallocArray(&cuArray, &channelDesc, width, height);
 9 
10 // 第一种方法，低层 API
11 textureReference* texRefPtr;
12 cudaGetTextureReference(&texRefPtr, &texRef);
13 memset(&channelDesc, 0, sizeof(cudaChannelFormatDesc));
14 cudaChannelFormatDesc channelDesc;
15 cudaGetChannelDesc(&channelDesc, cuArray);
16 cudaBindTextureToArray(texRef, cuArray, &channelDesc);
17 
18 // 第二种方法，高层 API
19 cudaBindTextureToArray(texRef, cuArray);

■ 完整的应用样例代码。与前面纹理对象代码的功能相同。

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <malloc.h>
 4 #include <cuda_runtime_api.h>
 5 #include "device_launch_parameters.h"
 6 
 7 #define DEGRE_TO_RADIAN(x) ((x) * 3.1416f / 180)
 8 #define CEIL(x,y) (((x) + (y) - 1) / (y) + 1)
 9 
10 // 声明纹理引用
11 texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;
12 
13 // 简单的线性变换
14 __global__ void transformKernel(float* output, int width, int height, float theta)
15 {
16     // 计算正规化纹理坐标
17     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
18     unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
19     
20     // 正规化和平移
21     float u = idx / (float)width;
22     float v = idy / (float)height;
23     
24     // 旋转
25     float tu = u * __cosf(theta) - v * __sinf(theta) + 0.5f;
26     float tv = v * __cosf(theta) + u * __sinf(theta) + 0.5f;
27 
28     //printf("\\n(%2d,%2d,%2d,%2d)->(%f,%f,%f)", 
29     //    blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, tu, tv,tex2D<float>(texObj, tu, tv));
30 
31     // 纹理内存写入全局内存
32     output[idy * width + idx] = tex2D(texRef, tu, tv);
33 }
以上是关于CUDA C Programming Guide 在线教程学习笔记 Part 2的主要内容，如果未能解决你的问题，请参考以下文章