第六章 常量内存
Posted 爨爨爨好
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了第六章 常量内存相关的知识,希望对你有一定的参考价值。
本章介绍了常量内存的使用,并给出了一个光线追踪的例子。
章节代码
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_bitmap.h"

#define DIM 1024                              // the image is DIM x DIM pixels
#define rnd( x ) ((x) * rand() / RAND_MAX)    // pseudo-random value in [0, x]
#define INF 2e10f                             // sentinel depth used for "no hit"
#define SPHERES 40                            // number of spheres in the scene
#define USE_CONSTANT_MEMORY false             // switch between the two sphere-storage variants

// Abort with a readable message when a CUDA runtime call does not return cudaSuccess.
#define CHECK_CUDA(call)                                                   \
    do {                                                                   \
        cudaError_t check_cuda_err_ = (call);                              \
        if (check_cuda_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(check_cuda_err_));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// One sphere of the scene: colour, radius and centre position.
struct Sphere
{
    float r, b, g;
    float radius;
    float x, y, z;

    // Tests the ray that passes through pixel (ox, oy) parallel to the z axis.
    // On a hit, stores in *n the ratio dz / radius (the cosine of the angle
    // between the surface normal at the hit point and the image-plane normal)
    // and returns the z coordinate of the hit point (dz + z).
    // On a miss, returns -INF and leaves *n untouched.
    __device__ float hit(float ox, float oy, float *n) const
    {
        float dx = ox - x;
        float dy = oy - y;
        if (dx * dx + dy * dy < radius * radius)
        {
            float dz = sqrtf(radius * radius - dx * dx - dy * dy);
            *n = dz / sqrtf(radius * radius);
            return dz + z;
        }
        return -INF;
    }
};

#if USE_CONSTANT_MEMORY
// Constant-memory variant: the spheres live in a file-scope __constant__ array.
__constant__ Sphere s[SPHERES];
struct DataBlock
{
    unsigned char *dev_bitmap;
};
#else
// Global-memory variant: the spheres live in a buffer obtained from cudaMalloc.
struct DataBlock
{
    unsigned char *dev_bitmap;
    Sphere *s;
};
#endif

// Renders the scene, one thread per pixel.
// Launch layout used by main(): a (DIM/16, DIM/16) grid of (16, 16) blocks.
// ptr is the device output buffer holding 4 bytes per pixel, DIM x DIM pixels.
// In the global-memory variant s points to SPHERES spheres in device memory.
#if USE_CONSTANT_MEMORY
__global__ void kernel(unsigned char *ptr)
#else
__global__ void kernel(Sphere *s, unsigned char *ptr)
#endif
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // Guard against launch configurations that cover more than DIM x DIM threads.
    if (x >= DIM || y >= DIM)
    {
        return;
    }
    int offset = x + y * DIM;

    // Shift the origin to the centre of the image.
    float ox = (x - DIM / 2);
    float oy = (y - DIM / 2);

    // Determine which colour this pixel should show.
    float r = 0, g = 0, b = 0;
    float maxz = -INF;
    for (int i = 0; i < SPHERES; i++)
    {
        float n;
        float t = s[i].hit(ox, oy, &n);
        if (t > maxz)  // larger z than every earlier hit: this sphere is in front
        {
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }

    ptr[offset * 4 + 0] = (int)(r * 255);
    ptr[offset * 4 + 1] = (int)(g * 255);
    ptr[offset * 4 + 2] = (int)(b * 255);
    ptr[offset * 4 + 3] = 255;
}

// Builds a random scene, renders it on the GPU, prints the elapsed time and
// displays the resulting bitmap.
int main(void)
{
    DataBlock data;

    // Timer: the measured interval starts here and ends after the image has
    // been copied back to the host.
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    CHECK_CUDA(cudaEventRecord(start, 0));

    CPUBitmap bitmap(DIM, DIM, &data);
    unsigned char *dev_bitmap;
    CHECK_CUDA(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));
    data.dev_bitmap = dev_bitmap;
#if !USE_CONSTANT_MEMORY
    // Only the global-memory variant needs a device buffer for the spheres;
    // in the constant-memory variant `s` is the __constant__ array itself.
    Sphere *s;
    CHECK_CUDA(cudaMalloc((void**)&s, sizeof(Sphere) * SPHERES));
    data.s = s;
#endif

    Sphere *temp_s = (Sphere*)malloc(sizeof(Sphere) * SPHERES);
    if (temp_s == NULL)
    {
        fprintf(stderr, "Host allocation of the sphere array failed\n");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < SPHERES; i++)
    {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }

#if USE_CONSTANT_MEMORY
    CHECK_CUDA(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere) * SPHERES));
    kernel<<<dim3(DIM / 16, DIM / 16), dim3(16, 16)>>>(dev_bitmap);
#else
    CHECK_CUDA(cudaMemcpy(s, temp_s, sizeof(Sphere) * SPHERES, cudaMemcpyHostToDevice));
    kernel<<<dim3(DIM / 16, DIM / 16), dim3(16, 16)>>>(s, dev_bitmap);
#endif
    // A kernel launch returns no status of its own; query it explicitly.
    CHECK_CUDA(cudaGetLastError());

    CHECK_CUDA(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost));

    // Measure the elapsed time.
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));
    float elapsedTime;
    CHECK_CUDA(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time to generate: %3.1f ms\n", elapsedTime);
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    free(temp_s);
    CHECK_CUDA(cudaFree(dev_bitmap));
#if !USE_CONSTANT_MEMORY
    CHECK_CUDA(cudaFree(s));
#endif

    bitmap.display_and_exit();

    getchar();
    return 0;
}
● 算法大意:有SPHERES个球分布在原点附近,其球心坐标在每个坐标轴方向上分量绝对值不大于500,其半径介于20到120;观察者(画面平面)位于z正半轴充分远处(z>500),现将所有的球体平行投影到画面平面上,考虑遮挡关系,并考虑球面与画面平面的夹角给球体绘制阴影。
● 使用常量内存时球数组定义在所有函数外部,核函数只需传入位图(图形)参数即可;不使用常量内存时球数组指针定义在结构DataBlock内部,核函数需要球数组和位图(图形)两个参数。
● 使用了结构cudaEvent_t用于计时,并介绍了与此相关的时间控制函数,按顺序使用如下。
// Timing a stretch of GPU work with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);       // create both events first so that event creation is not part of the timed interval
cudaEventRecord(start, 0);    // mark the beginning of the interval on stream 0

// Do something

cudaEventRecord(stop, 0);     // mark the end of the interval on stream 0
cudaEventSynchronize(stop);   // block the host until the stop event has been reached

float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);  // time between the two events, in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);
● 使用cudaMemcpyToSymbol()函数将主机内存中的数据复制到常量内存中(若需反向从常量内存复制回主机内存,应使用cudaMemcpyFromSymbol();但由于常量内存对核函数只读,一般很少需要这样做)。其模板封装定义于cuda_runtime.h中:
// C++ convenience overload: accepts the device symbol by reference and
// forwards its address, together with the remaining arguments, to the
// global-namespace ::cudaMemcpyToSymbol.
//   symbol - the device symbol to copy into
//   src    - source address
//   count  - number of bytes to copy
//   offset - byte offset from the start of the symbol (defaults to 0)
//   kind   - direction of the copy (defaults to cudaMemcpyHostToDevice)
// Returns the cudaError_t produced by the forwarded call.
template<class T>
static __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
    const T &symbol,
    const void *src,
    size_t count,
    size_t offset = 0,
    enum cudaMemcpyKind kind = cudaMemcpyHostToDevice)
{
    return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind);
}
以上是关于第六章 常量内存的主要内容,如果未能解决你的问题,请参考以下文章