OpenCL 双调排序 GPU 版
Posted cuancuancuanhao
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了OpenCL 双调排序 GPU 版相关的知识,希望对你有一定的参考价值。
? 参考书中的代码,写了
● 代码,核函数文件包含三中算法
1 // kernel.cl 2 __kernel void bitonicSort01(__global uint *data, const uint stage, const uint subStage, const uint direction)// 基本的元素对调整 3 { 4 const uint gid = get_global_id(0); 5 const uint isAscend = ((gid / (1 << stage)) % 2) ? 1 - direction : direction; // 判断本工作项的元素对应该排成升序还是降序 6 const uint distance = 1 << (stage - subStage); // 元素对下标差 7 const uint lid = (gid / distance) * distance * 2 + gid % distance; // 寻找元素对的左右元素 8 const uint rid = lid + distance; 9 const uint lElement = data[lid], rElement = data[rid]; 10 if (lElement > rElement && isAscend || lElement < rElement && !isAscend) // 不符合排序要求,交换两元素 11 data[lid] = rElement, data[rid] = lElement; 12 } 13 14 __kernel void bitonicSort02(__global uint *data, const uint stage, const uint subStage, const uint direction, __local uint *localMem)// 使用局部内存调整 15 { 16 const uint gid = get_global_id(0), mid = get_local_id(0); // 同 bitonicSort01 17 const uint isAscend = ((gid / (1 << stage)) % 2) ? 1 - direction : direction; 18 const uint distance = 1 << (stage - subStage); 19 const uint lid = (gid / distance) * distance * 2 + gid % distance; 20 const uint rid = lid + distance; 21 22 localMem[mid * 2 + 0] = data[lid]; // 读取 data 的时候读进局部内存,与局部内存相关的下标用的都是 mid 而不是 gid 23 localMem[mid * 2 + 1] = data[rid]; 24 barrier(CLK_LOCAL_MEM_FENCE); 25 26 if (localMem[mid * 2 + 0] > localMem[mid * 2 + 1] && isAscend || localMem[mid * 2 + 0] < localMem[mid * 2 + 1] && !isAscend) 27 data[lid] = localMem[mid * 2 + 1], data[rid] = localMem[mid * 2 + 0]; 28 } 29 30 #define STRIDE 4 // aux 中四个元素一组,表示一个工作项的元素对索引和值,依照 main.c 中给定的第 5 参数的大小进行相等的调整 31 32 __kernel void bitonicSort03(__global uint *data, const uint stage, const uint subStage, const uint direction, __local uint *localMem, __local uint *aux)// 使用两个局部内存,感觉多此一举? 33 { 34 const uint gid = get_global_id(0), mid = get_local_id(0); // 同 bitonicSort02 35 const uint isAscend = ((gid / (1 << stage)) % 2) ? 1 - direction : direction; 36 const uint distance = 1 << (stage - subStage); 37 const uint lid = (gid / distance) * distance * 2 + gid % distance; 38 const uint rid = lid + distance; 39 40 localMem[mid * 2 + 0] = data[lid]; 41 localMem[mid * 2 + 1] = data[rid]; 42 barrier(CLK_LOCAL_MEM_FENCE); 43 44 aux[mid * STRIDE + 0] = lid; // 开始向aux 中填充 45 aux[mid * STRIDE + 2] = rid; 46 if (localMem[mid * 2 + 0] > localMem[mid * 2 + 1] && isAscend || localMem[mid * 2 + 0] < localMem[mid * 2 + 1] && !isAscend) 47 aux[mid * STRIDE + 1] = localMem[mid * 2 + 1], aux[mid * STRIDE + 3] = localMem[mid * 2 + 0]; 48 else 49 aux[mid * STRIDE + 1] = localMem[mid * 2 + 0], aux[mid * STRIDE + 3] = localMem[mid * 2 + 1]; 50 barrier(CLK_LOCAL_MEM_FENCE); 51 52 data[aux[mid * STRIDE + 0]] = aux[mid * STRIDE + 1], data[aux[mid * STRIDE + 2]] = aux[mid * STRIDE + 3]; // 向原数组中填充数据 53 /*// 书中的填充方法,一个工作组仅使用一个工作项串行地向原数组中填充,美名曰“无冲突”,实际上花费了 5 倍的时间 54 if (mid == 0) 55 { 56 for (int i = 0; i < get_local_size(0); i++) 57 data[aux[i * STRIDE + 0]] = aux[i * STRIDE + 1], data[aux[i * STRIDE + 2]] = aux[i * STRIDE + 3]; 58 } 59 */ 60 }
1 // main.c 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <cl.h> 5 6 //#define PRINT_RESULT // 输出排序前后的数组元素(数据量较大时不用) 7 #define ASCENDING 1 // 升序 8 #define DESCENDING 0 // 降序 9 #define DATA_SIZE (1<<20) // 数据规模 10 #define GROUP_SIZE 128 // 工作组大小 11 12 const char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl"; 13 const unsigned int sortOrder = ASCENDING; // ASCENDING / DESCENDING 14 15 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度 16 { 17 FILE *fp; 18 int size; 19 //printf("<readText> File: %s\n", kernelPath); 20 fopen_s(&fp, kernelPath, "rb"); 21 if (!fp) 22 { 23 printf("Open kernel file failed\n"); 24 getchar(); 25 exit(-1); 26 } 27 if (fseek(fp, 0, SEEK_END) != 0) 28 { 29 printf("Seek end of file failed\n"); 30 getchar(); 31 exit(-1); 32 } 33 if ((size = ftell(fp)) < 0) 34 { 35 printf("Get file position failed\n"); 36 getchar(); 37 exit(-1); 38 } 39 rewind(fp); 40 if ((*pcode = (char *)malloc(size + 1)) == NULL) 41 { 42 printf("Allocate space failed\n"); 43 getchar(); 44 exit(-1); 45 } 46 fread(*pcode, 1, size, fp); 47 (*pcode)[size] = ‘\0‘; 48 fclose(fp); 49 return size + 1; 50 } 51 52 int main() 53 { 54 int i; 55 srand(97); 56 57 cl_int status; 58 cl_uint nPlatform; 59 clGetPlatformIDs(0, NULL, &nPlatform); 60 cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id)); 61 clGetPlatformIDs(nPlatform, listPlatform, NULL); 62 cl_uint nDevice; 63 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice); 64 cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id)); 65 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL); 66 cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); 67 cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], CL_QUEUE_PROFILING_ENABLE, &status); 68 69 int *hostA = (cl_int*)malloc(DATA_SIZE * sizeof(cl_int)); 70 for (i = 0; i < DATA_SIZE; hostA[i++] = rand()); 71 #ifdef PRINT_RESULT // 输出排序前的数组 72 printf("\n"); 73 for (i = 0; i < DATA_SIZE; i++) 74 { 75 printf("%5d,", hostA[i]); 76 if ((i + 1) % 16 == 0) 77 printf("\n"); 78 } 79 printf("\n"); 80 #endif 81 cl_mem deviceA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, DATA_SIZE * sizeof(cl_int), hostA, &status); 82 83 char *code; 84 size_t codeLength = readText(sourceText, &code); 85 cl_program program = clCreateProgramWithSource(context, 1, (const char**)&code, &codeLength, &status); 86 status = clBuildProgram(program, nDevice, listDevice, NULL, NULL, NULL); 87 if (status) 88 { 89 char info[10000]; 90 clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 10000, info, NULL); 91 printf("\n%s\n", info); 92 } 93 //cl_kernel kernel = clCreateKernel(program, "bitonicSort01", &status); // 选择使用的核函数 94 //cl_kernel kernel = clCreateKernel(program, "bitonicSort02", &status); 95 cl_kernel kernel = clCreateKernel(program, "bitonicSort03", &status); 96 97 size_t globalSize = DATA_SIZE / 2, groupSize = GROUP_SIZE; 98 cl_uint stageCount, stage, subStage; 99 for (i = DATA_SIZE, stageCount = 0; i > 1; stageCount++, i >>= 1); // 需要的总轮数 100 clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&deviceA); 101 clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*)&sortOrder); 102 clSetKernelArg(kernel, 4, GROUP_SIZE * 2 * sizeof(cl_uint), NULL); // bitonicSort02 和 bitonicSort03 需要的局部内存参数 103 clSetKernelArg(kernel, 5, GROUP_SIZE * 4 * sizeof(cl_uint), NULL); // bitonicSort03 需要的局部内存参数 104 105 cl_event exeEvent; // 计时用的事件 106 cl_ulong executionStart, executionEnd, timeCount1, timeCount2; // 计时器, 精确到 ns 107 for (stage = timeCount1 = timeCount2 = 0; stage < stageCount; stage++) // 当前轮数 108 { 109 clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*)&stage); 110 for (subStage = 0; subStage < stage + 1; subStage++) // 当前轮中归并的步骤数 111 { 112 clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*)&subStage); 113 //clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &groupSize, 0, NULL, NULL); // 不计时的核函数调用 114 clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &groupSize, 0, NULL, &exeEvent); 115 clWaitForEvents(1, &exeEvent); 116 clGetEventProfilingInfo(exeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &executionStart, NULL); 117 clGetEventProfilingInfo(exeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &executionEnd, NULL); 118 timeCount1 += (executionEnd - executionStart) / 1000000, timeCount2 += (executionEnd - executionStart) % 1000000;// ms 的整数和小数部分分开记录 119 } 120 } 121 printf("Kernel time: %lu.%lu ms\n", timeCount1 + timeCount2 / 1000000, timeCount2 % 1000000); // 内核运行总时间 122 123 clEnqueueReadBuffer(queue, deviceA, CL_TRUE, 0, DATA_SIZE * sizeof(cl_int), hostA, 0, NULL, NULL); 124 #ifdef PRINT_RESULT // 输出排序后的结果 125 printf("\n"); 126 for (i = 0; i < DATA_SIZE; i++) 127 { 128 printf("%5d,", hostA[i]); 129 if ((i + 1) % 16 == 0) 130 printf("\n"); 131 } 132 printf("\n"); 133 #else 134 for (i = 0; i < DATA_SIZE - 1; i++) // 不输出结果,只做检查,有错误的时候输出第一个错误的位置 135 { 136 if (sortOrder != (hostA[i + 1] >= hostA[i])) 137 { 138 printf("Error at i == %d, hostA[i] == %d, hostA[i+1] == %d", i, hostA[i], hostA[i + 1]); 139 break; 140 } 141 } 142 #endif 143 144 free(listPlatform); 145 free(listDevice); 146 clReleaseContext(context); 147 clReleaseCommandQueue(queue); 148 clReleaseProgram(program); 149 clReleaseKernel(kernel); 150 clReleaseEvent(exeEvent); 151 free(hostA); 152 clReleaseMemObject(deviceA); 153 getchar(); 154 return 0; 155 }
● 输出结果,统一采用 (1<<20) 的数据规模,尝试不同的工作组大小。使用局部内存并没有明显提升,尤其是使用两个局部内存的方法,严重拖后腿。
bitonicSort01 // groupSize = 64 kernels time: 7.941984 ms // groupSize = 128 kernels time: 7.947360 ms // groupSize = 256 kernels time: 7.935616 ms // groupSize = 512 kernels time: 7.919776 ms // groupSize = 1024 kernels time: 7.970080 ms bitonicSort02 // groupSize = 64 kernels time: 7.980160 ms // groupSize = 128 kernels time: 7.957984 ms // groupSize = 256 kernels time: 7.964032 ms // groupSize = 512 kernels time: 7.959072 ms // groupSize = 1024 kernels time: 8.517120 ms bitonicSort03 // groupSize = 64 kernels time: 9.901120 ms // groupSize = 128 kernels time: 9.936384 ms // groupSize = 256 kernels time: 9.950976 ms // groupSize = 512 kernels time: 10.134176 ms // groupSize = 1024 kernels time: 10.533516 ms bitonicSort03(书中的串行写入) // groupSize = 64 kernels time: 51.590080 ms // groupSize = 128 kernels time: 31.607904 ms // groupSize = 256 kernels time: 35.344244 ms // groupSize = 512 kernels time: 50.841184 ms // groupSize = 1024 kernels time: 93.284544 ms
● 总结
■ CPU 版双调排序使用递归,代码比较简洁,也可以使用本篇中的方法家拿其转化为循环迭代。GPU暂时不使用递归,主要精力在于不停地向命令队列中入队不同层次的任务,同一层次内的任务可以并行执行
■ 这本书就是垃圾,就 T M D 是 垃 圾!①书里代码引用不全,比如核函数里上来就使用变量 threadId,声明定义都没有;②代码混乱,比如在 bitonicSort01 和 bitonicSort02 里 threadId 的含义是不同的,尤其书中 bitonicSort02 里同时用该变量表示 get_global_id(0) 和 get_local_id(0),一想就知道代码执行结果肯定是错的;③ 机翻,好不容易写到倒数第二章了,翻译都懒得弄了,这里抄一段原文看谁看得懂:“我们基本上执行的操作是引入名为 sharedMem 的变量,并且使用加载这些值的简单策略:每个线程将在共享内存数据存储器中存储两个值(邻近值),在随后的代码部分中读出此线程,并且用于引用全局内存的所有读取操作目前在本地 / 共享内存中执行”。④ 残缺不堪,感觉 bitonicSort03 的优化应该是有一定道理的,但是在这里完全没有看出该优化的目的和优势所在,书中也没有说清楚。
以上是关于OpenCL 双调排序 GPU 版的主要内容,如果未能解决你的问题,请参考以下文章
三十分钟理解:双调排序Bitonic Sort,适合并行计算的排序算法