0_Simple__cdpSimpleQuicksort

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了0_Simple__cdpSimpleQuicksort相关的知识,希望对你有一定的参考价值。

利用 CUDA 动态并行(Dynamic Parallelism)实现快速排序:每次核函数调用只用单线程完成一次划分,然后在设备端递归启动自身,分别处理左右两个子区间。

源代码:

  1 #include <iostream>
  2 #include <cstdio>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <D:\Program\CUDA\Samples\common\inc\helper_cuda.h>
  6 #include <D:\Program\CUDA\Samples\common\inc\helper_string.h> 
  7 
  8 #define MAX_DEPTH       16
  9 #define INSERTION_SORT  32
 10 
 11 // Selection sort of data[left..right] (both bounds inclusive), ascending and in place. Device-only helper executed sequentially by the calling thread; cdp_simple_quicksort falls back to it when depth >= MAX_DEPTH or right - left <= INSERTION_SORT.
 12 __device__ void selection_sort(unsigned int *data, int left, int right)
 13 {
 14     for (int i = left; i <= right; ++i)
 15     {
 16         unsigned min_val = data[i];
 17         int min_idx = i;
 18 
 19         // Find the smallest value in data[i..right] and remember its index
 20         for (int j = i + 1; j <= right; ++j)
 21         {
 22             unsigned val_j = data[j];
 23 
 24             if (val_j < min_val)
 25             {
 26                 min_idx = j;
 27                 min_val = val_j;
 28             }
 29         }
 30 
 31         // Swap that minimum into position i (skipped when it is already there)
 32         if (i != min_idx)
 33         {
 34             data[min_idx] = data[i];
 35             data[i] = min_val;
 36         }
 37     }
 38 }
 39 
 40 // Quicksort kernel for data[left..right] (inclusive): partitions the range around its middle element, then launches itself again from the device (<<< 1, 1 >>>, i.e. one thread per launch) for each non-trivial partition. depth counts the recursion level, starting at 0 on the host launch.
 41 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
 42 {
 43     // Deep recursion or a short range: finish with selection sort instead of recursing
 44     if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT)
 45     {
 46         selection_sort(data, left, right);
 47         return;
 48     }
 49 
 50     unsigned int *lptr = data + left;
 51     unsigned int *rptr = data + right;
 52     unsigned int  pivot = data[(left + right) / 2]; // pivot value is read from the middle position
 53 
 54     // Partition: runs until the two pointers have crossed
 55     while (lptr <= rptr)
 56     {
 57         // Values currently under the left and right pointers
 58         unsigned int lval = *lptr;
 59         unsigned int rval = *rptr;
 60 
 61         // Advance the left pointer past values smaller than the pivot
 62         while (lval < pivot)
 63         {
 64             lptr++;
 65             lval = *lptr;
 66         }
 67 
 68         // Move the right pointer back past values larger than the pivot
 69         while (rval > pivot)
 70         {
 71             rptr--;
 72             rval = *rptr;
 73         }
 74 
 75         // Pointers have not crossed yet: swap the two values and step both pointers inward
 76         if (lptr <= rptr)
 77         {
 78             *lptr++ = rval;
 79             *rptr-- = lval;
 80         }
 81     }
 82 
 83     // Convert the final pointers back to indices bounding the two partitions
 84     int nright = rptr - data; // last index of the left partition
 85     int nleft = lptr - data; // first index of the right partition
 86 
 87     // Launch each partition in its own stream
 88     if (left < (rptr - data)) // left partition holds more than one element
 89     {
 90         cudaStream_t s;
 91         cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
 92         cdp_simple_quicksort << < 1, 1, 0, s >> >(data, left, nright, depth + 1);
 93         cudaStreamDestroy(s); // the handle is not used again after the launch
 94     }
 95     if ((lptr - data) < right) // right partition holds more than one element
 96     {
 97         cudaStream_t s1;
 98         cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
 99         cdp_simple_quicksort << < 1, 1, 0, s1 >> >(data, nleft, right, depth + 1);
100         cudaStreamDestroy(s1); // the handle is not used again after the launch
101     }
102 }
103 
104 // Host entry point: sorts the nitems values in the device buffer data by launching cdp_simple_quicksort with a single thread and waiting for it. Every CUDA call is checked with checkCudaErrors (helper_cuda.h) so a failure is reported instead of being silently ignored.
105 void run_qsort(unsigned int *data, unsigned int nitems)
106 {
107     // Set the device-runtime limit cudaLimitDevRuntimeSyncDepth to MAX_DEPTH before launching
108     checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));
109 
110     // Sort the inclusive index range [0, nitems - 1]
111     int right = nitems - 1;
112     std::cout << "Launching kernel on the GPU" << std::endl; 
113     cdp_simple_quicksort << < 1, 1 >> >(data, 0, right, 0);
114     checkCudaErrors(cudaGetLastError());      // a kernel launch returns no status, so query it explicitly
115     checkCudaErrors(cudaDeviceSynchronize()); // wait for the kernel and everything it launched; surfaces execution errors
116 }
117 
118 // Fill dst[0..nitems-1] with pseudo-random values in [0, nitems). The generator is seeded with the constant 2047 on every call.
119 void initialize_data(unsigned int *dst, unsigned int nitems)
120 {
121     srand(2047);
122     for (unsigned i = 0; i < nitems; i++)
123         dst[i] = rand() % nitems;
124 }
125 
126 // Validate the sort: copies the n values at device pointer results_d to the host and verifies they are in non-decreasing order. Prints "OK" on success; on the first out-of-order pair prints it and exits with EXIT_FAILURE.
127 void check_results(int n, unsigned int *results_d)
128 {
129     unsigned int *results_h = new unsigned[n];
130     checkCudaErrors(cudaMemcpy(results_h, results_d, n * sizeof(unsigned), cudaMemcpyDeviceToHost)); // a failed copy would leave results_h uninitialised, so it must not go unnoticed
131 
132     for (int i = 1; i < n; ++i)
133         if (results_h[i - 1] > results_h[i])
134         {
135             std::cout << "Invalid item[" << i - 1 << "]: " << results_h[i - 1] << " greater than " << results_h[i] << std::endl;
136             exit(EXIT_FAILURE);
137         }
138 
139     std::cout << "OK" << std::endl;
140     delete[] results_h;
141 }
142 
143 int main(int argc, char **argv) // Parses the command line (help/h, v, num_items, device), selects a GPU with compute capability >= 3.5, sorts num_items random values on it and validates the result.
144 {
145     int num_items = 128;
146     bool verbose = false;// print the generated h_data before sorting?
147 
148     // Help mode: print the usage line and quit
149     if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))
150     {
151         std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
152         exit(EXIT_SUCCESS);
153     }
154 
155     // Verbose mode: dump the random input array h_data
156     if (checkCmdLineFlag(argc, (const char **)argv, "v"))
157         verbose = true;
158 
159     // Array size given on the command line
160     if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
161     {
162         num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");
163         if (num_items < 1)
164         {
165             std::cerr << "ERROR: num_items has to be at least 1" << std::endl; // wording now matches the check above, which accepts 1
166             exit(EXIT_FAILURE);
167         }
168     }
169 
170     // Device selection
171     int device_count = 0, device = -1;
172     if (checkCmdLineFlag(argc, (const char **)argv, "device"))// a device was named on the command line
173     {
174         device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
175 
176         cudaDeviceProp properties;
177         checkCudaErrors(cudaGetDeviceProperties(&properties, device)); // checked: an invalid index must not lead to reading an unfilled struct
178 
179         if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
180             std::cout << "Running on GPU " << device << " (" << properties.name << ")" << std::endl;
181         else
182         {
183             std::cout << "ERROR: cdpsimpleQuicksort requires GPU devices with compute SM 3.5 or higher." << std::endl;
184             std::cout << "Current GPU device has compute SM" << properties.major << "." << properties.minor << ". Exiting..." << std::endl;
185             exit(EXIT_FAILURE);
186         }
187     }
188     else// no device named: take the first one with compute capability >= 3.5
189     {
190         checkCudaErrors(cudaGetDeviceCount(&device_count));
191         for (int i = 0; i < device_count; ++i)
192         {
193             cudaDeviceProp properties;
194             checkCudaErrors(cudaGetDeviceProperties(&properties, i));
195 
196             if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
197             {
198                 device = i;
199                 std::cout << "Running on GPU " << i << " (" << properties.name << ")" << std::endl;
200                 break;
201             }
202             std::cout << "GPU " << i << " (" << properties.name << ") does not support CUDA Dynamic Parallelism" << std::endl;
203         }
204     }
205 
206     if (device == -1) // the automatic search found no suitable GPU
207     {
208         std::cerr << "cdpSimpleQuicksort requires GPU devices with compute SM 3.5 or higher.  Exiting..." << std::endl;
209         exit(EXIT_WAIVED);
210     }
211 
212     checkCudaErrors(cudaSetDevice(device));
213 
214     // Create the data to sort
215     unsigned int *h_data = 0;
216     unsigned int *d_data = 0;
217 
218     std::cout << "Initializing data." << std::endl;
219     h_data = (unsigned int *)malloc(num_items * sizeof(unsigned int));
220     initialize_data(h_data, num_items);
221 
222     if (verbose)
223     {
224         for (int i = 0; i<num_items; i++)
225             std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
226     }
227 
228     // Copy the input into device memory
229     checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
230     checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));
231 
232     // Run the quicksort entry point
233     std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
234     run_qsort(d_data, num_items);
235 
236     // Validate the result
237     std::cout << "Validating results: ";
238     check_results(num_items, d_data);
239 
240     free(h_data);
241     checkCudaErrors(cudaFree(d_data));
242 
243     getchar(); // waits for input before the process exits
244     exit(EXIT_SUCCESS);
245 }

 

▶ 输出结果:

Running on GPU 0 (GeForce GTX 1070)
Initializing data.
Running quicksort on 128 elements
Launching kernel on the GPU
Validating results: OK

 

▶ 新姿势:

● C++动态数组

1 unsigned int *h = new unsigned[n];
2 delete[] h;

 

● checkCmdLineFlag 用于检查命令行参数 argv 中是否出现了名为 string_ref 的选项:先去掉参数开头的 - ,再取 = 之前的部分,与 string_ref 做不区分大小写的比较(定义于helper_string.h中)

 1 inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) // returns true when some argv[i], i >= 1, names the flag string_ref (argv[0] is skipped)
 2 {
 3     bool bFound = false;
 4     if (argc >= 1)
 5     {
 6         for (int i = 1; i < argc; i++)
 7         {
 8             int string_start = stringRemoveDelimiter('-', argv[i]); // index just past the leading '-' characters
 9             const char *string_argv = &argv[i][string_start];
10 
11             const char *equal_pos = strchr(string_argv, '='); // the flag name ends at the first '=' if there is one
12             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
13 
14             int length = (int)strlen(string_ref);
15 
16             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) // same length and equal under STRNCASECMP
17             {
18                 bFound = true;
19                 continue;
20             }
21         }
22     }
23     return bFound;
24 }
25 
26 // stringRemoveDelimiter returns the index of the first character of string that follows its leading run of delimiter characters; the function above uses it to skip the - or -- in front of an argument. It returns 0 instead when that index is >= strlen(string) - 1.
27 inline int stringRemoveDelimiter(char delimiter, const char *string)
28 {
29     int string_start = 0;
30 
31     while (string[string_start] == delimiter) // count the leading delimiter characters
32     {
33         string_start++;
34     }
35 
36     if (string_start >= (int)strlen(string) - 1) // at most one character would remain after the delimiters
37     {
38         return 0;
39     }
40 
41     return string_start;
42 }
43 
44 // STRNCASECMP is the string-comparison macro used above; here it expands to _strnicmp, whose declaration (quoted from string.h according to the original note) follows
45 #define STRNCASECMP _strnicmp
46 
47 _ACRTIMP int __cdecl _strnicmp
48 (
49     _In_reads_or_z_(_MaxCount) char const* _String1,
50     _In_reads_or_z_(_MaxCount) char const* _String2,
51     _In_                       size_t      _MaxCount
52 );

 

● getCmdLineArgumentInt 用于提取命令行参数 argv 中名为 string_ref 的选项所带的整数值(形如 name=123);没有找到该选项时返回 0(定义于helper_string.h中)

 1 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) // returns the integer given for option string_ref (e.g. name=123), or 0 when the option is absent or carries no value
 2 {
 3     bool bFound = false;
 4     int value = -1;
 5 
 6     if (argc >= 1)
 7     {
 8         for (int i = 1; i < argc; i++)
 9         {
10             int string_start = stringRemoveDelimiter('-', argv[i]); // index just past the leading '-' characters
11             const char *string_argv = &argv[i][string_start];
12             int length = (int)strlen(string_ref);
13 
14             if (!STRNCASECMP(string_argv, string_ref, length)) // compares only the first length characters
15             {
16                 if (length + 1 <= (int)strlen(string_argv))
17                 {
18                     int auto_inc = (string_argv[length] == '=') ? 1 : 0; // step over the '=' when it directly follows the name
19                     value = atoi(&string_argv[length + auto_inc]);
20                 }
21                 else
22                     value = 0;
23                 bFound = true;
24                 continue; // keep scanning: the last matching argument determines the result
25             }
26         }
27     }
28     if (bFound)
29         return value;
30     else
31         return 0;
32 }

 

● cudaDeviceSetLimit 用于设置 CUDA 设备各项资源限制的大小;源代码中配合 cudaLimitDevRuntimeSyncDepth 使用,把设备端运行时的同步嵌套深度上限设为 MAX_DEPTH,与最大递归深度保持一致

extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);

 

● 带有标志参数 flags 的 cudaStreamCreateWithFlags,用于指定流的行为(例如 cudaStreamNonBlocking 表示该流不与默认流隐式同步);若要设置流的优先级,应使用 cudaStreamCreateWithPriority

extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);

  对比 cudaStreamCreate

extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);

 

以上是关于0_Simple__cdpSimpleQuicksort的主要内容,如果未能解决你的问题,请参考以下文章

0_Simple__cppOverload

0_Simple__cudaOpenMP

0_Simple__simpleCallback

0_Simple__cdpSimpleQuicksort

0_Simple__clock

0_Simple__cppIntegration