CUDA Thrust 大幅减少
Posted
技术标签:
【中文标题】CUDA Thrust 大幅减少【英文标题】:Strided reduction by CUDA Thrust 【发布时间】:2014-09-11 02:14:22 【问题描述】:我有一个具有这种结构的顶点数组:
[x0, y0, z0, empty float, x1, y1, z1, empty float, x2, y2, z2, empty float, ...]
我需要使用 CUDA 找到 minX
、minY
、minZ
、maxX
、maxY
和 maxZ
。我写了一个适当的缩减算法,但它似乎有点太慢了。我决定使用 THRUST 库。有一个高度优化的reduce()
,甚至更好的minmax_element()
,这是一种同时找到数组的最大值和最小值的方法,但我找不到一个快速的方法来使用每个4
th 索引.将数据复制到 3
分隔数组不是我正在寻找的解决方案。
有没有办法(使用 Thrust 迭代器或类似的技巧)将步幅传递给 reduce()
?
【问题讨论】:
或许这篇cuda thrust library repeat vector multiple times 的帖子可以提供一些提示。 可以使用跨度范围、推力 minmax_element 和 3 次推力调用来产生 6 个结果,类似于 here。或者使用带有跨步范围迭代器的thrust::reduce_by_key 将其减少到2 个调用(一个产生一组最大结果,另一个产生一组最小结果),与here 相关。目前我想不出一种方法可以通过一次推力调用获得所有 6 个结果。 对于优化好的代码,这个操作很可能会受到内存带宽的限制。我认为推力实现可能不如编写良好的内核那么高效(因为您不希望重新组织数据),它可以为每个线程加载一个float4
结构元素(因此完全合并),并计算 (通过并行归约)同时运行 6 个归约,同时获得 6 个所需结果。
如果数组中的元素真的很密集,你应该可以将reinterpret_cast
指向第一个float
的指针指向float4
的指针。将float4 *
赋予thrust::reduce
并编写一个特殊的归约函子来计算所有六个值。
@JaredHoberock 如果归约 binary_op 函子接受 2 个 float4
值,它是否必须产生 float4
结果?如何将 6 个 float
数量打包到 float4
结果中?
【参考方案1】:
您可以使用 strided range method 以及对 thrust::minmax_element 的 3 次调用来获得所需的结果,而无需修改数据存储。
这是一个有效的例子:
$ cat t491.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <thrust/copy.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/transform_reduce.h>
#define DSIZE (1048576*2)
#define SSIZE 4
#define FSIZE (DSIZE*SSIZE)
#define nTPB 256
#define BSIZE nTPB
#define nBLKS 64
#define FLOAT_MIN (-99999)
#define FLOAT_MAX 99999
typedef thrust::tuple<float, float, float, float, float, float> tpl6;
struct expand_functor
__host__ __device__
tpl6 operator()(const float4 a)
tpl6 result;
result.get<0>() = a.x;
result.get<1>() = a.x;
result.get<2>() = a.y;
result.get<3>() = a.y;
result.get<4>() = a.z;
result.get<5>() = a.z;
return result;
;
struct minmax3_functor
__host__ __device__
tpl6 operator()(const tpl6 a, const tpl6 b)
tpl6 result;
result.get<0>() = (a.get<0>() < b.get<0>()) ? a.get<0>():b.get<0>();
result.get<1>() = (a.get<1>() > b.get<1>()) ? a.get<1>():b.get<1>();
result.get<2>() = (a.get<2>() < b.get<2>()) ? a.get<2>():b.get<2>();
result.get<3>() = (a.get<3>() > b.get<3>()) ? a.get<3>():b.get<3>();
result.get<4>() = (a.get<4>() < b.get<4>()) ? a.get<4>():b.get<4>();
result.get<5>() = (a.get<5>() > b.get<5>()) ? a.get<5>():b.get<5>();
return result;
;
__device__ int bcount = 0;
__device__ float xmins[nBLKS];
__device__ float xmaxs[nBLKS];
__device__ float ymins[nBLKS];
__device__ float ymaxs[nBLKS];
__device__ float zmins[nBLKS];
__device__ float zmaxs[nBLKS];
__global__ void my_minmax3(float4 *data, float *results, size_t dsize)
// assumes power-of-2 threadblock size
// assumes nBLKS <= nTPB, nBLKS also power-of-2
__shared__ float xmin[BSIZE], xmax[BSIZE], ymin[BSIZE], ymax[BSIZE], zmin[BSIZE], zmax[BSIZE];
__shared__ int lblock;
float my_xmin = FLOAT_MAX;
float my_ymin = FLOAT_MAX;
float my_zmin = FLOAT_MAX;
float my_xmax = FLOAT_MIN;
float my_ymax = FLOAT_MIN;
float my_zmax = FLOAT_MIN;
int idx = threadIdx.x+blockDim.x*blockIdx.x;
while (idx < dsize)
float4 my_temp = data[idx];
if (my_xmin > my_temp.x) my_xmin = my_temp.x;
if (my_ymin > my_temp.y) my_ymin = my_temp.y;
if (my_zmin > my_temp.z) my_zmin = my_temp.z;
if (my_xmax < my_temp.x) my_xmax = my_temp.x;
if (my_ymax < my_temp.y) my_ymax = my_temp.y;
if (my_zmax < my_temp.z) my_zmax = my_temp.z;
idx += blockDim.x*gridDim.x;
xmin[threadIdx.x] = my_xmin;
ymin[threadIdx.x] = my_ymin;
zmin[threadIdx.x] = my_zmin;
xmax[threadIdx.x] = my_xmax;
ymax[threadIdx.x] = my_ymax;
zmax[threadIdx.x] = my_zmax;
__syncthreads();
for (int i = blockDim.x/2; i > 0; i>>=1)
if (threadIdx.x < i)
if (xmin[threadIdx.x] > xmin[threadIdx.x+i]) xmin[threadIdx.x] = xmin[threadIdx.x + i];
if (ymin[threadIdx.x] > ymin[threadIdx.x+i]) ymin[threadIdx.x] = ymin[threadIdx.x + i];
if (zmin[threadIdx.x] > zmin[threadIdx.x+i]) zmin[threadIdx.x] = zmin[threadIdx.x + i];
if (xmax[threadIdx.x] < xmax[threadIdx.x+i]) xmax[threadIdx.x] = xmax[threadIdx.x + i];
if (ymax[threadIdx.x] < ymax[threadIdx.x+i]) ymax[threadIdx.x] = ymax[threadIdx.x + i];
if (zmax[threadIdx.x] < zmax[threadIdx.x+i]) zmax[threadIdx.x] = zmax[threadIdx.x + i];
__syncthreads();
if (!threadIdx.x)
xmins[blockIdx.x] = xmin[0];
xmaxs[blockIdx.x] = xmax[0];
ymins[blockIdx.x] = ymin[0];
ymaxs[blockIdx.x] = ymax[0];
zmins[blockIdx.x] = zmin[0];
zmaxs[blockIdx.x] = zmax[0];
__threadfence();
if (atomicAdd(&bcount, 1) == (nBLKS-1)) lblock = 1;
else lblock = 0;
__syncthreads();
if (lblock) // last block does final reduction
if (threadIdx.x < nBLKS)
xmin[threadIdx.x] = xmins[threadIdx.x];
xmax[threadIdx.x] = xmaxs[threadIdx.x];
ymin[threadIdx.x] = ymins[threadIdx.x];
ymax[threadIdx.x] = ymaxs[threadIdx.x];
zmin[threadIdx.x] = zmins[threadIdx.x];
zmax[threadIdx.x] = zmaxs[threadIdx.x];
__syncthreads();
for (int i = nBLKS/2; i > 0; i>>=1)
if (threadIdx.x < i)
if (xmin[threadIdx.x] > xmin[threadIdx.x+i]) xmin[threadIdx.x] = xmin[threadIdx.x + i];
if (ymin[threadIdx.x] > ymin[threadIdx.x+i]) ymin[threadIdx.x] = ymin[threadIdx.x + i];
if (zmin[threadIdx.x] > zmin[threadIdx.x+i]) zmin[threadIdx.x] = zmin[threadIdx.x + i];
if (xmax[threadIdx.x] < xmax[threadIdx.x+i]) xmax[threadIdx.x] = xmax[threadIdx.x + i];
if (ymax[threadIdx.x] < ymax[threadIdx.x+i]) ymax[threadIdx.x] = ymax[threadIdx.x + i];
if (zmax[threadIdx.x] < zmax[threadIdx.x+i]) zmax[threadIdx.x] = zmax[threadIdx.x + i];
__syncthreads();
if (!threadIdx.x)
results[0] = xmin[0];
results[1] = xmax[0];
results[2] = ymin[0];
results[3] = ymax[0];
results[4] = zmin[0];
results[5] = zmax[0];
template <typename Iterator>
class strided_range
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
difference_type stride;
stride_functor(difference_type stride)
: stride(stride)
__host__ __device__
difference_type operator()(const difference_type& i) const
return stride * i;
;
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride)
iterator begin(void) const
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
iterator end(void) const
return begin() + ((last - first) + (stride - 1)) / stride;
protected:
Iterator first;
Iterator last;
difference_type stride;
;
typedef thrust::device_vector<float>::iterator Iter;
typedef strided_range<Iter>::iterator sIter;
int main()
// set up test data
cudaEvent_t start, stop;
float et;
cudaEventCreate(&start); cudaEventCreate(&stop);
thrust::host_vector<float> h_vals(FSIZE);
for (int i = 0; i < DSIZE; i ++)
h_vals[i*SSIZE + 0] = rand()/(float)RAND_MAX;
h_vals[i*SSIZE + 1] = rand()/(float)RAND_MAX;
h_vals[i*SSIZE + 2] = rand()/(float)RAND_MAX;
h_vals[i*SSIZE + 3] = 0.0f;
thrust::device_vector<float> d_vals = h_vals;
// set up strided ranges
strided_range<Iter> item_x(d_vals.begin() , d_vals.end(), SSIZE);
strided_range<Iter> item_y(d_vals.begin()+1, d_vals.end(), SSIZE);
strided_range<Iter> item_z(d_vals.begin()+2, d_vals.end(), SSIZE);
// find min and max
cudaEventRecord(start);
thrust::pair<sIter, sIter> result_x = thrust::minmax_element(item_x.begin(), item_x.end());
thrust::pair<sIter, sIter> result_y = thrust::minmax_element(item_y.begin(), item_y.end());
thrust::pair<sIter, sIter> result_z = thrust::minmax_element(item_z.begin(), item_z.end());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
std::cout << "thrust elapsed time: " << et << "ms" << std::endl;
std::cout << "thrust results: " << std::endl;
std::cout << "x min element = " << *(result_x.first) << std::endl;
std::cout << "x max element = " << *(result_x.second) << std::endl;
std::cout << "y min element = " << *(result_y.first) << std::endl;
std::cout << "y max element = " << *(result_y.second) << std::endl;
std::cout << "z min element = " << *(result_z.first) << std::endl;
std::cout << "z max element = " << *(result_z.second) << std::endl;
float *h_results, *d_results;
h_results = new float[6];
cudaMalloc(&d_results, 6*sizeof(float));
cudaEventRecord(start);
my_minmax3<<<nBLKS,nTPB>>>((float4 *)thrust::raw_pointer_cast(d_vals.data()), d_results, DSIZE);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
cudaMemcpy(h_results, d_results, 6*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "kernel elapsed time: " << et << "ms" << std::endl;
std::cout << "kernel results: " << std::endl;
std::cout << "x min element = " << h_results[0] << std::endl;
std::cout << "x max element = " << h_results[1] << std::endl;
std::cout << "y min element = " << h_results[2] << std::endl;
std::cout << "y max element = " << h_results[3] << std::endl;
std::cout << "z min element = " << h_results[4] << std::endl;
std::cout << "z max element = " << h_results[5] << std::endl;
thrust::device_ptr<float4> dptr_vals = thrust::device_pointer_cast(reinterpret_cast<float4 *>( thrust::raw_pointer_cast(d_vals.data())));
tpl6 my_init;
my_init.get<0>() = FLOAT_MAX;
my_init.get<1>() = FLOAT_MIN;
my_init.get<2>() = FLOAT_MAX;
my_init.get<3>() = FLOAT_MIN;
my_init.get<4>() = FLOAT_MAX;
my_init.get<5>() = FLOAT_MIN;
cudaEventRecord(start);
tpl6 my_result = thrust::transform_reduce(dptr_vals, dptr_vals + DSIZE, expand_functor(), my_init, minmax3_functor());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
cudaMemcpy(h_results, d_results, 6*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "thrust2 elapsed time: " << et << "ms" << std::endl;
std::cout << "thrust2 results: " << std::endl;
std::cout << "x min element = " << my_result.get<0>() << std::endl;
std::cout << "x max element = " << my_result.get<1>() << std::endl;
std::cout << "y min element = " << my_result.get<2>() << std::endl;
std::cout << "y max element = " << my_result.get<3>() << std::endl;
std::cout << "z min element = " << my_result.get<4>() << std::endl;
std::cout << "z max element = " << my_result.get<5>() << std::endl;
return 0;
$ nvcc -O3 -arch=sm_20 -o t491 t491.cu
$ ./t491
thrust elapsed time: 3.88784ms
thrust results:
x min element = 1.16788e-06
x max element = 0.999998
y min element = 2.85916e-07
y max element = 1
z min element = 1.72295e-08
z max element = 0.999999
kernel elapsed time: 0.462848ms
kernel results:
x min element = 1.16788e-06
x max element = 0.999998
y min element = 2.85916e-07
y max element = 1
z min element = 1.72295e-08
z max element = 0.999999
thrust2 elapsed time: 1.29728ms
thrust2 results:
x min element = 1.16788e-06
x max element = 0.999998
y min element = 2.85916e-07
y max element = 1
z min element = 1.72295e-08
z max element = 0.999999
$
我已经更新了上面的示例,以包含一个“优化的”缩减内核进行比较,该内核在单个内核调用中执行所有 6 次缩减(最小和最大操作)。
正如预期的那样,这种方法比 3 次单独的推力调用更快地产生相同的结果,在我的情况下大约快 5-8 倍(RHEL5.5、Quadro5000、CUDA 6.5RC),具体取决于数据大小。请注意,尽管我在这里选择了 2 的幂的数据大小 (DSIZE
),但整个示例对于任意数据大小都可以正常工作。为了简洁起见,我省略了proper cuda error checking。
编辑:根据@JaredHoberock 的建议,我添加了第三种方法,允许一次调用thrust::transform_reduce
产生所有6 个结果。这些是上面的“thrust2”结果。这种方法比第一种(三推力调用)方法快大约 3 倍。仍然不如 cuda 内核方法快,但也许这种推力方法可以进一步优化。
【讨论】:
由于这是一个并行缩减问题,请注意Kepler architecture introduced Warp Shuffle intrinsics,它可用于加速缩减之前的共享内存模式。【参考方案2】:这是strided range example的一个应用程序。
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 2
template <typename Iterator>
class strided_range
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
difference_type stride;
stride_functor(difference_type stride)
: stride(stride)
__host__ __device__
difference_type operator()(const difference_type& i) const
return stride * i;
;
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride)
iterator begin(void) const
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
iterator end(void) const
return begin() + ((last - first) + (stride - 1)) / stride;
protected:
Iterator first;
Iterator last;
difference_type stride;
;
int main(void)
const int N = 9;
thrust::host_vector<int> h_data(N);
for (int i=0; i<N; i++) h_data[i] = i;
thrust::device_vector<int> data(h_data);
typedef thrust::device_vector<int>::iterator Iterator;
strided_range<Iterator> pos(data.begin(), data.end(), STRIDE);
int sum = thrust::reduce(pos.begin(), pos.end(), (int) 0, thrust::plus<int>());
printf("sum = %i\n",sum);
return 0;
【讨论】:
【参考方案3】:我按照这个文档优化了我的内核:http://www.cuvilib.com/Reduction.pdf
同样的文档(文件更小)也可以在这里找到:https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
现在看起来像这样:
template<unsigned int blockSize>
__global__ void minMaxReduction_k(float *g_minData, float *g_maxData, float *g_minOutput, float *g_maxOutput, unsigned int n)
extern __shared__ float shared[];
float* minSdata = (float*)shared;
float* maxSdata = (float*)&minSdata[4*blockDim.x];
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
minSdata[4*tid] = FLT_MAX;
minSdata[4*tid+1] = FLT_MAX;
minSdata[4*tid+2] = FLT_MAX;
maxSdata[4*tid] = -FLT_MAX;
maxSdata[4*tid+1] = -FLT_MAX;
maxSdata[4*tid+2] = -FLT_MAX;
while(i<n)
minSdata[4*tid] = fminf(fminf(minSdata[4*tid], g_minData[4*i]), g_minData[4*(i+blockDim.x)]);
minSdata[4*tid+1] = fminf(fminf(minSdata[4*tid+1], g_minData[4*i+1]), g_minData[4*(i+blockDim.x)+1]);
minSdata[4*tid+2] = fminf(fminf(minSdata[4*tid+2], g_minData[4*i+2]), g_minData[4*(i+blockDim.x)+2]);
maxSdata[4*tid] = fmaxf(fmaxf(maxSdata[4*tid], g_maxData[4*i]), g_maxData[4*(i+blockDim.x)]);
maxSdata[4*tid+1] = fmaxf(fmaxf(maxSdata[4*tid+1], g_maxData[4*i+1]), g_maxData[4*(i+blockDim.x)+1]);
maxSdata[4*tid+2] = fmaxf(fmaxf(maxSdata[4*tid+2], g_maxData[4*i+2]), g_maxData[4*(i+blockDim.x)+2]);
i+=gridSize;
__syncthreads();
if(blockSize >= 1024)
if(tid < 512)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+512)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+512)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+512)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+512)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+512)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+512)+2]);
__syncthreads();
if(blockSize >= 512)
if(tid < 256)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+256)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+256)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+256)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+256)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+256)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+256)+2]);
__syncthreads();
if(blockSize >= 256)
if(tid < 128)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+128)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+128)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+128)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+128)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+128)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+128)+2]);
__syncthreads();
if(blockSize >= 128)
if(tid < 64)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+64)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+64)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+64)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+64)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+64)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+64)+2]);
__syncthreads();
if(tid<32)
if (blockSize >= 64)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+32)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+32)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+32)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+32)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+32)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+32)+2]);
//
if (blockSize >= 32)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+16)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+16)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+16)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+16)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+16)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+16)+2]);
//
if (blockSize >= 16)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+8)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+8)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+8)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+8)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+8)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+8)+2]);
//
if (blockSize >= 8)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+4)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+4)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+4)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+4)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+4)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+4)+2]);
//
if (blockSize >= 4)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+2)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+2)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+2)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+2)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+2)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+2)+2]);
//
if (blockSize >= 2)
minSdata[4*tid] = fminf(minSdata[4*tid], minSdata[4*(tid+1)]);
minSdata[4*tid+1] = fminf(minSdata[4*tid+1], minSdata[4*(tid+1)+1]);
minSdata[4*tid+2] = fminf(minSdata[4*tid+2], minSdata[4*(tid+1)+2]);
maxSdata[4*tid] = fmaxf(maxSdata[4*tid], maxSdata[4*(tid+1)]);
maxSdata[4*tid+1] = fmaxf(maxSdata[4*tid+1], maxSdata[4*(tid+1)+1]);
maxSdata[4*tid+2] = fmaxf(maxSdata[4*tid+2], maxSdata[4*(tid+1)+2]);
// write result for this block to global mem
if (tid == 0)
g_minOutput[blockIdx.x] = minSdata[0];
g_minOutput[blockIdx.x+1] = minSdata[1];
g_minOutput[blockIdx.x+2] = minSdata[2];
g_maxOutput[blockIdx.x] = maxSdata[0];
g_maxOutput[blockIdx.x+1] = maxSdata[1];
g_maxOutput[blockIdx.x+2] = maxSdata[2];
这样调用:
float *d_minOutput;
float *d_maxOutput;
int tempN = n;
while(tempN>1)
getNumBlocksAndThreads(tempN, 65535, 1024, blocks, threads);
cudaMalloc((void **)&d_minOutput, 4*(sizeof(float)*blocks));
cudaMalloc((void **)&d_maxOutput, 4*(sizeof(float)*blocks));
int smem = (threads <= 32) ? 2*2*4*threads*sizeof(float) : 2*4*threads*sizeof(float);
switch(threads)
case 1024:
minMaxReduction_k<1024><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 512:
minMaxReduction_k<512><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 256:
minMaxReduction_k<256><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 128:
minMaxReduction_k<128><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 64:
minMaxReduction_k<64><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 32:
minMaxReduction_k<32><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 16:
minMaxReduction_k<16><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 8:
minMaxReduction_k<8><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 4:
minMaxReduction_k<4><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 2:
minMaxReduction_k<2><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
case 1:
minMaxReduction_k<1><<< blocks, threads, smem >>>(d_minData, d_maxData, d_minOutput, d_maxOutput, tempN);
break;
tempN = blocks;
cudaMemcpy(d_minData, d_minOutput, 4*(sizeof(float)*blocks), cudaMemcpyDeviceToDevice);
cudaMemcpy(d_maxData, d_maxOutput, 4*(sizeof(float)*blocks), cudaMemcpyDeviceToDevice);
cudaFree(d_minOutput);
cudaFree(d_maxOutput);
助手:
void UniformGrid::getNumBlocksAndThreads(unsigned int n, unsigned int maxBlocks, unsigned int maxThreads, unsigned int &blocks, unsigned int &threads)
//get device capability, to avoid block/grid size excceed the upbound
cudaDeviceProp prop;
int device;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
if ((float)threads*blocks > (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock)
printf("n is too large, please choose a smaller number!\n");
if (blocks > (unsigned int) prop.maxGridSize[0])
printf("Grid size <%d> excceeds the device capability <%d>, set block size as %d (original %d)\n",
blocks, prop.maxGridSize[0], threads*2, threads);
blocks /= 2;
threads *= 2;
unsigned int UniformGrid::nextPow2(unsigned int x)
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
几天后我可能会测试哪种解决方案更快。
【讨论】:
我想你可以用for
循环替换重复的if(blockSize >= 1024)
,从你的代码中删除至少80%的行,在调用中完全删除switch
(你可以直接调用minMaxReduction_k<threads>...
等。 以上是关于CUDA Thrust 大幅减少的主要内容,如果未能解决你的问题,请参考以下文章
在thrust::device_vector (CUDA Thrust) 上的thrust::min_element 崩溃