Computing the sum of an array with a reduction in CUDA C
The problem I am running into is that when I run my CUDA program, it always reports that an illegal memory access was encountered at line 167 of stats_gpu.cu.
The code is as follows:
#include <stdio.h>
#include <float.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
#include <cuda.h>
#define MAXIMUM_VALUE 1000000.0f
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
void HandleError( cudaError_t err, const char *file, int line ) {
//
// Handle and report on CUDA errors.
//
if ( err != cudaSuccess ) {
printf( "%s in %s at line %d
", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
void checkCUDAError( const char *msg, bool exitOnError ) {
//
// Check cuda error and print result if appropriate.
//
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err) );
if (exitOnError) {
exit(-1);
}
}
}
void cleanupCuda( void ) {
//
// Clean up CUDA resources.
//
//
// Explicitly cleans up all runtime-related resources associated with the
// calling host thread.
//
HANDLE_ERROR(
cudaThreadExit()
);
}
__device__ double device_pow( double x, double y ) {
//
// Calculate x^y on the GPU.
//
return pow( x, y );
}
//
// PLACE GPU KERNELS HERE - BEGIN
//
__global__ void kernel_calculate_sum(double *inputArray, double *outputArray, unsigned int n){
extern __shared__ double sdata[];
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int tid = threadIdx.x;
sdata[tid] = 0;
while(i < n){
sdata[tid] += inputArray[i];
i += blockDim.x * gridDim.x;
}
__syncthreads();
for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if(tid < s){
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if(tid == 0){
outputArray[blockIdx.x] = sdata[0];
}
}
//
// PLACE GPU KERNELS HERE - END
//
int main( int argc, char* argv[] ) {
//
// Determine min, max, mean, mode and standard deviation of array
//
unsigned int array_size, seed, i;
struct timeval start, end;
float runtime;
if( argc < 3 ) {
printf( "Format: stats_gpu <size of array> <random seed>
" );
printf( "Arguments:
" );
printf( " size of array - This is the size of the array to be generated and processed
" );
printf( " random seed - This integer will be used to seed the random number
" );
printf( " generator that will generate the contents of the array
" );
printf( " to be processed
" );
exit( 1 );
}
//
// Get the size of the array to process.
//
array_size = atoi( argv[1] );
//
// Get the seed to be used
//
seed = atoi( argv[2] );
//
// Make sure that CUDA resources get cleaned up on exit.
//
atexit( cleanupCuda );
//
// Record the start time.
//
gettimeofday( &start, NULL );
//
// Allocate the array to be populated.
//
double *array = (double *) malloc( array_size * sizeof( double ) );
//
// Seed the random number generator and populate the array with its values.
//
srand( seed );
for( i = 0; i < array_size; i++ )
array[i] = ( (double) rand() / (double) RAND_MAX ) * MAXIMUM_VALUE;
//
// Setup output variables to hold min, max, mean, and standard deviation
//
// YOUR CALCULATIONS BELOW SHOULD POPULATE THESE WITH RESULTS
//
double min = DBL_MAX;
double max = 0;
double sum = 0;
double mean = 0;
double stddev = 0;
unsigned int threadsPerBlock = 512;
unsigned int blocksPerGrid = (65535 < (array_size + threadsPerBlock - 1) / threadsPerBlock)?65535:(array_size + threadsPerBlock - 1) / threadsPerBlock;
double *dev_input_array, *dev_output_array, *outputArray;
outputArray = (double*)malloc( blocksPerGrid * sizeof(double) );
//
// CALCULATE VALUES FOR MIN, MAX, MEAN, and STDDEV - BEGIN
//
//Allocate memory on GPU
HANDLE_ERROR( cudaMalloc( (void**) &dev_input_array, array_size * sizeof( double ) ) );
HANDLE_ERROR( cudaMalloc( (void**) &dev_output_array, blocksPerGrid * sizeof( double ) ) );
//Copy data from host to GPU
HANDLE_ERROR( cudaMemcpy( dev_input_array, array, array_size * sizeof(double), cudaMemcpyHostToDevice ) );
//Execute kernel on GPU
kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size);
//Copy data from GPU to host
HANDLE_ERROR( cudaMemcpy(outputArray, dev_output_array, blocksPerGrid * sizeof( double ), cudaMemcpyDeviceToHost));
for(unsigned int j = 0; j < blocksPerGrid; j++){
sum += outputArray[j];
}
//Deallocate memory on GPU
cudaFree(dev_input_array);
cudaFree(dev_output_array);
//
// CALCULATE VALUES FOR MIN, MAX, MEAN, and STDDEV - END
//
//
// Record the end time.
//
gettimeofday( &end, NULL );
//
// Calculate the runtime.
//
runtime = ( ( end.tv_sec - start.tv_sec ) * 1000.0 ) + ( ( end.tv_usec - start.tv_usec ) / 1000.0 );
//
// Output discoveries from the array.
//
printf( "Statistics for array ( %d, %d ):
", array_size, seed );
printf( " Minimum = %4.6f, Maximum = %4.6f
", min, max );
printf( " Mean = %4.6f, Standard Deviation = %4.6f
", mean, stddev );
printf( "Processing Time: %4.4f milliseconds
", runtime );
//
// Free the allocated array.
//
free( array );
free(outputArray);
return 0;
}
Answer
The reported error occurs on the cudaMemcpy operation after the kernel call:
HANDLE_ERROR( cudaMemcpy(outputArray, dev_output_array, blocksPerGrid * sizeof( double ), cudaMemcpyDeviceToHost));
(In the future, it is helpful to others if you identify which line of your code is line 167.)
CUDA kernel error reporting is asynchronous, so this error actually indicates that there is an illegal memory access in your kernel code.
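As an aside, here is a minimal sketch (my own illustration, not part of the original answer, reusing the post's HANDLE_ERROR macro) of how to surface such kernel errors synchronously at the launch site instead of at a later API call:

kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size); // launch as posted
HANDLE_ERROR( cudaGetLastError() );      // reports launch-configuration errors immediately
HANDLE_ERROR( cudaDeviceSynchronize() ); // waits for the kernel and reports errors raised while it ran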
If you run your code with cuda-memcheck, you will see many indications like this:
========= Invalid __shared__ write of size 8
========= at 0x00000068 in /.../....cu:64:kernel_calculate_sum(double*, double*, unsigned int)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x00000000 is out of bounds
...
So there is something wrong with your shared memory accesses. When using dynamically allocated shared memory, as here:
extern __shared__ double sdata[];
it is necessary to pass the size of the shared memory allocation as a kernel launch parameter. But you are not doing that:
kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size);
                                                     ^
                                                     missing shared size parameter
So your shared memory accesses fail. A simple modification to your kernel call:
kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock, threadsPerBlock*sizeof(double)>>>(dev_input_array, dev_output_array, array_size);
fixes the error. This is necessary because your kernel expects one double quantity per thread, so you must reserve the size of your threadblock multiplied by the size of double.
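As a further aside, here is a sketch of an alternative (the kernel name kernel_calculate_sum_static is hypothetical): if the block size is fixed at compile time, statically sized shared memory is allocated by the compiler and no third launch parameter is needed at all.

#define THREADS_PER_BLOCK 512

__global__ void kernel_calculate_sum_static(double *inputArray, double *outputArray, unsigned int n){
    // Statically sized shared memory: the size is known at compile time,
    // so the <<<blocks, threads>>> launch needs no dynamic shared memory argument.
    __shared__ double sdata[THREADS_PER_BLOCK];
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int tid = threadIdx.x;
    // Grid-stride accumulation into each thread's shared-memory slot.
    sdata[tid] = 0;
    while(i < n){
        sdata[tid] += inputArray[i];
        i += blockDim.x * gridDim.x;
    }
    __syncthreads();
    // Tree reduction within the block (assumes a power-of-two block size).
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        if(tid < s){
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if(tid == 0){
        outputArray[blockIdx.x] = sdata[0];
    }
}

The launch would then simply be kernel_calculate_sum_static<<<blocksPerGrid, THREADS_PER_BLOCK>>>(dev_input_array, dev_output_array, array_size); at the cost of fixing the block size at compile time.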