使用cuda c减少计算数组的总和

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用cuda c减少计算数组的总和相关的知识,希望对你有一定的参考价值。

我遇到的一个问题是,当我运行我的cuda程序时,总是说在第167行的stats_gpu.cu中遇到了非法的内存访问。

代码如下:

#include <stdio.h>
#include <float.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
#include <cuda.h>

#define MAXIMUM_VALUE   1000000.0f
#define HANDLE_ERROR( err )  ( HandleError( err, __FILE__, __LINE__ ) )

void HandleError( cudaError_t err, const char *file, int line ) {
  //
  // Handle and report on CUDA errors.
  //
  if ( err != cudaSuccess ) {
    printf( "%s in %s at line %d
", cudaGetErrorString( err ), file, line );

    exit( EXIT_FAILURE );
  }
}

void checkCUDAError( const char *msg, bool exitOnError ) {
  //
  // Check cuda error and print result if appropriate.
  //
  cudaError_t err = cudaGetLastError();

  if( cudaSuccess != err) {
      fprintf(stderr, "Cuda error: %s: %s.
", msg, cudaGetErrorString(err) );
      if (exitOnError) {
        exit(-1);
      }
  }
}

void cleanupCuda( void ) {
  //
  // Clean up CUDA resources.
  //

  //
  // Explicitly cleans up all runtime-related resources associated with the
  // calling host thread.
  //
  HANDLE_ERROR(
         cudaThreadExit()
         );
}

__device__ double device_pow( double x, double y ) {
  //
  // Calculate x^y on the GPU.
  //
  return pow( x, y );
}

//
// PLACE GPU KERNELS HERE - BEGIN
//
__global__ void kernel_calculate_sum(double *inputArray, double *outputArray, unsigned int n){
    extern __shared__ double sdata[];
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = 0;
    while(i < n){
        sdata[tid] += inputArray[i];
        i += blockDim.x * gridDim.x;
    }
    __syncthreads();
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        if(tid < s){
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if(tid == 0){
        outputArray[blockIdx.x] = sdata[0];
    }

}

//
// PLACE GPU KERNELS HERE - END
//

int main( int argc, char* argv[] ) {
  //
  // Determine min, max, mean, mode and standard deviation of array
  //
  unsigned int array_size, seed, i;
  struct timeval start, end;
  float runtime;

  if( argc < 3 ) {
    printf( "Format: stats_gpu <size of array> <random seed>
" );
    printf( "Arguments:
" );
    printf( "  size of array - This is the size of the array to be generated and processed
" );
    printf( "  random seed   - This integer will be used to seed the random number
" );
    printf( "                  generator that will generate the contents of the array
" );
    printf( "                  to be processed
" );

    exit( 1 );
  }

  //
  // Get the size of the array to process.
  //
  array_size = atoi( argv[1] );

  //
  // Get the seed to be used
  //
  seed = atoi( argv[2] );

  //
  // Make sure that CUDA resources get cleaned up on exit.
  //
  atexit( cleanupCuda );

  //
  // Record the start time.
  //
  gettimeofday( &start, NULL );

  //
  // Allocate the array to be populated.
  //
  double *array = (double *) malloc( array_size * sizeof( double ) );

  //
  // Seed the random number generator and populate the array with its values.
  //
  srand( seed );
  for( i = 0; i < array_size; i++ )
    array[i] = ( (double) rand() / (double) RAND_MAX ) * MAXIMUM_VALUE;

  //
  // Setup output variables to hold min, max, mean, and standard deviation
  //
  // YOUR CALCULATIONS BELOW SHOULD POPULATE THESE WITH RESULTS
  //
  double min = DBL_MAX;
  double max = 0;
  double sum = 0;
  double mean = 0;
  double stddev = 0;

  unsigned int threadsPerBlock = 512;
  unsigned int blocksPerGrid = (65535 < (array_size + threadsPerBlock - 1) / threadsPerBlock)?65535:(array_size + threadsPerBlock - 1) / threadsPerBlock;

  double *dev_input_array, *dev_output_array, *outputArray;
  outputArray = (double*)malloc( blocksPerGrid * sizeof(double) );


  //
  // CALCULATE VALUES FOR MIN, MAX, MEAN, and STDDEV - BEGIN
  //
  //Allocate memory on GPU
  HANDLE_ERROR( cudaMalloc( (void**) &dev_input_array, array_size * sizeof( double ) ) );
  HANDLE_ERROR( cudaMalloc( (void**) &dev_output_array, blocksPerGrid * sizeof( double ) ) );
  //Copy data from host to GPU
  HANDLE_ERROR( cudaMemcpy( dev_input_array, array, array_size * sizeof(double), cudaMemcpyHostToDevice ) );
  //Execute kernel on GPU
  kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size);

  //Copy data from GPU to host
  HANDLE_ERROR( cudaMemcpy(outputArray, dev_output_array, blocksPerGrid * sizeof( double ), cudaMemcpyDeviceToHost));
  for(unsigned int j = 0; j < blocksPerGrid; j++){
    sum += outputArray[j];
  }

  //Deallocate memory on GPU
  cudaFree(dev_input_array);
  cudaFree(dev_output_array);

  //
  // CALCULATE VALUES FOR MIN, MAX, MEAN, and STDDEV - END
  //

  //
  // Record the end time.
  //
  gettimeofday( &end, NULL );

  //
  // Calculate the runtime.
  //
  runtime = ( ( end.tv_sec  - start.tv_sec ) * 1000.0 ) + ( ( end.tv_usec - start.tv_usec ) / 1000.0 );

  //
  // Output discoveries from the array.
  //
  printf( "Statistics for array ( %d, %d ):
", array_size, seed );
  printf( "    Minimum = %4.6f, Maximum = %4.6f
", min, max );
  printf( "    Mean = %4.6f, Standard Deviation = %4.6f
", mean, stddev );
  printf( "Processing Time: %4.4f milliseconds
", runtime );

  //
  // Free the allocated array.
  //
  free( array );
  free(outputArray);

  return 0;
}
答案

报告的错误发生在内核调用后的cudaMemcpy操作上:

HANDLE_ERROR( cudaMemcpy(outputArray, dev_output_array, blocksPerGrid * sizeof( double ), cudaMemcpyDeviceToHost));

(将来,如果您确定这是代码中的第167行,则对其他人有帮助)

CUDA内核错误报告是异步的,因此该错误实际上表明内核代码中存在非法内存访问。

如果你用cuda-memcheck运行你的代码,你会看到很多这样的迹象:

========= Invalid __shared__ write of size 8
=========     at 0x00000068 in /.../....cu:64:kernel_calculate_sum(double*, double*, unsigned int)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00000000 is out of bounds
...

所以你的共享内存访问有问题。使用动态分配的共享内存时,如下所示:

extern __shared__ double sdata[];

有必要将共享内存的大小作为内核参数传递。但你没有这样做:

kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size);
                                                     ^
                                          missing shared size parameter

所以你的共享内存访问失败了。对内核调用的简单修改:

kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock, threadsPerBlock*sizeof(double)>>>(dev_input_array, dev_output_array, array_size);

解决了这个错误。这是必要的,因为你的内核需要每个线程一个double数量,所以你必须保留你的threadblock的大小乘以double的大小。

以上是关于使用cuda c减少计算数组的总和的主要内容,如果未能解决你的问题,请参考以下文章

如何在 cuda 中获得并行数组的“总和”?

gcc-via-nvcc 是不是矢量化这些总和和最大减少?

使用Cuda平行降维(3D到2D,总和)

CUDA Reduction - 原子与单线程求和

如何使用CUDA并行化嵌套for循环以在2D数组上执行计算

c_cpp 该C程序使用指针计算数组元素的总和。程序使用指针遍历数组并将元素加起来