Computing the sum of an array with a reduction in CUDA C
The problem I am running into is that when I run my CUDA program, it always reports that an illegal memory access was encountered at line 167 of stats_gpu.cu.
The code is as follows:
#include <stdio.h>
#include <float.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
#include <cuda.h>
#define MAXIMUM_VALUE 1000000.0f
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
void HandleError( cudaError_t err, const char *file, int line ) {
//
// Handle and report on CUDA errors.
//
if ( err != cudaSuccess ) {
printf( "%s in %s at line %d
", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
void checkCUDAError( const char *msg, bool exitOnError ) {
//
// Check cuda error and print result if appropriate.
//
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err) );
if (exitOnError) {
exit(-1);
}
}
}
void cleanupCuda( void ) {
//
// Clean up CUDA resources.
//
//
// Explicitly cleans up all runtime-related resources associated with the
// calling host thread.
//
HANDLE_ERROR(
cudaThreadExit()
);
}
__device__ double device_pow( double x, double y ) {
//
// Calculate x^y on the GPU.
//
return pow( x, y );
}
//
// PLACE GPU KERNELS HERE - BEGIN
//
__global__ void kernel_calculate_sum(double *inputArray, double *outputArray, unsigned int n){
extern __shared__ double sdata[];
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int tid = threadIdx.x;
sdata[tid] = 0;
while(i < n){
sdata[tid] += inputArray[i];
i += blockDim.x * gridDim.x;
}
__syncthreads();
for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
if(tid < s){
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if(tid == 0){
outputArray[blockIdx.x] = sdata[0];
}
}
//
// PLACE GPU KERNELS HERE - END
//
int main( int argc, char* argv[] ) {
//
// Determine min, max, mean, mode and standard deviation of array
//
unsigned int array_size, seed, i;
struct timeval start, end;
float runtime;
if( argc < 3 ) {
printf( "Format: stats_gpu <size of array> <random seed>
" );
printf( "Arguments:
" );
printf( " size of array - This is the size of the array to be generated and processed
" );
printf( " random seed - This integer will be used to seed the random number
" );
printf( " generator that will generate the contents of the array
" );
printf( " to be processed
" );
exit( 1 );
}
//
// Get the size of the array to process.
//
array_size = atoi( argv[1] );
//
// Get the seed to be used
//
seed = atoi( argv[2] );
//
// Make sure that CUDA resources get cleaned up on exit.
//
atexit( cleanupCuda );
//
// Record the start time.
//
gettimeofday( &start, NULL );
//
// Allocate the array to be populated.
//
double *array = (double *) malloc( array_size * sizeof( double ) );
//
// Seed the random number generator and populate the array with its values.
//
srand( seed );
for( i = 0; i < array_size; i++ )
array[i] = ( (double) rand() / (double) RAND_MAX ) * MAXIMUM_VALUE;
//
// Setup output variables to hold min, max, mean, and standard deviation
//
// YOUR CALCULATIONS BELOW SHOULD POPULATE THESE WITH RESULTS
//
double min = DBL_MAX;
double max = 0;
double sum = 0;
double mean = 0;
double stddev = 0;
unsigned int threadsPerBlock = 512;
unsigned int blocksPerGrid = (65535 < (array_size + threadsPerBlock - 1) / threadsPerBlock)?65535:(array_size + threadsPerBlock - 1) / threadsPerBlock;
double *dev_input_array, *dev_output_array, *outputArray;
outputArray = (double*)malloc( blocksPerGrid * sizeof(double) );
//
// CALCULATE VALUES FOR MIN, MAX, MEAN, and STDDEV - BEGIN
//
//Allocate memory on GPU
HANDLE_ERROR( cudaMalloc( (void**) &dev_input_array, array_size * sizeof( double ) ) );
HANDLE_ERROR( cudaMalloc( (void**) &dev_output_array, blocksPerGrid * sizeof( double ) ) );
//Copy data from host to GPU
HANDLE_ERROR( cudaMemcpy( dev_input_array, array, array_size * sizeof(double), cudaMemcpyHostToDevice ) );
//Execute kernel on GPU
kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size);
//Copy data from GPU to host
HANDLE_ERROR( cudaMemcpy(outputArray, dev_output_array, blocksPerGrid * sizeof( double ), cudaMemcpyDeviceToHost));
for(unsigned int j = 0; j < blocksPerGrid; j++){
sum += outputArray[j];
}
//Deallocate memory on GPU
cudaFree(dev_input_array);
cudaFree(dev_output_array);
//
// CALCULATE VALUES FOR MIN, MAX, MEAN, and STDDEV - END
//
//
// Record the end time.
//
gettimeofday( &end, NULL );
//
// Calculate the runtime.
//
runtime = ( ( end.tv_sec - start.tv_sec ) * 1000.0 ) + ( ( end.tv_usec - start.tv_usec ) / 1000.0 );
//
// Output discoveries from the array.
//
printf( "Statistics for array ( %d, %d ):
", array_size, seed );
printf( " Minimum = %4.6f, Maximum = %4.6f
", min, max );
printf( " Mean = %4.6f, Standard Deviation = %4.6f
", mean, stddev );
printf( "Processing Time: %4.4f milliseconds
", runtime );
//
// Free the allocated array.
//
free( array );
free(outputArray);
return 0;
}
Answer
The reported error occurs on the cudaMemcpy operation after the kernel call:
HANDLE_ERROR( cudaMemcpy(outputArray, dev_output_array, blocksPerGrid * sizeof( double ), cudaMemcpyDeviceToHost));
(In the future, it is helpful to others if you identify which line of your code is line 167.)
CUDA kernel error reporting is asynchronous, so this error actually indicates that there is an illegal memory access in your kernel code.
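As an aside, here is a minimal sketch (my own illustration, not part of the original answer, reusing the post's HANDLE_ERROR macro) of how to surface such kernel errors synchronously at the launch site instead of at a later API call:

kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size); // launch as posted
HANDLE_ERROR( cudaGetLastError() );      // reports launch-configuration errors immediately
HANDLE_ERROR( cudaDeviceSynchronize() ); // waits for the kernel and reports errors raised while it ran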
If you run your code with cuda-memcheck, you will see many indications like this:
========= Invalid __shared__ write of size 8
========= at 0x00000068 in /.../....cu:64:kernel_calculate_sum(double*, double*, unsigned int)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x00000000 is out of bounds
...
So there is something wrong with your shared memory accesses. When using dynamically allocated shared memory, as here:
extern __shared__ double sdata[];
it is necessary to pass the size of the shared memory allocation as a kernel launch parameter. But you are not doing that:
kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock>>>(dev_input_array, dev_output_array, array_size);
                                                     ^
                                                     missing shared size parameter
So your shared memory accesses fail. A simple modification to your kernel call:
kernel_calculate_sum<<<blocksPerGrid, threadsPerBlock, threadsPerBlock*sizeof(double)>>>(dev_input_array, dev_output_array, array_size);
fixes the error. This is necessary because your kernel expects one double quantity per thread, so you must reserve the size of your threadblock multiplied by the size of double.
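As a further aside, here is a sketch of an alternative (the kernel name kernel_calculate_sum_static is hypothetical): if the block size is fixed at compile time, statically sized shared memory is allocated by the compiler and no third launch parameter is needed at all.

#define THREADS_PER_BLOCK 512

__global__ void kernel_calculate_sum_static(double *inputArray, double *outputArray, unsigned int n){
    // Statically sized shared memory: the size is known at compile time,
    // so the <<<blocks, threads>>> launch needs no dynamic shared memory argument.
    __shared__ double sdata[THREADS_PER_BLOCK];
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int tid = threadIdx.x;
    // Grid-stride accumulation into each thread's shared-memory slot.
    sdata[tid] = 0;
    while(i < n){
        sdata[tid] += inputArray[i];
        i += blockDim.x * gridDim.x;
    }
    __syncthreads();
    // Tree reduction within the block (assumes a power-of-two block size).
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        if(tid < s){
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if(tid == 0){
        outputArray[blockIdx.x] = sdata[0];
    }
}

The launch would then simply be kernel_calculate_sum_static<<<blocksPerGrid, THREADS_PER_BLOCK>>>(dev_input_array, dev_output_array, array_size); at the cost of fixing the block size at compile time.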