CUDA 实例练习
Posted Jason&Hymer
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CUDA 实例练习相关的知识,希望对你有一定的参考价值。
题目:多项式f(x)=x + x^2 + x^3 + ···+x^n求和。
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <math.h> #include <stdio.h> #include <gputimer.h> __global__ void polynomial_items(float * array, float x, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; //float sum = 0.0f; if (idx < n) { array[idx] = pow(x, idx + 1); } __syncthreads(); //for (int i = 0; i <= idx; i++){ //sum += array[idx]; //} } __global__ void shmem_reduce_kernel(float * d_out, const float * d_in) { extern __shared__ float sdata[]; int myId = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; // load shared mem from global mem sdata[tid] = d_in[myId]; __syncthreads(); // do reduction in shared mem for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads();// make sure all adds at one stage are done! } if (tid == 0)// only thread 0 writes result for this block back to global mem { d_out[blockIdx.x] = sdata[0]; } } void reduce(float * d_out, float * d_intermediate, float * d_in,int size) { // assumes that size is not greater than maxThreadsPerBlock^2 // and that size is a multiple of maxThreadsPerBlock const int maxThreadsPerBlock = 1024; int threads = maxThreadsPerBlock; int blocks = size / maxThreadsPerBlock + 1; shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >( d_intermediate, d_in); // now we‘re down to one block left, so reduce it threads = blocks;// launch one thread for each block in prev step blocks = 1; shmem_reduce_kernel << <blocks, threads, threads*sizeof(float) >> >( d_out, d_intermediate); } int main() { GpuTimer timer; int n; float x; float sum = 0.0f; printf("n is "); scanf("%d", &n); printf("x is "); scanf("%f", &x); float * h_in; h_in = (float *)malloc(sizeof(float)*n); float * d_array; const int array_bytes = n* sizeof(float); cudaMalloc((void **)&d_array, array_bytes); float *d_in, *d_intermediate, *d_out; cudaMalloc((void **)&d_in, array_bytes); cudaMalloc((void **)&d_intermediate, array_bytes); cudaMalloc((void **)&d_out, sizeof(float)); float h_out; timer.Start(); polynomial_items << <n / 1024 + 1, 1024 >> >(d_array, x, n); cudaMemcpy(h_in, d_array, array_bytes, cudaMemcpyDeviceToHost); cudaMemcpy(d_in, h_in, array_bytes, cudaMemcpyHostToDevice); reduce(d_out, d_intermediate, d_in, n); //for (int i = 0; i <n; i++){ //sum += h_in[i]; //} timer.Stop(); cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost); for (int i = 0; i < n; i++){ printf("%f ", h_in[i]); } printf("\nTime elapsed = %g ms\n", timer.Elapsed()); //printf("sum is %lf\n", sum); printf("h_out is %lf\n", h_out); cudaFree(d_array); cudaFree(d_in); cudaFree(d_intermediate); cudaFree(d_out); return 0; }
以上是关于CUDA 实例练习的主要内容,如果未能解决你的问题,请参考以下文章
RuntimeError: ‘lengths’ argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor(代码片段