CUDA[4] sample program: matrix-vector multiplication
Posted by Pentium.Labs
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CUDA[4] sample program: matrix-vector multiplication相关的知识,希望对你有一定的参考价值。
Use Compressed Sparse Row Format (CSR) to represent matrix
1 #include "cuda_runtime.h" 2 #include "device_launch_parameters.h" 3 #include "gputimer.h" 4 #include<stdio.h> 5 #include<stdlib.h> 6 #include<string.h> 7 #define WARP_SIZE 32 8 9 __global__ void 10 spmv_csr_vector_kernel ( const int num_rows , 11 const int * ptr , 12 const int * indices , 13 const double * data , 14 const double * x , 15 double * y) 16 { 17 __shared__ double vals [WARP_SIZE]; 18 int thread_id = blockDim.x * blockIdx.x + threadIdx.x ; // global thread index 19 int warp_id = thread_id / WARP_SIZE; // global warp index 20 int lane = thread_id & (WARP_SIZE - 1); // thread index within the warp 21 // one warp per row 22 int row = warp_id ; 23 if ( row < num_rows ) 24 { 25 int row_start = ptr [ row ]; 26 int row_end = ptr [ row +1]; 27 // compute running sum per thread 28 vals [ threadIdx.x ] = 0; 29 for ( int jj = row_start + lane ; jj < row_end ; jj += WARP_SIZE) 30 vals [ threadIdx.x ] += data [ jj ] * x [ indices [ jj ]]; 31 // parallel reduction in shared memory 32 if ( lane < 16) vals [ threadIdx.x ] += vals [ threadIdx.x + 16]; 33 if ( lane < 8) vals [ threadIdx.x ] += vals [ threadIdx.x + 8]; 34 if ( lane < 4) vals [ threadIdx.x ] += vals [ threadIdx.x + 4]; 35 if ( lane < 2) vals [ threadIdx.x ] += vals [ threadIdx.x + 2]; 36 if ( lane < 1) vals [ threadIdx.x ] += vals [ threadIdx.x + 1]; 37 // first thread writes the result 38 if ( lane == 0) 39 y[ row ] += vals [ threadIdx.x ]; 40 } 41 } 42 43 __global__ void 44 spmv_csr_scalar_kernel ( const int num_rows , 45 const int * ptr , 46 const int * indices , 47 const double * data , 48 const double * x , 49 double * y) 50 { 51 int row = blockDim.x * blockIdx.x + threadIdx.x ; 52 if( row < num_rows ) 53 { 54 double dot = 0; 55 int row_start = ptr [ row ]; 56 int row_end = ptr [ row +1]; 57 for (int jj = row_start ; jj < row_end ; jj ++) 58 dot += data [ jj ] * x[ indices [ jj ]]; 59 y[ row ] += dot ; 60 } 61 } 62 63 int main(int argc,char **argv) 64 { 65 double h_data[32]={1,4,2,3,5,7,8,9,6}; 66 
int h_col[32]={0,1,1,2,0,3,4,2,4}; 67 int h_ptr[32]={0,2,4,7,9}; 68 double h_x[32]={1,2,3,4,5}; 69 double h_y[32]={0,0,0,0}; 70 int num_rows=4; 71 72 double *d_data; 73 int *d_col; 74 int *d_ptr; 75 double *d_x; 76 double *d_y; 77 78 cudaMalloc((void**) &d_data,sizeof(double)*32); 79 cudaMalloc((void**) &d_col,sizeof(int)*32); 80 cudaMalloc((void**) &d_ptr,sizeof(int)*32); 81 cudaMalloc((void**) &d_x,sizeof(double)*32); 82 cudaMalloc((void**) &d_y,sizeof(double)*32); 83 cudaMemcpy((void*)d_data, (void*)h_data, sizeof(double)*32, cudaMemcpyHostToDevice); 84 cudaMemcpy((void*)d_col, (void*)h_col, sizeof(int)*32, cudaMemcpyHostToDevice); 85 cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(int)*32, cudaMemcpyHostToDevice); 86 cudaMemcpy((void*)d_x, (void*)h_x, sizeof(double)*32, cudaMemcpyHostToDevice); 87 cudaMemcpy((void*)d_y, (void*)h_y, sizeof(double)*32, cudaMemcpyHostToDevice); 88 89 GpuTimer timer; 90 timer.Start(); 91 spmv_csr_vector_kernel<<<num_rows,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y); 92 //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y); 93 timer.Stop(); 94 printf("Duration: %g ms\\n",timer.Elapsed()); 95 96 cudaMemcpy((void*)h_y, (void*)d_y, sizeof(double)*32, cudaMemcpyDeviceToHost); 97 98 for(int i=0;i<num_rows;i++) 99 printf("%.5f ",h_y[i]); 100 printf("\\n"); 101 102 return 0; 103 }
ref:
http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf
ch4.3
以上是关于CUDA[4] sample program: matrix-vector multiplication的主要内容,如果未能解决你的问题,请参考以下文章
Professional CUDA C Programming的代码实例1.1
CUDA C Programming Guide 在线教程学习笔记 Part 3
CUDA教程How CUDA Programming Works - 知识点目录
professional cuda c programming--CUDA库简单介绍
CUDA Intro to Parallel Programming笔记--Lesson 1 The GPU Programming Model