CUDA -- 规约求矩阵的行和

Posted chen9510

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CUDA -- 规约求矩阵的行和相关的知识,希望对你有一定的参考价值。

 

  求矩阵每行的和?

  可以为矩阵的每一行分配一个独立的线程块,这样行与行之间就形成了粗粒度的并行。对于每一行,在其对应的线程块中分配 n 个线程(n 等于行宽,本例中 n = 8),每个线程先从显存(global memory)中读取一个元素到共享存储器(shared memory)中,然后在共享存储器上使用归约(reduction)算法求出该行的和。

 

代码如下:

#include "cuda_runtime.h" //CUDA运行时API
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>

cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d);

// Computes the sum of each matrix row with a shared-memory tree reduction.
//
// Launch layout: one block per row (blockIdx.x selects the row) and one
// thread per column (threadIdx.x selects the column), i.e.
//     addKernel<<<rows, cols>>>(mat, ans, pitch);
//
// Parameters:
//   mat   - device pointer to the first row of the matrix
//   ans   - device array with one int per block; ans[blockIdx.x] receives the row sum
//   pitch - distance in BYTES between the starts of consecutive rows of mat
//
// Preconditions: blockDim.x <= 1024 (the size of the static shared buffer) and
// every row holds at least blockDim.x ints. blockDim.x need not be a power of two.
__global__ void addKernel(int *mat, int *ans, size_t pitch)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;

    // One slot per thread; 1024 is the largest block size the buffer is sized for.
    __shared__ int data[1024];

    // Rows are pitch bytes apart, so the row offset is applied on a byte pointer.
    const int *row = (const int *)((const char *)mat + bid * pitch);
    data[tid] = row[tid];
    __syncthreads();

    // Each step folds the upper part of the still-active range onto the lower
    // part. Rounding the half up keeps the middle element of an odd-sized range
    // in place, so widths that are not powers of two are summed correctly.
    // Writes go to indices < half and reads come from indices >= half, so a
    // step never reads a slot that another thread is writing.
    for (int active = blockDim.x; active > 1; )
    {
        const int half = (active + 1) / 2;
        if (tid + half < active)
            data[tid] = data[tid] + data[tid + half];
        // The barrier sits outside the divergent 'if' so every thread of the
        // block reaches it, and inside the loop so each step sees the previous one.
        __syncthreads();
        active = half;
    }

    if (tid == 0)
        ans[bid] = data[0];
}


// Entry point: builds a fixed 4x8 integer matrix, asks addWithCuda to compute
// the sum of every row on the GPU, and prints the four sums separated by spaces.
// For the matrix below the row sums are 21 55 89 123.
// Returns 0 on success and 1 if any CUDA step reports an error.
int main()
{
    const int row = 4;
    const int col = 8;
    // d.x carries the row width (columns), d.y the number of rows.
    dim3 d(col, row);
    int mat[row][col] = {
        { 1,  2,  3,  4,  5,  1,  2,  3},
        { 6,  7,  8,  9, 10,  4,  5,  6},
        {11, 12, 13, 14, 15,  7,  8,  9},
        {16, 17, 18, 19, 20, 10, 11, 12}
    };
    // Zero-initialized so no indeterminate values are ever copied or printed.
    int ans[row] = {0};

    // Sum the rows in parallel.
    cudaError_t cudaStatus = addWithCuda(mat, ans, d);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    // cudaDeviceReset (the replacement for the deprecated cudaThreadExit) must
    // be called before exiting in order for profiling and tracing tools such as
    // Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // d.y is unsigned, so the index is unsigned as well.
    for (unsigned int i = 0; i < d.y; i++)
    {
        std::cout << ans[i] << " ";
    }
    return 0;
}



// Key function to understand: the host-side driver for the row-sum kernel.
//
// Copies the d.y x d.x top-left part of the host matrix into pitched device
// memory, launches addKernel with d.y blocks of d.x threads, times the launch
// with CUDA events, and copies the d.y results back into ans.
//
// Parameters:
//   mat - host matrix with rows of 8 ints (at most 4 rows are read)
//   ans - host output array with room for at least d.y ints
//   d   - d.x = number of columns to sum (1..8), d.y = number of rows (1..4)
//
// Returns cudaSuccess on success, cudaErrorInvalidValue for null pointers or
// dimensions outside the 4x8 host matrix, otherwise the error code of the
// first failing CUDA call. All device resources created here are released on
// every path that reaches the Error label.
cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d)
{
    int *dev_mat = 0;        // device copy of the matrix (pitched allocation)
    int *dev_ans = 0;        // device buffer holding one sum per row
    // Must be a real size_t: cudaMallocPitch stores sizeof(size_t) bytes through
    // this pointer, which overruns an int on 64-bit builds.
    size_t pitch = 0;
    cudaEvent_t start = 0;   // timing events; stay 0 until successfully created
    cudaEvent_t stop = 0;
    float tm = 0.0f;         // elapsed kernel time in milliseconds
    // Bytes actually copied per row, and the true stride of a host row.
    // mat decays to int (*)[8], so sizeof(mat[0]) is the size of one 8-int row.
    const size_t rowBytes = d.x * sizeof(int);
    const size_t hostPitch = sizeof(mat[0]);
    cudaError_t cudaStatus;  // status of the most recent CUDA call

    // The host matrix is fixed at 4x8; anything larger would read past its end,
    // and a zero dimension is not a valid launch configuration.
    if (mat == 0 || ans == 0 || d.x == 0 || d.x > 8 || d.y == 0 || d.y > 4)
    {
        fprintf(stderr, "addWithCuda: invalid arguments!\n");
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate device memory.
    cudaStatus = cudaMallocPitch((void**)&dev_mat, &pitch, rowBytes, d.y);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ans, d.y * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy the matrix to the GPU. The source pitch is the real host row stride,
    // so the copy is also correct when fewer than 8 columns are requested.
    cudaStatus = cudaMemcpy2D(dev_mat, pitch, mat, hostPitch, rowBytes, d.y, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy for dev_mat failed!\n");
        goto Error;
    }
    // dev_ans is output-only, so the host contents of ans are not uploaded.

    // Create the timing events.
    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventCreate failed!\n");
        goto Error;
    }

    // Run the kernel: one block per row, one thread per column.
    cudaEventRecord(start, 0);
    addKernel<<<d.y, d.x>>>(dev_mat, dev_ans, pitch);
    cudaEventRecord(stop, 0);

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize (the replacement for the deprecated
    // cudaThreadSynchronize) waits for the kernel to finish and returns any
    // error encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", (int)cudaStatus);
        goto Error;
    }

    // Both events have completed once the device is idle.
    if (cudaEventElapsedTime(&tm, start, stop) == cudaSuccess)
        printf("GPU Elapsed time:%.6f ms.\n", tm);

    // Copy the row sums from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(ans, dev_ans, d.y * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Release everything that was created; this path is shared by success and failure.
    if (start)
        cudaEventDestroy(start);
    if (stop)
        cudaEventDestroy(stop);
    cudaFree(dev_mat);
    cudaFree(dev_ans);
    return cudaStatus;
}

 

以上是关于CUDA -- 规约求矩阵的行和的主要内容,如果未能解决你的问题,请参考以下文章

matlab怎样通过for循环语句找出两矩阵相同的行和列?

在MATLAB中,如何找出矩阵的非零元素。并且输出它所在的行和列。只要非零就为真。输出1。。

matlab 求出一维矩阵中最小值,且求出该最小值在矩阵中的位置,求各位帮帮忙

确定是不是可以通过翻转 1 矩阵的行和列来达到给定二进制矩阵的算法

急求!VB编程问题:有一个n×m的矩阵,编写程序,找出其中最大的元素所在的行和列,并输出其值及行号和列

如何并行化将矩阵的行随机复制到内存中的另一个矩阵的过程? [复制]