Not able to get the cublasSgelsbatched function to work

Posted: 2021-07-25

I am currently trying to get the cublasSgelsbatched (https://docs.nvidia.com/cuda/cublas/index.html) version to work. I started by making a small test case to see which parameters are needed exactly and how they have to be inputted. However, after much trial and error I still can't get it to work: I get a status return of 13, which corresponds to CUBLAS_STATUS_EXECUTION_FAILED, a very vague error. I also tried some other cuBLAS test cases and those seem to work fine. I also tested the input matrix in MATLAB and it does have an LS solution.
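For reference, a raw status code can be mapped back to its enum name with a small helper like the following sketch (newer CUDA toolkits also provide cublasGetStatusString() for this); the helper itself is illustrative and not part of the test case:

#include <cublas_v2.h>

// Illustrative helper: map a cublasStatus_t to its enum name, so a bare
// numeric code such as 13 is immediately readable.
const char* cublasStatusName(cublasStatus_t s)
{
    switch (s) {
    case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; // status 13
    case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
    default:                             return "unknown cublasStatus_t";
    }
}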

#include "stdafx.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"

#include <algorithm>
#include <cmath>
#include <Windows.h>

int main()
{

    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;

    // create handle
    stat = cublasCreate(&m_cuBLAS);

    //params
    const int C = 3; 
    const int M = 2;
    long lda = C;
    long ldb = M;

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;

    //allocate mem
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C *  sizeof(float));
    Xmat = (float*) malloc(M *  sizeof(float));

    srand(100);

    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }

    for (int i = 0; i < C; i++) {
        Ymat[i] =  rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }

    //allocate mem
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));

    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);

    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };

    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();
    

    //run cublas
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
        CUBLAS_OP_N,
        C,
        M,
        1,
        &gAmat,
        lda, //or 1
        &gYmat,
        lda,
        &info,
        NULL,
        1);
    
    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;

    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);

    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;

    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);

    cudaFree(gAmat);
    cudaFree(gYmat);

    //destroy handle
    cublasDestroy(m_cuBLAS);

    return 0;
}

I am on Windows 10, running in MVS, using CUDA 9.0.

Much thanks in advance for your help.

Comments:

docs.nvidia.com/cuda/cublas/… -- "Aarray is an array of pointers to matrices stored in column-major format". I don't see any array of pointers to matrices in your code, do you?

Answer 1:

As indicated in the comments, you are not creating a proper array of pointers on the device. The batched function expects, for the data parameters, an array of pointers that lives in device memory, for example:

Aarray: device input/output array of pointers to arrays, with each array of dim. m x n with lda >= max(1,m). Matrices Aarray[i] should not overlap; otherwise, undefined behavior is expected.

Passing for example &gAmat seemingly satisfies the type requirement, but that pointer does not point to device memory.

The following modification of your code, focused on proper handling of gAmat and gYmat, seems to run without error for me:

$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <algorithm>
#include <cmath>

int main()
{

    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;

    // create handle
    stat = cublasCreate(&m_cuBLAS);

    //params
    const int C = 3;
    const int M = 2;
    long lda = C;
    long ldb = M;

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;

    //allocate mem
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C *  sizeof(float));
    Xmat = (float*) malloc(M *  sizeof(float));

    srand(100);

    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }

    for (int i = 0; i < C; i++) {
        Ymat[i] =  rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }

    //allocate mem
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));

    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };

    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();


    //run cublas
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
        CUBLAS_OP_N,
        C,
        M,
        1,
        ggAmat,
        lda, //or 1
        ggYmat,
        lda,
        &info,
        NULL,
        1);

    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;

    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);

    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;

    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);

    cudaFree(gAmat);
    cudaFree(gYmat);

    //destroy handle
    cublasDestroy(m_cuBLAS);

    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used

t130.cu(24): warning: variable "ldb" was declared but never referenced

$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$

Your code only shows a single array. If you had a batch of arrays, you would pass an actual array of device-allocated pointers for each of A and Y, as shown in the sketch below.
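As a hedged sketch of what that would look like for a hypothetical batch of 4 systems (the names batch, h_Aptrs, d_Aptrs and so on are illustrative, not from the original code):

// Illustrative sketch: building device pointer arrays for a batch of
// `batch` systems, each with its own A (C x M) and Y (C x 1) buffer.
const int batch = 4;
float *h_Aptrs[batch], *h_Yptrs[batch];
for (int b = 0; b < batch; b++) {
    cudaMalloc(&h_Aptrs[b], M * C * sizeof(float)); // per-system A storage
    cudaMalloc(&h_Yptrs[b], C * sizeof(float));     // per-system Y storage
}
float **d_Aptrs, **d_Yptrs;
cudaMalloc(&d_Aptrs, batch * sizeof(float*));
cudaMalloc(&d_Yptrs, batch * sizeof(float*));
cudaMemcpy(d_Aptrs, h_Aptrs, batch * sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(d_Yptrs, h_Yptrs, batch * sizeof(float*), cudaMemcpyHostToDevice);
// d_Aptrs / d_Yptrs are then what cublasSgelsBatched expects for its
// Aarray / Carray parameters, with batchSize = batch.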

Based on the comments below, here is a version of the code using non-random input:

$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <algorithm>
#include <cmath>

int main()
{

    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t status;
    // create handle
    status = cublasCreate(&m_cuBLAS);
    std::cout << "status = " << status << std::endl;

    //params
    const int C = 3;
    const int M = 2;
    long lda = C;

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;

    //allocate mem
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C *  sizeof(float));
    Xmat = (float*) malloc(M *  sizeof(float));

    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }

    for (int i = 0; i < C; i++) {
        Ymat[i] =  rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;
    //allocate mem
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));

    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
    float **ggAmat, **ggYmat;
    cudaMalloc(&ggAmat, sizeof(float*));
    cudaMalloc(&ggYmat, sizeof(float*));
    cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };

    //Synchronize (not necessary I think, but just to test)
    cudaDeviceSynchronize();


    //run cublas
    status = cublasSgelsBatched(m_cuBLAS,
        CUBLAS_OP_N,
        C,
        M,
        1,
        ggAmat,
        lda, //or 1
        ggYmat,
        lda,
        &info,
        NULL,
        1);

    //Output info
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;

    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);

    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;

    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);

    cudaFree(gAmat);
    cudaFree(gYmat);

    //destroy handle
    cublasDestroy(m_cuBLAS);

    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$

Comments:

Thanks for the response, it runs now, much better than before. However, when I run the same code as you I get a status 13 response, and I also get the wrong answer. MATLAB tells me the outcome should be [-6.5, 9.7], with A = [6, 7, 6, 5, 5, 5] and Y = [9, 3, 10]. So do you think it is something with the installation then?

When I run with your suggested A and Y values, the first 2 output values I get are indeed -6.5 and 9.7, and the reported status is zero. So if you are getting errors, then yes, I think something may be wrong with your setup. When you have a new CUDA install, it is usually good practice to validate operation with one or more of the sample codes. I have updated the answer with this test using non-random values.
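As an aside, one way to narrow down a setup problem like this is to check the return value of every CUDA runtime call rather than ignoring it. A minimal sketch of a standard error-check macro (illustrative, not part of the posted code):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative helper: aborts with a readable message, file, and line
// if any CUDA runtime call fails.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage example: CUDA_CHECK(cudaMalloc(&gAmat, M * C * sizeof(float)));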
