Significance of the number of active blocks value returned by the "cudaOccupancyMaxActiveBlocksPerMultiprocessor" API?

Posted: 2021-12-26 17:57:57

[Question]

I am trying to understand the following two APIs provided by CUDA, with the goal of maximizing occupancy (and optimizing kernel execution time):

cudaOccupancyMaxActiveBlocksPerMultiprocessor
cudaOccupancyMaxPotentialBlockSize

My device:

======================DEVICE PROPERTIES==========
Number of Multiprocessors (SMs)      : 16
Number of Cores per SM               : 128
Total number of Cores                : 2048
Max. Threads per SM                  : 2048
Warp Size                            : 32
============================================================

I expected these two APIs to help me automatically find the best launch parameters for a kernel on any device. Using different input array sizes and block size values, I get the following results:

    Array size = 1000, Block size = 10 x 1 x 1

    Array size = 32, Block size = 16 x 1 x 1

    Array size = 32768, Block size = 1024 x 1 x 1

Question: What does the "number of active blocks (per SM)" value actually represent, and how is it computed?

My code:

Source.cpp

#include <iostream>
#include <stdio.h>
#include "header_kernel.cuh"

#define VALUE_TO_BE_ADDED 1

using namespace std;

int main()
{
    int USER_DEFINED_THREADS_PER_BLOCK = 1;
    int ARRAY_SIZE = 1;


    cout << "\n=========================USER INPUT===============================";
    cout << "\nEnter desired 1D ARRAY_SIZE\t\t\t: ";
    cin >> ARRAY_SIZE;

    std::cout << "Enter desired number of THREADS_PER_BLOCK\t: ";
    cin >> USER_DEFINED_THREADS_PER_BLOCK;

    cout << "====================================================================\n\n";

    //Declare pointers for input and output arrays
    int* in = (int*)calloc(ARRAY_SIZE, sizeof(int));
    int* out = (int*)calloc(ARRAY_SIZE, sizeof(int));

    //Just put some data in the array
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        in[i] = i + 1;
    }

    //Gather "User Cofig" Data
    int numActiveBlocksWithUserConfig = GetNumberOfActiveBlocks(USER_DEFINED_THREADS_PER_BLOCK);
    int GridSize_X = (ARRAY_SIZE + USER_DEFINED_THREADS_PER_BLOCK - 1) / USER_DEFINED_THREADS_PER_BLOCK;
    double userConfigOccupancy = ComputeOccupancy(numActiveBlocksWithUserConfig, USER_DEFINED_THREADS_PER_BLOCK);
    float userConfigKernelExecutionTime = ComputeKernelExecutionTime(in, out, VALUE_TO_BE_ADDED, ARRAY_SIZE, USER_DEFINED_THREADS_PER_BLOCK);

    //Gather "Best Possible Config" Data
    int bestBlockSize, bestGridSize;
    GetBestOccupancyParams(ARRAY_SIZE, bestBlockSize, bestGridSize);
    int numActiveBlocksWithBestConfig = GetNumberOfActiveBlocks(bestBlockSize);
    double bestConfigOccupancy = ComputeOccupancy(numActiveBlocksWithBestConfig, bestBlockSize);
    float bestKernelExecutionTime = ComputeKernelExecutionTime(in, out, VALUE_TO_BE_ADDED, ARRAY_SIZE, bestBlockSize);



    //Print - User Config Results
    cout << "\n======================USER==================================\n";
    cout << "Block Size              : " << USER_DEFINED_THREADS_PER_BLOCK << " x 1 x 1" << endl;
    cout << "Grid Size               : " << GridSize_X << " x 1 x 1" << endl;
    cout << "Active Blocks per SM    : " << numActiveBlocksWithUserConfig << endl;
    cout << "Occupancy               : " << userConfigOccupancy << " %" << endl;
    cout << "Kernel Execution Time   : " << userConfigKernelExecutionTime << endl;
    cout << "============================================================\n\n\n";


    //Print - Best Config Results
    cout << "\n======================BEST POSSIBLE (API Computed)==========\n";
    cout << "Block Size              : " << bestBlockSize << " x 1 x 1" << endl;
    cout << "Grid Size               : " << bestGridSize << " x 1 x 1" << endl;
    cout << "Active Blocks per SM    : " << numActiveBlocksWithBestConfig << endl;
    cout << "Occupancy               : " << bestConfigOccupancy << " %" << endl;
    cout << "Kernel Execution Time   : " << bestKernelExecutionTime << endl;
    cout << "============================================================\n\n\n";

    free(in);
    free(out);

    return 0;
}

header_kernel.cuh

#ifndef KERNELHEADER
#define KERNELHEADER

void GetBestOccupancyParams(const int ARRAY_SIZE, int& bestBlockSize, int& bestGridSize);
float ComputeKernelExecutionTime(int* in, int* out, const int VALUE_TO_BE_ADDED, const int ARRAY_SIZE, const int BLOCK_SIZE);
int GetNumberOfActiveBlocks(const int BLOCK_SIZE);
double ComputeOccupancy(const int numActiveBlocksPerSM, const int BLOCK_SIZE);
void printGPUProperties();

#endif // !KERNELHEADER

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>

__global__
void UserAdditionKernel(int* d_out, int* d_in, int ARRAY_SIZE, int VALUE_TO_BE_ADDED)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    //Guard against out-of-bounds access: the grid size is rounded up, so the
    //grid may contain more threads than there are array elements
    if (i < ARRAY_SIZE)
    {
        const int x = d_in[i];
        d_out[i] = x + VALUE_TO_BE_ADDED;
    }
}

void GetBestOccupancyParams(const int ARRAY_SIZE, int& bestBlockSize, int& bestGridSize)
{
    int blockSize;
    int minGridSize;
    int gridSize;
    size_t dynamicSMemUsage = 0;

    // Returns Grid and Block size that achieve maximum potential occupancy for a device function
    // "minGridSize" and "blockSize" is a grid/block pair that achieves the best potential occupancy (i.e. max. number of active warps with the smallest number of blocks).
    // Note: the last argument is "blockSizeLimit" - passing ARRAY_SIZE caps the suggested block size at the array size (0 means no limit).
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)UserAdditionKernel, dynamicSMemUsage, ARRAY_SIZE);
    bestBlockSize = blockSize;
    bestGridSize = minGridSize;

    std::cout << "=============SUGGESTED CONFIGURATION===========================\n";
    std::cout << "Suggested block size                     : " << blockSize << std::endl;
    std::cout << "Minimum grid size for maximum occupancy  : " << minGridSize << std::endl;
    std::cout << "=============================================================\n\n\n";


float ComputeKernelExecutionTime(int* in, int* out, const int VALUE_TO_BE_ADDED, const int ARRAY_SIZE, const int BLOCK_SIZE)
{
    //Declare pointers to the device arrays
    int* d_in = 0;
    int* d_out = 0;

    //Allocate device memory for input and output arrays
    cudaMalloc(&d_in, ARRAY_SIZE * sizeof(int));
    cudaMalloc(&d_out, ARRAY_SIZE * sizeof(int));

    //Copy INPUT ARRAY data from host to device
    cudaMemcpy(d_in, in, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);

    //-------------------------------------KERNEL configuration-------------------------------------//
    //BLOCK 
    int numThreads_X = BLOCK_SIZE;
    int numThreads_Y = 1;
    int numThreads_Z = 1;
    dim3 userBlockDimension(numThreads_X, numThreads_Y, numThreads_Z);

    //GRID
    // We will keep the grid only in the x-direction
    int numBlocks_X = (ARRAY_SIZE + userBlockDimension.x - 1) / userBlockDimension.x;
    dim3 userGridDimension(numBlocks_X, 1, 1);


    //Compute TIME
    cudaEvent_t startKernelTime, endKernelTime;
    cudaEventCreate(&startKernelTime);
    cudaEventCreate(&endKernelTime);


    //Execute kernel
    cudaEventRecord(startKernelTime);
    UserAdditionKernel<<<userGridDimension, userBlockDimension>>>(d_out, d_in, ARRAY_SIZE, VALUE_TO_BE_ADDED);
    //Record the end event right after the launch, so the device-to-host copy is not included in the measurement
    cudaEventRecord(endKernelTime);

    //Ensure time events have stopped
    cudaEventSynchronize(endKernelTime);

    //Copy OUTPUT ARRAY data from device to host
    cudaMemcpy(out, d_out, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);

    //Time taken
    float elapsedTimeInMs;

    cudaEventElapsedTime(&elapsedTimeInMs, startKernelTime, endKernelTime);

    //Release device memory and timing events
    cudaFree(d_in);
    cudaFree(d_out);
    cudaEventDestroy(startKernelTime);
    cudaEventDestroy(endKernelTime);

    return elapsedTimeInMs;
}


int GetNumberOfActiveBlocks(const int BLOCK_SIZE)
{
    int numActiveBlocksPerSM;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocksPerSM, UserAdditionKernel, BLOCK_SIZE, 0);

    return numActiveBlocksPerSM;
}


double ComputeOccupancy(const int numActiveBlocksPerSM, const int BLOCK_SIZE)
{
    int device;
    cudaDeviceProp prop;

    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);

    int activeWarps = numActiveBlocksPerSM * BLOCK_SIZE / prop.warpSize;
    int maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;

    double occupancy = (double)activeWarps / maxWarps;

    return occupancy * 100;
}
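
As a worked example of this formula on the device above (the active-block count here is illustrative, not taken from the original screenshots): with BLOCK_SIZE = 1024 and 2 active blocks per SM,

    activeWarps = 2 * 1024 / 32 = 64
    maxWarps    = 2048 / 32     = 64
    occupancy   = 64 / 64       = 100 %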

[Comments]

As a rule of thumb, please don't post screenshots of information that could be posted as text. Images cannot be searched; text can. Text can also be conveniently copied and pasted into an answer; again, images cannot.

@talonmies: Thanks for pointing that out. I put a screenshot of the output only because I needed to mark some parts of it in red. Also, in this case the output doesn't contain any searchable information, but making sure it can be copy-pasted into an answer does make sense, so I'll keep this in mind from now on (and edit my question) :)

[Answer 1]

See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#hardware-implementation, "4. Hardware Implementation":

...The threads of a thread block execute concurrently on one multiprocessor...

So when you specify the block size, the kernel, the amount of shared memory, and so on, CUDA computes the number of registers needed by all the threads of a block, along with the other resource requirements; and an SM only has so much of each. In addition, IIRC there is a hard limit on the number of warps that can be scheduled on a single SM, and probably some other limits as well. The important point is that if blocks are specified in such a way that the SM cannot find a warp ready to execute, the SM stalls; and the fewer warps it has in flight, the more often it stalls (probabilistically).
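
To make this concrete, here is a minimal sketch (an editorial addition, not part of the original answer) that queries the per-SM limits and the per-kernel resource usage that together bound the number of active blocks. It assumes it lives in the same translation unit as the question's UserAdditionKernel (with the bounds-checked four-parameter signature shown above); all calls are standard CUDA runtime API:

#include <stdio.h>
#include "cuda_runtime.h"

void PrintOccupancyLimits()
{
    int device;
    cudaDeviceProp prop;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);

    //Per-SM hardware limits that cap the number of resident ("active") blocks
    printf("Max threads per SM        : %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max blocks per SM         : %d\n", prop.maxBlocksPerMultiProcessor); //field available since CUDA 11
    printf("Registers per SM          : %d\n", prop.regsPerMultiprocessor);
    printf("Shared memory per SM      : %zu bytes\n", prop.sharedMemPerMultiprocessor);

    //Per-kernel resource usage that is weighed against those limits
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, UserAdditionKernel);
    printf("Registers per thread      : %d\n", attr.numRegs);
    printf("Static shared mem per blk : %zu bytes\n", attr.sharedSizeBytes);
}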

The smaller the block size (though I would guess no fewer than 128 threads), the more blocks a single SM can run, but IIRC there is another hard limit on the number of blocks per SM.
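
In effect, the value cudaOccupancyMaxActiveBlocksPerMultiprocessor returns is the largest block count allowed by all of these limits at once, i.e. the minimum over them. A rough sketch for the asker's device with a 1024-thread block (the register and shared-memory figures are hypothetical placeholders, not measured values):

    thread limit   : 2048 max threads per SM / 1024 threads per block = 2 blocks
    block limit    : architecture-dependent cap (e.g. 16 or 32 blocks per SM)
    register limit : regsPerMultiprocessor / (registers per thread * 1024)
    smem limit     : sharedMemPerMultiprocessor / shared memory used per block

    active blocks per SM = min(all of the above)

For a kernel as small as the one in the question (no shared memory, few registers), the thread limit is typically the binding one, which would give 2 active blocks per SM at a block size of 1024.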

Finally, yes, it is complicated, and I don't remember many of the details. It is in fact so complicated that over CUDA's lifetime NVidia has distributed a number of occupancy calculators in various forms, including an Excel spreadsheet.
