Significance of the number of active blocks value returned by the "cudaOccupancyMaxActiveBlocksPerMultiprocessor" API?
Posted: 2021-12-26 17:57:57

Question: I am trying to understand the following two APIs that CUDA provides for maximizing occupancy (and optimizing kernel execution time):
cudaOccupancyMaxActiveBlocksPerMultiprocessor
cudaOccupancyMaxPotentialBlockSize

My device:
======================DEVICE PROPERTIES==========
Number of Multiprocessors (SMs) : 16
Number of Cores per SM : 128
Total number of Cores : 2048
Max. Threads per SM : 2048
Warp Size : 32
============================================================
I was hoping these two APIs would help me automatically work out the best launch parameters for a kernel on any device. Using different input array sizes and block-size values, I get the following results:
Array size = 1000, block size = 10 x 1 x 1
Array size = 32, block size = 16 x 1 x 1
Array size = 32768, block size = 1024 x 1 x 1
Question: I want to understand what the "number of active blocks (per SM)" actually represents (and how it is computed).
My code:
Source.cpp
#include <iostream>
#include <stdio.h>
#include "header_kernel.cuh"
#define VALUE_TO_BE_ADDED 1
using namespace std;
int main()
{
int USER_DEFINED_THREADS_PER_BLOCK = 1;
int ARRAY_SIZE = 1;
cout << "\n=========================USER INPUT===============================";
cout << "\nEnter desired 1D ARRAY_SIZE\t\t\t: ";
cin >> ARRAY_SIZE;
std::cout << "Enter desired number of THREADS_PER_BLOCK\t: ";
cin >> USER_DEFINED_THREADS_PER_BLOCK;
cout << "====================================================================\n\n";
//Declare pointers for input and output arrays
int* in = (int*)calloc(ARRAY_SIZE, sizeof(int));
int* out = (int*)calloc(ARRAY_SIZE, sizeof(int));
//Just put some data in the array
for (int i = 0; i < ARRAY_SIZE; i++)
in[i] = i + 1;
//Gather "User Cofig" Data
int numActiveBlocksWithUserConfig = GetNumberOfActiveBlocks(USER_DEFINED_THREADS_PER_BLOCK);
int GridSize_X = (ARRAY_SIZE + USER_DEFINED_THREADS_PER_BLOCK - 1) / USER_DEFINED_THREADS_PER_BLOCK;
double userConfigOccupancy = ComputeOccupancy(numActiveBlocksWithUserConfig, USER_DEFINED_THREADS_PER_BLOCK);
float userConfigKernelExecutionTime = ComputeKernelExecutionTime(in, out, VALUE_TO_BE_ADDED, ARRAY_SIZE, USER_DEFINED_THREADS_PER_BLOCK);
//Gather "Best Possible Config" Data
int bestBlockSize, bestGridSize;
GetBestOccupancyParams(ARRAY_SIZE, bestBlockSize, bestGridSize);
int numActiveBlocksWithBestConfig = GetNumberOfActiveBlocks(bestBlockSize);
double bestConfigOccupancy = ComputeOccupancy(numActiveBlocksWithBestConfig, bestBlockSize);
float bestKernelExecutionTime = ComputeKernelExecutionTime(in, out, VALUE_TO_BE_ADDED, ARRAY_SIZE, bestBlockSize);
//Print - User Config Results
cout << "\n======================USER==================================\n";
cout << "Block Size : " << USER_DEFINED_THREADS_PER_BLOCK << " x 1 x 1" << endl;
cout << "Grid Size : " << GridSize_X << " x 1 x 1" << endl;
cout << "Active Blocks per SM : " << numActiveBlocksWithUserConfig << endl;
cout << "Occupancy : " << userConfigOccupancy << " %" << endl;
cout << "Kernel Execution Time : " << userConfigKernelExecutionTime << endl;
cout << "============================================================\n\n\n";
//Print - Best Config Results
cout << "\n======================BEST POSSIBLE (API Computed)==========\n";
cout << "Block Size : " << bestBlockSize << " x 1 x 1" << endl;
cout << "Grid Size : " << bestGridSize << " x 1 x 1" << endl;
cout << "Active Blocks per SM : " << numActiveBlocksWithBestConfig << endl;
cout << "Occupancy : " << bestConfigOccupancy << " %" << endl;
cout << "Kernel Execution Time : " << bestKernelExecutionTime << endl;
cout << "============================================================\n\n\n";
free(in);
free(out);
return 0;
}
header_kernel.cuh
#ifndef KERNELHEADER
#define KERNELHEADER
void GetBestOccupancyParams(const int ARRAY_SIZE, int& bestBlockSize, int& bestGridSize);
float ComputeKernelExecutionTime(int* in, int* out, const int VALUE_TO_BE_ADDED, const int ARRAY_SIZE, const int BLOCK_SIZE);
int GetNumberOfActiveBlocks(const int BLOCK_SIZE);
double ComputeOccupancy(const int numActiveBlocksPerSM, const int BLOCK_SIZE);
void printGPUProperties();
#endif // !KERNELHEADER
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
__global__
void UserAdditionKernel(int* d_out, int* d_in, int VALUE_TO_BE_ADDED, int ARRAY_SIZE)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x;
//Guard: the rounded-up grid may launch more threads than there are array elements
if (i < ARRAY_SIZE)
d_out[i] = d_in[i] + VALUE_TO_BE_ADDED;
}
void GetBestOccupancyParams(const int ARRAY_SIZE, int& bestBlockSize, int& bestGridSize)
{
int blockSize;
int minGridSize;
int gridSize;
size_t dynamicSMemUsage = 0;
// Returns Grid and Block size that achieve maximum potential occupancy for a device function
// "minGridSize" and "blockSize" is a grid/block pair that achieves the best potential occupancy (i.e. max. number of active warps with the smallest number of blocks).
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)UserAdditionKernel, dynamicSMemUsage, ARRAY_SIZE);
bestBlockSize = blockSize;
bestGridSize = minGridSize;
std::cout << "=============SUGGESTED CONFIGURATION===========================\n";
std::cout << "Suggested block size : " << blockSize << std::endl;
std::cout << "Minimum grid size for maximum occupancy : " << minGridSize << std::endl;
std::cout << "=============================================================\n\n\n";
float ComputeKernelExecutionTime(int* in, int* out, const int VALUE_TO_BE_ADDED, const int ARRAY_SIZE, const int BLOCK_SIZE)
{
//Declare pointers to the device arrays
int* d_in = 0;
int* d_out = 0;
//Allocate device memory for input and output arrays
cudaMalloc(&d_in, ARRAY_SIZE * sizeof(int));
cudaMalloc(&d_out, ARRAY_SIZE * sizeof(int));
//Copy INPUT ARRAY data from host to device
cudaMemcpy(d_in, in, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
//-------------------------------------KERNEL configuration-------------------------------------//
//BLOCK
int numThreads_X = BLOCK_SIZE;
int numThreads_Y = 1;
int numThreads_Z = 1;
dim3 userBlockDimension(numThreads_X, numThreads_Y, numThreads_Z);
//GRID
// We will keep the grid only in the x-direction
int numBlocks_X = (ARRAY_SIZE + userBlockDimension.x - 1) / userBlockDimension.x;
dim3 userGridDimension(numBlocks_X, 1, 1);
//Compute TIME
cudaEvent_t startKernelTime, endKernelTime;
cudaEventCreate(&startKernelTime);
cudaEventCreate(&endKernelTime);
//Execute kernel and time it (record the stop event immediately after the launch so
//the measurement covers only the kernel, not the device-to-host copy)
cudaEventRecord(startKernelTime);
UserAdditionKernel <<<userGridDimension, userBlockDimension>>> (d_out, d_in, VALUE_TO_BE_ADDED, ARRAY_SIZE);
cudaEventRecord(endKernelTime);
//Ensure the stop event has completed before reading the elapsed time
cudaEventSynchronize(endKernelTime);
//Time taken
float elapsedTimeInMs;
cudaEventElapsedTime(&elapsedTimeInMs, startKernelTime, endKernelTime);
//Copy the result back and release device resources
cudaMemcpy(out, d_out, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventDestroy(startKernelTime);
cudaEventDestroy(endKernelTime);
cudaFree(d_in);
cudaFree(d_out);
return elapsedTimeInMs;
}
int GetNumberOfActiveBlocks(const int BLOCK_SIZE)
{
int numActiveBlocksPerSM;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numActiveBlocksPerSM, UserAdditionKernel, BLOCK_SIZE, 0);
return numActiveBlocksPerSM;
}
double ComputeOccupancy(const int numActiveBlocksPerSM, const int BLOCK_SIZE)
{
int device;
cudaDeviceProp prop;
cudaGetDevice(&device);
cudaGetDeviceProperties(&prop, device);
int activeWarps = numActiveBlocksPerSM * BLOCK_SIZE / prop.warpSize;
int maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
double occupancy = (double)activeWarps / maxWarps;
return occupancy * 100;
}
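To make the formula concrete (a hypothetical reading, since the actual active-block counts were only visible in my screenshots): on the device above, with warp size 32 and a maximum of 2048 threads per SM, if the API reported 2 active blocks for a 1024-thread block, then activeWarps = 2 x 1024 / 32 = 64 and maxWarps = 2048 / 32 = 64, i.e. 100 % theoretical occupancy.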
Comments:

As a rule of thumb, please don't post screenshots when the information can be posted as text. Images cannot be searched; text can. Text can also conveniently be copied and pasted into an answer; again, images cannot. - @talonmies: Thanks for pointing that out. I posted screenshots of the output only because I needed to mark some parts of it in red. Also, in this case the output doesn't contain any searchable information, but it's true that it could be copy-pasted into an answer, so I'll keep that in mind in the future (and edit my question) :)

Answer 1:

See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#hardware-implementation, section "4. Hardware Implementation":
...The threads of a thread block execute concurrently on one multiprocessor...
So, when you specify the block size, the kernel, the amount of shared memory, and so on, CUDA computes the number of registers required by all the threads of the block, along with the other resource requirements; and an SM only has so much of each. In addition, IIRC, there is a hard limit on the number of warps that can be scheduled on a single SM, and probably some other limits as well. Importantly, if blocks are specified in such a way that the SM cannot find an eligible warp to execute, the SM stalls; and the fewer resident warps there are, the more often it stalls (probabilistically).
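A minimal sketch (my own illustration, assuming the question's UserAdditionKernel, with its bounds-check parameter, is in scope in kernel.cu) showing how to inspect the per-kernel resource requirements CUDA derives at compile time:

void PrintKernelResourceUsage()
{
//cudaFuncGetAttributes reports the compile-time resource needs of a __global__ function
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, UserAdditionKernel);
printf("Registers per thread  : %d\n", attr.numRegs);
printf("Static shared memory  : %zu bytes\n", attr.sharedSizeBytes);
printf("Max threads per block : %d\n", attr.maxThreadsPerBlock);
}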
The smaller the block size (but, I'd guess, no fewer than 128 threads), the more blocks a single SM can run; however, IIRC, there is another hard limit on the number of blocks per SM.
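A short sketch (assuming CUDA 11 or newer, where cudaDeviceProp::maxBlocksPerMultiProcessor is available) that queries the hard per-SM limits referred to above:

#include <stdio.h>
#include "cuda_runtime.h"

int main()
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Max threads per SM   : %d\n", prop.maxThreadsPerMultiProcessor);
printf("Max warps per SM     : %d\n", prop.maxThreadsPerMultiProcessor / prop.warpSize);
printf("Max blocks per SM    : %d\n", prop.maxBlocksPerMultiProcessor); //CUDA 11+
printf("Registers per SM     : %d\n", prop.regsPerMultiprocessor);
printf("Shared memory per SM : %zu bytes\n", prop.sharedMemPerMultiprocessor);
return 0;
}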
Finally, yes, it is complicated, and I don't remember many of the details. It is in fact so complicated that NVIDIA has distributed occupancy calculators in several different forms over CUDA's lifetime, including an Excel spreadsheet.
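For a rough feel of what those calculators report, here is a sketch (my own illustration with a hypothetical DummyKernel, not the asker's code) that sweeps block sizes and asks cudaOccupancyMaxActiveBlocksPerMultiprocessor what each one would yield:

#include <stdio.h>
#include "cuda_runtime.h"

__global__ void DummyKernel(int* p) { p[blockIdx.x * blockDim.x + threadIdx.x] = 0; }

int main()
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
const int maxWarpsPerSM = prop.maxThreadsPerMultiProcessor / prop.warpSize;
//Try power-of-two block sizes up to the per-block thread limit
for (int blockSize = 32; blockSize <= prop.maxThreadsPerBlock; blockSize *= 2)
{
int activeBlocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&activeBlocks, DummyKernel, blockSize, 0);
double occupancy = 100.0 * activeBlocks * (blockSize / prop.warpSize) / maxWarpsPerSM;
printf("block size %4d -> %d active blocks per SM, %.1f %% occupancy\n", blockSize, activeBlocks, occupancy);
}
return 0;
}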