CUPTI / CUDA prof_trigger 未按预期工作
Posted
技术标签:
【中文标题】CUPTI / CUDA prof_trigger 未按预期工作【英文标题】:CUPTI / CUDA prof_trigger not working as expected 【发布时间】:2012-07-01 02:14:50 【问题描述】:根据 NVidia,__prof_trigger() 调用是在 warp 级别执行的,或者换句话说,每个 warp 都会将指定的触发器增加 1。
所以我写了一个小内核来测试一下:
__global__ void kernel(int *arr)
__prof_trigger(00);
// from here, it's irrevelant to the question
int id = threadIdx.x + blockDim.x * blockIdx.x;
if (id >= N) return;
__prof_trigger(01);
if (arr[id] < 4) __prof_trigger(02);
else __prof_trigger(03);
我使用以下方法调用程序: ./prof_trigger_test [block_size] [event_name]
此时输入数组无关紧要(我只是在测试 prof_trigger_0,所以它甚至没有被使用)
我只测试了一个块(据我的理解,不同数量的块不应该影响这个问题,还是我错了?)并且事件名称是 CUPTI 给出的名称,或 prof_trigger_XX,它被翻译为运行时的事件 ID。
因此,鉴于经纱大小为 32,我应该期待类似:
./prof_trigger_test 1 prof_trigger_00 // expected to return 1
./prof_trigger_test 33 prof_trigger_00 // expected to return 2
prof_trigger_00 应该在我每次将线程数增加 32 时增加(这需要一个新的 warp)
事实并非如此。在我的笔记本电脑上运行时,使用 NVidia 9600M GT 而不是 32,我只需增加 4 即可看到计数器的增量值。并在远程集群上运行,使用 Tesla M2070,所需增量为 8
很明显我错过了一些东西。 GPU 是否出于某种原因创建了更小的扭曲(性能,我认为,虽然不确定如何)?
这里是完整的代码,根据要求:
#include <cuda.h>
#include <cupti.h>
#include <stdio.h>
#define N 10
#define CHECK_CU_ERROR(err, cufunc) \
if (err != CUDA_SUCCESS) \
printf("%s:%d: error %d for CUDA Driver API function '%s'\n", \
__FILE__, __LINE__, err, cufunc); \
exit(-1); \
#define CHECK_CUPTI_ERROR(err, cuptifunc) \
if (err != CUPTI_SUCCESS) \
const char *errstr; \
cuptiGetResultString(err, &errstr); \
printf("%s:%d:Error %s for CUPTI API function '%s'\n", \
__FILE__, __LINE__, errstr, cuptifunc); \
exit(-1); \
typedef struct cupti_eventData_st
CUpti_EventGroup eventGroup;
CUpti_EventID eventId;
cupti_eventData;
// Structure to hold data collected by callback
typedef struct RuntimeApiTrace_st
cupti_eventData *eventData;
uint64_t eventVal;
RuntimeApiTrace_t;
void CUPTIAPI getEventValueCallback(
void *userdata,
CUpti_CallbackDomain domain,
CUpti_CallbackId cbid,
const CUpti_CallbackData *cbInfo)
CUptiResult cuptiErr;
RuntimeApiTrace_t *traceData = (RuntimeApiTrace_t*) userdata;
size_t bytesRead;
// This callback is enabled for launch so we shouldn't see anything else.
if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020)
printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
exit(-1);
switch(cbInfo->callbackSite)
case CUPTI_API_ENTER:
cudaThreadSynchronize();
cuptiErr = cuptiSetEventCollectionMode(cbInfo->context, CUPTI_EVENT_COLLECTION_MODE_KERNEL);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiSetEventCollectionMode");
cuptiErr = cuptiEventGroupEnable(traceData->eventData->eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupEnable");
break;
case CUPTI_API_EXIT:
bytesRead = sizeof(uint64_t);
cudaThreadSynchronize();
cuptiErr = cuptiEventGroupReadEvent(traceData->eventData->eventGroup, CUPTI_EVENT_READ_FLAG_NONE, traceData->eventData->eventId, &bytesRead, &traceData->eventVal);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupReadEvent");
cuptiErr = cuptiEventGroupDisable(traceData->eventData->eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDisable");
break;
static void displayEventVal(RuntimeApiTrace_t *trace, char *eventName)
printf("Event Name: %s \n", eventName);
printf("Event Value: %llu\n", (unsigned long long) trace->eventVal);
__global__ void kernel(int *arr)
__prof_trigger(00);
int id = threadIdx.x + blockDim.x * blockIdx.x;
if (id >= N) return;
__prof_trigger(01);
if (arr[id] < 4) __prof_trigger(02);
else __prof_trigger(03);
int main(int argc, char **argv)
int deviceCount;
CUcontext context = 0;
CUdevice dev = 0;
char deviceName[32];
char *eventName;
CUptiResult cuptiErr;
CUpti_SubscriberHandle subscriber;
cupti_eventData cuptiEvent;
RuntimeApiTrace_t trace;
int cap_major, cap_minor;
CUresult err = cuInit(0);
CHECK_CU_ERROR(err, "cuInit");
err = cuDeviceGetCount(&deviceCount);
CHECK_CU_ERROR(err, "cuDeviceGetCount");
if (deviceCount == 0)
printf("There is no device supporting CUDA.\n");
return -2;
if (argc < 3)
printf("Usage: ./a.out <num_threads> <event_name>\n");
return -2;
err = cuDeviceGet(&dev, 0);
CHECK_CU_ERROR(err, "cuDeviceGet");
err = cuDeviceGetName(deviceName, 32, dev);
CHECK_CU_ERROR(err, "cuDeviceGetName");
err = cuDeviceComputeCapability(&cap_major, &cap_minor, dev);
CHECK_CU_ERROR(err, "cuDeviceComputeCapability");
printf("CUDA Device Name: %s\n", deviceName);
printf("CUDA Capability: %d.%d\n", cap_major, cap_minor);
err = cuCtxCreate(&context, 0, dev);
CHECK_CU_ERROR(err, "cuCtxCreate");
cuptiErr = cuptiEventGroupCreate(context, &cuptiEvent.eventGroup, 0);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupCreate");
int threads = atoi(argv[1]);
eventName = argv[2];
cuptiErr = cuptiEventGetIdFromName(dev, eventName, &cuptiEvent.eventId);
if (cuptiErr != CUPTI_SUCCESS)
printf("Invalid eventName: %s\n", eventName);
return -1;
cuptiErr = cuptiEventGroupAddEvent(cuptiEvent.eventGroup, cuptiEvent.eventId);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupAddEvent");
trace.eventData = &cuptiEvent;
cuptiErr = cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getEventValueCallback, &trace);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiSubscribe");
cuptiErr = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEnableCallback");
int host_arr[] = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9;
int *dev_arr;
cudaMalloc(&dev_arr, sizeof(int) * N);
cudaMemcpy(dev_arr, &host_arr, sizeof(int) * N, cudaMemcpyHostToDevice);
kernel<<< threads, 1 >>>(dev_arr);
displayEventVal(&trace, eventName);
trace.eventData = NULL;
cuptiErr = cuptiEventGroupRemoveEvent(cuptiEvent.eventGroup, cuptiEvent.eventId);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveEvent");
cuptiErr = cuptiEventGroupDestroy(cuptiEvent.eventGroup);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");
cuptiErr = cuptiUnsubscribe(subscriber);
CHECK_CUPTI_ERROR(cuptiErr, "cuptiUnsubscribe");
cudaDeviceSynchronize();
以及 SASS 代码:
Fatbin ptx code:
================
arch = sm_10
code version = [1,4]
producer = cuda
host = linux
compile_size = 64bit
identifier = cuda.cu
Fatbin elf code:
================
arch = sm_10
code version = [1,2]
producer = cuda
host = linux
compile_size = 64bit
identifier = cuda.cu
code for sm_10
Function : _Z6kernelPi
/*0000*/ ;
/*0008*/ /*0xf0000001e0000004*/ MOV.U16 R0H, g [0x1].U16;
/*0010*/ /*0x100042050023c780*/ I2I.U32.U16 R1, R0L;
/*0018*/ /*0xa000000504000780*/ IMAD.U16 R0, g [0x6].U16, R0H, R1;
/*0020*/ /*0x60014c0100204780*/ ISET.S32.C0 o [0x7f], R0, c [0x1] [0x0], LE;
/*0028*/ /*0x308001fd6c40c7c8*/ RET C0.EQU;
/*0030*/ ;
/*0038*/ /*0x3000000300000500*/ SHL R0, R0, 0x2;
/*0040*/ /*0xf0000401e0000004*/ IADD R0, g [0x4], R0;
/*0048*/ /*0x30020001c4100780*/ GLD.U32 R0, global14 [R0];
/*0050*/ /*0x2000c80104200780*/ ISET.S32.C0 o [0x7f], R0, c [0x1] [0x1], GT;
/*0058*/ /*0xd00e000180c00780*/ BRA C0.NE, 0x70;
/*0060*/ ;
/*0068*/ /*0x308101fd6c4107c8*/ RET;
/*0070*/ ;
/*0078*/ /*0x1000e00300000280*/ NOP;
............................
【问题讨论】:
如果我们不知道命令行选项的含义,那么为您的程序提供命令行是没有用的。您也没有向您的程序提供有关输入数据的信息。更完整的代码(主机代码)会有所帮助。 抱歉,这篇文章吃掉了我解释命令行用法的部分(禁止字符)。我现在修复了它,但我正在使用 ./prof_trigger_test [block_size] [event_name] 调用程序。输入数组无关紧要,但它是一个包含 10 个元素的数组(N=10) 主机代码仅包括创建数组、CUPTI 调用以检索计数器值(基于官方样本)和内核调用 请同时发布主机代码、SASS 代码 (cuobjdump -sass) 和驱动程序版本。编译器倾向于移动和消除 __prof_triggers。旧版本的 CUPTI 只收集 1 个 SM 的结果。最新版本可以收集所有 SM 的结果。 原始帖子已编辑,根据要求添加主机代码和 SASS 转储。也忘了提,但是除了所需的链接器和包含 CUDA 和 CUPTI 的路径之外,所有东西都是在没有优化标志或其他任何东西的情况下编译的 【参考方案1】:prof_trigger_XX 的集合因架构而异。在计算能力为 1.* 的设备上,计数器值仅从 1 个 SM 收集。在计算能力 >= 2 时,从所有 SM 收集计数器值。
GeForce 9600M GPU 是具有 4 个 SM 的 1.1 设备。
示例程序正在启动 [block_size] 块,每个块有 1 个线程。这有点令人困惑,因为我认为您的意思是启动 N 个线程的示例,每个块最多 1 个扭曲。这不是示例代码中实现的内容。
[block_size] prof_trigger_00 expected range
1-3 0-1
4-7 1-2
8-11 2-3
预期值是一个范围,因为无法保证观察到的 SM 是否会是第一个接收到被调度线程块的 SM。
【讨论】:
以上是关于CUPTI / CUDA prof_trigger 未按预期工作的主要内容,如果未能解决你的问题,请参考以下文章
TensorFlow couldn t open CUDA library cupti64 80 dll Intern
anaconda env:cupti64_100.dll找不到
CUDA - CUDA 驱动程序不足以支持 CUDA 运行时版本 6.0