CUDA：在内核代码中执行多次矩阵乘法运算

Question

矩阵乘法的函数：

/**
 * Dense matrix product c = a * b; each thread produces one element of c.
 *
 * Indexing in the body treats all three buffers as flat row-major arrays:
 * a is m x n, b is n x k and c is m x k.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension walks the columns of c
 * and the y dimension walks its rows, so the grid must cover at least
 * k threads in x and m threads in y. Surplus threads do nothing.
 * No shared memory is used.
 *
 * @param a  device-accessible input, read at a[row * n + i]
 * @param b  device-accessible input, read at b[i * k + col]
 * @param c  device-accessible output, written at c[row * k + col]
 * @param m  number of rows of a and c
 * @param n  number of columns of a / rows of b (length of each dot product)
 * @param k  number of columns of b and c
 */
__global__ void gpu_matrix_mult(float *a, float *b, float *c, int m, int n, int k)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Grid-tail guard: the launch rarely divides m and k evenly.
    if (row >= m || col >= k)
    {
        return;
    }

    // Flat offsets are computed in size_t so that matrices with more than
    // 2^31 elements do not overflow 32-bit int arithmetic.
    const size_t aRowStart = (size_t)row * n;

    float sum = 0.0f;
    for (int i = 0; i < n; i++)
    {
        sum += a[aRowStart + i] * b[(size_t)i * k + col];
    }

    c[(size_t)row * k + col] = sum;
}

然后在以下循环中调用该函数：

// Iterates "multiply on the GPU, threshold on the host" until the number of
// active cells (result >= 0.5) is the same in two consecutive passes.
int currentActivityCount = -1;

while (activityCount != currentActivityCount)
{
    cudaError_t status = cudaSuccess;

    // Once a previous pass has produced h_b_new, upload it as the new b.
    // The buffers hold float data (the kernel takes float*), hence sizeof(float).
    // NOTE(review): h_b_new holds m*k elements, while the kernel indexes b as
    // n x k - this is only consistent when m == n; confirm against the caller.
    if (currentActivityCount > -1)
    {
        status = cudaMemcpy(d_b, h_b_new, sizeof(float)*m*k, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess)
    {
        gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
        // A launch does not return an error itself; bad configurations
        // only show up through cudaGetLastError().
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: also waits for the kernel and reports its faults.
        status = cudaMemcpy(h_c, d_c, sizeof(float)*m*k, cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess)
    {
        // Do not threshold stale or garbage data after a failed CUDA call.
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        break;
    }

    currentActivityCount = activityCount;
    activityCount = 0;

    // Binarize the product into h_b_new and count the active cells.
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            const size_t cell = (size_t)i * k + j;
            const bool active = h_c[cell] >= 0.5;

            h_b_new[cell] = active ? 1 : 0;
            if (active)
            {
                activityCount++;
            }
        }
    }

    during++;
    printf("Count of activity: %d During: %d\n", activityCount, during);
}

我的目标是将此循环移动到“gpu_matrix_mult”函数内部，使主机与GPU之间的数据传输只发生两次（调用函数之前一次、之后一次），而不是在循环的每次迭代中都发生。我尝试过一些方法，但都没有效果。这种解决方案是否可行？