在 CUDA 中处理 4D 张量的内核

Posted 2023-02-22

技术标签:

【中文标题】在 CUDA 中处理 4D 张量的内核【英文标题】：Kernel for processing a 4D tensor in CUDA 【发布时间】：2015-12-30 12:15:51 【问题描述】：

我想编写一个内核来执行依赖于所有唯一四重索引 (ij|kl) 的计算。在主机上生成所有唯一四重奏的代码如下：

#include <iostream>


using namespace std;

int main(int argc, char** argv)

    unsigned int i,j,k,l,ltop;
    unsigned int nao=7;
    for(i=0;i<nao;i++)
    
        for(j=0;j<=i;j++)
        
            for(k=0;k<=i;k++)
            
                ltop=k;
                if(i==k)
                
                    ltop=j;
                
                for(l=0;l<=ltop; l++)
                
                    printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
                
                
        
    

    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl;

    return 0;

输出：

computing the ERI (0,0|0,0) 
computing the ERI (1,0|0,0) 
computing the ERI (1,0|1,0) 
computing the ERI (1,1|0,0) 
computing the ERI (1,1|1,0) 
computing the ERI (1,1|1,1) 
computing the ERI (2,0|0,0) 
computing the ERI (2,0|1,0) 
computing the ERI (2,0|1,1) 
computing the ERI (2,0|2,0) 
computing the ERI (2,1|0,0) 
computing the ERI (2,1|1,0) 
computing the ERI (2,1|1,1) 
computing the ERI (2,1|2,0) 
computing the ERI (2,1|2,1) 
computing the ERI (2,2|0,0) 
computing the ERI (2,2|1,0) 
computing the ERI (2,2|1,1) 
computing the ERI (2,2|2,0) 
computing the ERI (2,2|2,1) 
computing the ERI (2,2|2,2) 
computing the ERI (3,0|0,0) 
computing the ERI (3,0|1,0) 
computing the ERI (3,0|1,1) 
computing the ERI (3,0|2,0) 
computing the ERI (3,0|2,1) 
computing the ERI (3,0|2,2) 
computing the ERI (3,0|3,0) 
computing the ERI (3,1|0,0) 
computing the ERI (3,1|1,0) 
computing the ERI (3,1|1,1) 
computing the ERI (3,1|2,0) 
computing the ERI (3,1|2,1) 
computing the ERI (3,1|2,2) 
computing the ERI (3,1|3,0) 
computing the ERI (3,1|3,1) 
computing the ERI (3,2|0,0) 
computing the ERI (3,2|1,0) 
computing the ERI (3,2|1,1) 
computing the ERI (3,2|2,0) 
computing the ERI (3,2|2,1) 
computing the ERI (3,2|2,2) 
computing the ERI (3,2|3,0) 
computing the ERI (3,2|3,1) 
computing the ERI (3,2|3,2) 
computing the ERI (3,3|0,0) 
computing the ERI (3,3|1,0) 
computing the ERI (3,3|1,1) 
computing the ERI (3,3|2,0) 
computing the ERI (3,3|2,1) 
computing the ERI (3,3|2,2) 
computing the ERI (3,3|3,0) 
computing the ERI (3,3|3,1) 
computing the ERI (3,3|3,2) 
computing the ERI (3,3|3,3)

我想使用 CUDA 内核恢复同一组四重奏，但我无法得到它。我现在拥有的CUDA代码如下

#include <iostream>
#include <stdio.h>

using namespace std;

#define ABS(x)  (x<0)?-x:x

__global__
void test_kernel(int basis_size)

    unsigned int i_idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int j_idx = threadIdx.y + blockIdx.y * blockDim.y;


    // Building the quartets                                                                                                                                                  

    unsigned int i_orbital, j_orbital, k_orbital, l_orbital, i__1,j__1;
    unsigned int i_primitive, j_primitive, k_primitive, l_primitive;

    i_orbital = (i_idx + 1)%basis_size /*+1*/;
    j_orbital = (i__1 = (i_idx) / basis_size, ABS(i__1)) /*+ 1*/;

    k_orbital = (j_idx+1)%basis_size /*+1*/;
    l_orbital = (j__1 = (j_idx) / basis_size, ABS(j__1)) /*+ 1*/;
    unsigned int ltop;
    ltop=k_orbital;
    if(i_orbital==k_orbital)
    
        ltop=j_orbital;
    
    if(i_orbital<basis_size && j_orbital<=i_orbital && k_orbital<=i_orbital && l_orbital<=ltop)
        printf("computing the ERI (%d,%d|%d,%d)   \n", i_orbital, j_orbital,k_orbital,l_orbital);


int main(int argc, char *argv[])

    int nao = 7;
    cudaDeviceReset();
    /* partitioning from blocks to grids */
    int dimx = 8;
    int dimy = 8;
    dim3 block(dimx, dimy);   // we will try blocks of 8x8 threads                                                                                                            
    dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);   // The grids are shaped accordingly                                                                        

    /* Launch the kernel */
    test_kernel<<<grid,block>>>(nao);
    cudaDeviceReset();

    return 0;

输出：

computing the ERI (2,0|1,1)    
computing the ERI (3,1|3,1)    
computing the ERI (3,0|1,1)    
computing the ERI (1,1|1,1)    
computing the ERI (2,1|1,1)    
computing the ERI (3,1|1,1)    
computing the ERI (3,0|2,1)    
computing the ERI (2,1|2,1)    
computing the ERI (3,1|2,1)    
computing the ERI (1,0|1,0)    
computing the ERI (2,0|1,0)    
computing the ERI (3,0|1,0)    
computing the ERI (1,1|1,0)    
computing the ERI (2,1|1,0)    
computing the ERI (3,1|1,0)    
computing the ERI (2,0|2,0)    
computing the ERI (3,0|3,0)    
computing the ERI (3,0|2,0)    
computing the ERI (2,1|2,0)    
computing the ERI (3,1|3,0)    
computing the ERI (3,1|2,0)    
computing the ERI (1,0|0,0)    
computing the ERI (2,0|0,0)    
computing the ERI (3,0|0,0)    
computing the ERI (0,0|0,0)    
computing the ERI (1,1|0,0)    
computing the ERI (2,1|0,0)    
computing the ERI (3,1|0,0)

四重奏将驱动斥力积分的计算，其输入参数存储在大小为 3 的 nao 数组中

【问题讨论】：

这可能是 CUDA printf 的输出缓冲区溢出的结果。 【参考方案1】：

如前所述，您的代码并没有“做太多”。它生成索引并将它们打印出来。此外，尚不清楚nao=7 是否真的描述了您的最终问题规模，还是仅用于演示目的。

纯粹从索引生成的角度来看，有很多方法可以解决您的问题。其中一些可能自然而然地有助于高效使用 GPU，而另一些则可能不会。但是从您到目前为止显示的代码中无法真正确定 GPU 的有效使用。例如，CUDA 程序的理想目标之一是从全局内存中合并访问。由于您的代码没有显示任何内容，因此在不知道生成的访问模式可能是什么的情况下提出解决方案有点冒险。

因此，在对索引生成做出最终决定之前，您尝试解决的“真正问题”可能需要考虑一些考虑因素（这实际上等同于“将工作分配给给定线程”） .我会尝试提及其中的一些。

对您的代码的一些批评：

您提出的实现建议使用 2D 线程块和网格结构来有效地提供程序使用的 i,j 索引。但是，通过查看原始 C 源代码，我们看到 j 索引只覆盖到 i 索引的空间：

for(i=0;i<nao;i++)

    for(j=0;j<=i;j++)

但是，用 2D 网格/线程块结构替换这种结构会创建一个矩形空间，原始 C 代码中的嵌套 for 循环并未暗示这一点。因此，启动这样的空间/网格将导致创建超出范围的 i,j 索引；这些线程不需要做任何工作。我们可以通过“重新利用”超出范围的线程来覆盖一些额外的空间来解决这个问题（也许这就是你的代码试图做的），但这会导致相当复杂且难以阅读的算术，并且看下一个批评。

您的内核中没有循环，因此每个线程似乎只能生成一行打印输出。因此，每个线程只能负责一个 ERI 条目。根据您提供的 C 源代码，对于大小为 7 的 nao，您预计有 406 个 ERI 条目（对于您显示的代码，您发布的打印输出似乎不完整，顺便说一句）。如果我们每个线程有一个打印输出，并且我们需要覆盖 406 个 ERI 条目的空间，我们最好至少有 406 个线程，否则我们提出的解决方案根本无法工作。如果我们检查您的代码以确定线程方面的网格大小：

int dimx = 8;
int dimy = 8;
dim3 block(dimx, dimy);   // we will try blocks of 8x8 threads                                                                                                            
dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);   // The grids are shaped accordingly

我们将得出结论，您根本没有启动足够的线程。如果您完成上述计算（如果不确定，您可能想要打印出 block.x,.y 和 grid.x,.y 值），您会发现您正在启动 one 8x8 线程块，即总共 64 个线程，nao 共 7 个。64 个线程，每个线程一个打印输出，不能覆盖 406 个条目的 ERI 空间。事实上，鉴于您的printf 受if 语句的限制，可能每个线程的打印输出少于一个。

可能的解决思路：

一种相当简单的方法是将 C 源代码中的外部两个 for 循环映射到 2D 网格的 x,y 索引。然后，我们将或多或少完整地保留您的 C 源代码的内部 for 循环结构，作为我们的内核代码。这很容易写。它的缺点是它会启动一些什么都不做的线程（我们必须检查 j>i 的条件，并且条件这样的线程什么都不做），但这可能是一个次要的考虑。一个更大的问题可能是我们生成了多少实际线程。对于您给定的 7 个nao，这将启动 7x7 = 49 个线程（其中一些不起作用）。对于 GPU 来说，49 甚至 406 线程的问题规模很小，由于线程数量有限，它无法达到峰值性能。这个问题的“三角形”性质（jlinear-to-triangular index mapping，这将导致没有“浪费”的线程。然而，一个比大约 50% 浪费线程更重要的考虑因素是全局访问的合并，正如已经讨论的那样。

另一种可能的方法是使用@AngryLettuce 在对您之前的一个问题（现在可能已删除）的评论中建议的方法。具体来说，生成一维网格和一维全局唯一线程索引，然后使用算术计算必要的子索引（i，j，k，l），将一维索引转换为子索引。这样做的好处是，对于小问题，我们可以直接启动更大的内核。例如，对于 406 个 ERI 条目的问题空间，我们将生成（至少）406 个线程，全局唯一索引为 0..405，并将该一维索引转换为 4 个子索引（i、j、k、 l)。查看您的内核代码，这可能是您正在尝试做的事情。但是，由于您的空间形状奇特，我认为从线性索引（或任何一组矩形索引）转换为形状奇特的空间的算法会很复杂。

如果您的实际问题空间很大（nao 远大于 7），那么我个人会选择第一种方法，因为它的代码可读性更高（IMO）并且易于编写/维护。对于小的问题空间，第二种方法可能更好，原因已经讨论过了。对于上述任何一种方法，我们都希望研究生成的全局内存访问模式。这将取决于您未显示的数据组织。第一种方法可能更容易编写以生成合并访问，但在第二种方法中仔细使用索引生成算法应该（理论上）允许您实现您希望的任何访问模式。

下面是方法一的完整代码：

#include <stdio.h>
#include <iostream>

using namespace std;

__global__ void kernel(unsigned int nao)

    unsigned i = threadIdx.x+blockDim.x*blockIdx.x;
    unsigned j = threadIdx.y+blockDim.y*blockIdx.y;
    if (i >= nao) return;
    if (j > i) return; // modify if use of __syncthreads() after this

    unsigned int k,l,ltop;
            for(k=0;k<=i;k++)
            
                ltop=k;
                if(i==k)
                
                    ltop=j;
                
                for(l=0;l<=ltop; l++)
                
                    printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
                
            



int main()
    unsigned int nao = 7;
    dim3 block(4,4);
    dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);
    kernel<<<grid,block>>>(nao);
    cudaDeviceSynchronize();
    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl;

    return 0;

请注意，为了便于在此代码中演示（对于不工作的线程），我使用了条件返回语句。但是，如果您需要在内核中使用__syncthreads()，您将需要修改内核结构以避免条件返回，而是允许“越界”线程在不工作的情况下继续通过内核代码。关于__syncthreads() 用法的许多其他 SO 问题都涵盖了这个概念。另请注意，内核打印输出可以按任何顺序发生（就像线程可以按任何顺序执行一样），所以我只验证了上述方法似乎有效并产生了所需的 406 行打印输出。更好的验证方法是避免使用 printf 并改用计数矩阵。

对于第二种方法，从单个线性索引到多维空间（具有 1:1 映射并且没有浪费/未使用的索引）的转换相当复杂，正如已经提到的，因为您的多维空间是“奇形怪状”。我没有从线性索引转换为子索引的“好”方法。我必须根据 i,j,k 索引为 l 循环绘制各种问题空间维度大小：

i=0, j=0, k=0, l=0  (total 1 iteration for i=0)

l loop limit, for i=1: (total 5 iterations for i=1)
     j
    0 1
k 0 0 0
  1 0 1

l loop limit, for i=2: (total 15 iterations for i=2)
     j
    0 1 2
k 0 0 0 0
  1 1 1 1
  2 0 1 2

l loop limit, for i=3:  (total 34 iterations for i=3)
     j
    0 1 2 3
k 0 0 0 0 0
  1 1 1 1 1
  2 2 2 2 2
  3 0 1 2 3

 etc.

（请注意，另一种映射方法是将上面每个矩阵的最后一行视为等于k 的常数，就像其他行一样，这将导致一些线程浪费但大大简化了下面的索引计算。我们将简要讨论不浪费线程方法，但最终会给出一个使用浪费线程方法的示例，因为索引计算在算术和时间复杂度上都大大简化了）。

研究以上内容，我们看到映射到给定i循环迭代的迭代次数为：

[((i+2)*(i+1)/2)*(i+1)] - (i+1)*i/2

或：

(((i+2)*(i+1)-i)/2)*(i+1)

上述计算可用于根据线性索引确定i 的索引。我知道使用上述断点剖析线性索引的唯一方法是通过二进制搜索。

然后我们将重复上述方法（确定每个“级别”的迭代次数，并使用二进制搜索将线性索引映射到级别索引）以计算下一个索引 j 和 k。剩余的数量将是 l-index。

为了简化余下的讨论，我改用修改后的映射版本：

l loop limit, for i=1: (total 6 iterations for i=1)
     j
    0 1
k 0 0 0
  1 1 1

l loop limit, for i=2: (total 18 iterations for i=2)
     j
    0 1 2
k 0 0 0 0
  1 1 1 1
  2 2 2 2

l loop limit, for i=3:  (total 40 iterations for i=3)
     j
    0 1 2 3
k 0 0 0 0 0
  1 1 1 1 1
  2 2 2 2 2
  3 3 3 3 3

 etc.

从中我们将有一些浪费的线程，我们将在内核中使用 if 条件进行处理。

上面每个i-level的迭代次数就是：

(i+2)*(i+1)*(i+1)/2

一旦给定线性索引（在summation of the above polynomial 上使用二分搜索），我们从上述公式计算出 i-index，然后我们可以很容易地计算下一个索引 (j)，方法是将剩余空间划分为 i大小相等的碎片。可以使用triangular mapping method 找到下一个 k 索引，然后剩余空间成为我们的 l 循环范围。但是，我们必须记住按照前面讨论的那样调整这个 l 索引。

这是方法2的完整代码：

#include <stdio.h>
#include <iostream>

// the closed-form summation of the polynomial (i+2)*(i+1)*(i+1)/2 from 0 to n
__host__ __device__ unsigned my_func(unsigned i) return 1+(2*i+(((i*(i+1))/2)*((i*(i+1))/2)) + (i*(i+1)*((2*i)+1)*2)/3 + ((5*i*(i+1))/2))/2; 

// binary search
__host__ __device__ unsigned bsearch_functional(const unsigned key, const unsigned len_a, unsigned (*func)(unsigned))
  unsigned lower = 0;
  unsigned upper = len_a;
  unsigned midpt;
  while (lower < upper)
    midpt = (lower + upper)>>1;
    if (func(midpt) <= key) lower = midpt +1;
    else upper = midpt;
    
  return lower;


// conversion of linear index to triangular matrix (row,col) index
__host__ __device__ void linear_to_triangular(const unsigned i, const unsigned n, unsigned *trow, unsigned *tcol)

    int c = i;
    int row = c/(n-1);
    int col = c%(n-1);

    if (col < row) 
      col = (n-2)-col;
      row = (n-1)-row;
    

    *tcol = col+1;
    *trow = row;




__global__ void kernel(unsigned int nao, unsigned int eris_size)

    unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < eris_size)
    // compute i-index via binary search
    unsigned int i = bsearch_functional(idx, nao, my_func);
    // compute j-index via division of the space by i;
    unsigned int j = (idx==0)?0:(idx-my_func(i-1))/((my_func(i)-my_func(i-1))/(i+1));
    unsigned k,l;
    linear_to_triangular((idx-my_func(i-1)-(j *((my_func(i)-my_func(i-1))/(i+1)))), (i+2), &k, &l);
    k = i-k;
    l = (i+1)-l;
    if (idx == 0) k=0; l=0;
    if (l <= ((i==k)?j:k))
      printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
    


int main()
    unsigned int nao = 7;
    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    const int nTPB = 64;
    std::cout<<"The total size of the sack of ERIs is: "<<eris_size<<std::endl;
    int mod_eris_size = my_func(nao-1);
    kernel<<<mod_eris_size+nTPB-1/nTPB, nTPB>>>(nao, mod_eris_size);
    cudaDeviceSynchronize();
    return 0;

需要明确的是，我不确切知道您的任务是什么，并且我不保证这些示例对于任何给定的用法都没有错误。我的目的不是给你一个黑匣子，而是解释一些可能的编程方法。我没有做严格的验证，只是观察到每个方法产生了 406 行 ERI 输出，和你原来的 C 代码一样。

最后，为了简洁起见，我省略了proper cuda error checking，但是任何时候您在使用 cuda 代码时遇到问题，我建议您使用它并使用 cuda-memcheck 运行您的代码。

【讨论】：

非常感谢您的建议。我希望能够管理 1015 时串行代码开始变慢。四重奏 (ij|kl) 中的每个整数都指向数组中的索引，该数组包含用于计算一个整数的输入参数。对于nao~100，第一种方法可能更容易使用，并且应该会产生良好的效果。对于nao~20或更少，您可能想尝试第二种方法。我使用这两种方法运行测试计算并且效果很好，但是出现了一个新的困难：GPU 中的可用内存通常不足以一次存储所有结果。是否可以通过不重叠的索引块来调整方法 1 和方法 2？我认为方法 1 很容易适应。如果您可以编写原始的 4 嵌套循环 C 代码来处理块，那么您应该能够重新编写方法 1 内核来处理块。也许你应该问一个新问题。

以上是关于在 CUDA 中处理 4D 张量的内核的主要内容，如果未能解决你的问题，请参考以下文章