如何使用推力和 CUDA 流将内存从主机异步复制到设备

Posted 2023-02-19

技术标签:

【中文标题】如何使用推力和 CUDA 流将内存从主机异步复制到设备【英文标题】：How to asynchronously copy memory from the host to the device using thrust and CUDA streams 【发布时间】：2014-09-23 17:19:21 【问题描述】：

我想使用推力将内存从主机复制到设备

thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);
thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

使用 CUDA 流类似于使用流将内存从设备复制到设备：

cudaStream_t s;
cudaStreamCreate(&s);

thrust::device_vector<float> d_vec1(1 << 28), d_vec2(1 << 28);
thrust::copy(thrust::cuda::par.on(s), d_vec1.begin(), d_vec1.end(), d_vec2.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);

问题是我无法将执行策略设置为 CUDA 以在从主机复制到设备时指定流，因为在这种情况下，thrust 会假定两个向量都存储在设备上。有没有办法解决这个问题？我正在使用来自 github 的最新推力版本（它在 version.h 文件中显示为 1.8）。

【问题讨论】：

announcement I read 听起来像是为底层内核调用实现的流，不一定是全面推进。如果您确实使用流从主机向量复制到设备向量，那么您可能希望在主机上使用pinned allocator。因此，我相信您所要求的可以通过推力矢量和cudaMemcpyAsync 来实现。是的，您应该按照 Robert 的建议直接使用 cudaMemcpyAsync。今天（2016 年 5 月），我在这里找到了文档中的第一个条目：thrust.github.io/doc/… 真的很烦人。它说我们可以写thrust::copy(thrust::cuda::par.on(cudaStream), HostPtr, HostPtr+size, DevicePtr);作为一种有效的语法，不会让您的副本异步发布到您传入参数的特定流... 【参考方案1】：

如 cmets 中所示，我认为直接使用 thrust::copy 是不可能的。但是我们可以在推力应用中使用cudaMemcpyAsync来实现异步复制和复制与计算重叠的目标。

这是一个有效的例子：

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>

// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2

typedef int mytype;
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;

struct sum_functor

  mytype *dptr;
  sum_functor(mytype* _dptr) : dptr(_dptr) ;
  __host__ __device__ void operator()(mytype &data) const
    
      mytype result = data;
      for (int j = 0; j < LSIZE; j++)
        for (int i = 0; i < SSIZE; i++)
          result += dptr[i];
      data = result;
    
;

int main()

  pinnedVector hi1(DSIZE);
  pinnedVector hi2(DSIZE);
  pinnedVector ho1(DSIZE);
  pinnedVector ho2(DSIZE);
  thrust::device_vector<mytype> di1(DSIZE);
  thrust::device_vector<mytype> di2(DSIZE);
  thrust::device_vector<mytype> do1(DSIZE);
  thrust::device_vector<mytype> do2(DSIZE);
  thrust::device_vector<mytype> dc1(KSIZE);
  thrust::device_vector<mytype> dc2(KSIZE);

  thrust::fill(hi1.begin(), hi1.end(),  TV1);
  thrust::fill(hi2.begin(), hi2.end(),  TV2);
  thrust::sequence(do1.begin(), do1.end());
  thrust::sequence(do2.begin(), do2.end());

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1); cudaStreamCreate(&s2);

  cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);

  thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
  thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));

  cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);

  cudaDeviceSynchronize();
  for (int i=0; i < KSIZE; i++)
    if (ho1[i] != ((LSIZE*SSIZE*TV1) + i))  std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((DSIZE*TV1)+i) << std::endl; return 1;
    if (ho2[i] != ((LSIZE*SSIZE*TV2) + i))  std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((DSIZE*TV2)+i) << std::endl; return 1;
    
  std::cout << "Success!" << std::endl;
  return 0;

对于我的测试用例，我使用了 RHEL5.5、Quadro5000 和 cuda 6.5RC。此示例旨在让推力创建非常小的内核（只有一个线程块，只要 KSIZE 很小，比如 32 或 64），以便推力从 thrust::for_each 创建的内核能够同时运行。

当我分析这段代码时，我看到：

这表明我们正在实现推力内核之间以及复制操作和推力内核之间的适当重叠，以及内核完成时的异步数据复制。请注意，cudaDeviceSynchronize() 操作“填充”了时间线，表明所有异步操作（数据复制、推力函数）都是异步发出的，并且在任何操作进行之前控制返回给主机线程。所有这些都是预期的，主机、GPU 和数据复制操作之间完全并发的正确行为。

【讨论】：

是否需要使用 pf pinned_allocator 才能使 cudaMemcpyAsync 在 thrust::host_vector 上正常工作？如果我使用标准的thrust::host_vector 会发生什么？标准 host_vector 使用未固定（即非页面锁定）分配器。这意味着当您尝试执行cudaMemcpyAsync 时，操作将不会是异步的。如果您试图将该操作与其他操作重叠，它不会重叠。看看here和here 请注意，CUDA 7 附带的推力版本有一个issue，在某些情况下会阻止将推力内核正确发布到流。解决方法是 1. 将 CUDA 7 上的推力更新为 current development version（其中包括对该问题的修复），或 2. 恢复到 CUDA 6.5（或升级到某些未来的 CUDA 工具包版本，当它可用时。 ) 这是在 7.5 中修复还是仍然存在？是的，CUDA 7.5 附带的推力版本已经修复了这个问题。【参考方案2】：

这是一个使用 thrust::cuda::experimental::pinned_allocator<T> 的工作示例：

// Compile with:
// nvcc --std=c++11 mem_async.cu -o mem_async

#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/fill.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#define LEN 1024

int main(int argc, char *argv[]) 
    thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float>> h_vec(LEN);
    thrust::device_vector<float> d_vec(LEN);

    thrust::fill(d_vec.begin(), d_vec.end(), -1.0);

    cudaMemcpyAsync(thrust::raw_pointer_cast(h_vec.data()),
                    thrust::raw_pointer_cast(d_vec.data()),
                    d_vec.size()*sizeof(float),
                    cudaMemcpyDeviceToHost);

    // Comment out this line to see what happens.
    cudaDeviceSynchronize();

    std::cout << h_vec[0] << std::endl;

注释掉同步步骤，由于异步内存传输，您应该将0 打印到控制台。

【讨论】：