深度学习部署笔记(十五): CUDA_Run_Time_API_parallel_多流并行，以及多流之间互相同步等待的操作方式

Posted 2023-03-30 智障学AI

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了深度学习部署笔记(十五): CUDA_Run_Time_API_parallel_多流并行，以及多流之间互相同步等待的操作方式相关的知识，希望对你有一定的参考价值。

// CUDA运行时头文件
#include <cuda_runtime.h>

#include <chrono>
#include <stdio.h>
#include <string.h>

using namespace std;

#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line)
    if(code != cudaSuccess)    
        const char* err_name = cudaGetErrorName(code);    
        const char* err_message = cudaGetErrorString(code);  
        printf("runtime error %s:%d  %s failed. \\n  code = %s, message = %s\\n", file, line, op, err_name, err_message);   
        return false;
    
    return true;


__global__ void add_vector(const float* a, const float* b, float* c, int count)

    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if(index >= count) return;
    c[index] = a[index] + b[index];


__global__ void mul_vector(const float* a, const float* b, float* c, int count)

    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if(index >= count) return;
    c[index] = a[index] * b[index];


cudaStream_t stream1, stream2;
float *a, *b, *c1, *c2;
const int num_element = 100000;
const size_t bytes = sizeof(float) * num_element;
const int blocks = 512;
const int grids = (num_element + blocks - 1) / blocks;
const int ntry = 1000;

// 多个流异步
void async()

    cudaEvent_t event_start1, event_stop1;
    cudaEvent_t event_start2, event_stop2;
    checkRuntime(cudaEventCreate(&event_start1));
    checkRuntime(cudaEventCreate(&event_stop1));
    checkRuntime(cudaEventCreate(&event_start2));
    checkRuntime(cudaEventCreate(&event_stop2));

    auto tic = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;
    checkRuntime(cudaEventRecord(event_start1, stream1));
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream1>>>(a, b, c1, num_element);
    checkRuntime(cudaEventRecord(event_stop1, stream1));
    
    checkRuntime(cudaEventRecord(event_start2, stream2));
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream2>>>(a, b, c2, num_element);
    checkRuntime(cudaEventRecord(event_stop2, stream2));

    checkRuntime(cudaStreamSynchronize(stream1));
    checkRuntime(cudaStreamSynchronize(stream2));
    auto toc = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;

    float time1, time2;
    checkRuntime(cudaEventElapsedTime(&time1, event_start1, event_stop1));
    checkRuntime(cudaEventElapsedTime(&time2, event_start2, event_stop2));
    printf("async: time1 = %.2f ms, time2 = %.2f ms, count = %.2f ms\\n", time1, time2, toc - tic);


// 单个流串行
void sync()

    cudaEvent_t event_start1, event_stop1;
    checkRuntime(cudaEventCreate(&event_start1));
    checkRuntime(cudaEventCreate(&event_stop1));

    auto tic = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;
    checkRuntime(cudaEventRecord(event_start1, stream1));
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream1>>>(a, b, c1, num_element);
    
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream1>>>(a, b, c2, num_element);
    checkRuntime(cudaEventRecord(event_stop1, stream1));

    checkRuntime(cudaStreamSynchronize(stream1));
    auto toc = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;

    float time1;
    checkRuntime(cudaEventElapsedTime(&time1, event_start1, event_stop1));
    printf("sync: time1 = %.2f ms, count = %.2f ms\\n", time1, toc - tic);


// 多个流之间并行
void multi_stream_async()

    // 这个案例主要实现多个流之间互相等待，使用event控制实现
    // 存在step1  ->  step2 \\ 
    //                      ->  step3   ->  step4
    //               stepa / 
    //
    // 这个案例中，存在流程1：step1 -> step2的流程
    //           存在流程2：stepa
    //           存在流程3：step3 -> step4，step3要求step2与stepa作为输入
    // 此时，可以让流程1使用stream1，流程2使用stream2，而流程3继续使用stream1，仅仅在stream1中加入等待（event的等待）

    // step1 = add_vector
    // step2 = mul_vector
    // step3 = add_vector
    // step4 = mul_vector
    // stepa = add_vector
    #define step1 add_vector
    #define step2 mul_vector
    #define step3 add_vector
    #define step4 mul_vector
    #define stepa add_vector

    cudaEvent_t event_async;
    checkRuntime(cudaEventCreate(&event_async));

    // stream1的执行流程
    step1<<<grids, blocks, 0, stream1>>>(a, b, c1, num_element);
    step2<<<grids, blocks, 0, stream1>>>(a, b, c1, num_element);

    // 等待event_async有事件
    checkRuntime(cudaStreamWaitEvent(stream1, event_async));
    step3<<<grids, blocks, 0, stream1>>>(a, b, c2, num_element);
    step4<<<grids, blocks, 0, stream1>>>(a, b, c2, num_element);

    // stream2的执行流程
    stepa<<<grids, blocks, 0, stream2>>>(a, b, c2, num_element);
    
    // 为event_async触发事件，通知cudaStreamWaitEvent函数可以继续了
    checkRuntime(cudaEventRecord(event_async, stream2));
    checkRuntime(cudaStreamSynchronize(stream1));

    printf("multi_stream_async done.\\n");


int main()

    // 本程序实现两个核函数的并行，通过多个流实现
    
    checkRuntime(cudaStreamCreate(&stream1));
    checkRuntime(cudaStreamCreate(&stream2));

    checkRuntime(cudaMalloc(&a, bytes));
    checkRuntime(cudaMalloc(&b, bytes));
    checkRuntime(cudaMalloc(&c1, bytes));
    checkRuntime(cudaMalloc(&c2, bytes));

    // 演示多流之间的异步执行
    async();

    // 演示单个流内的同步执行
    sync();

    // 演示多个流之间互相等待的操作
    multi_stream_async();
    return 0;

2. 单个流串行

void sync()

    cudaEvent_t event_start1, event_stop1;
    checkRuntime(cudaEventCreate(&event_start1));
    checkRuntime(cudaEventCreate(&event_stop1));

    auto tic = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;
    checkRuntime(cudaEventRecord(event_start1, stream1));
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream1>>>(a, b, c1, num_element);
    
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream1>>>(a, b, c2, num_element);
    checkRuntime(cudaEventRecord(event_stop1, stream1));

    checkRuntime(cudaStreamSynchronize(stream1));
    auto toc = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;

    float time1;
    checkRuntime(cudaEventElapsedTime(&time1, event_start1, event_stop1));
    printf("sync: time1 = %.2f ms, count = %.2f ms\\n", time1, toc - tic);

cuda count time: 12.26, cpp count time: 12.28

这个函数演示了单个流中的同步执行，具体解释如下：

cudaEvent_t 是 CUDA Runtime API 中的一个结构体，定义在 cuda_runtime_api.h 中。它用于表示一个 CUDA 事件对象，用于记录 GPU 上某个时间点的状态。

CUDA 事件可以用于两种目的：

记录一个时间点（如开始时间点或结束时间点）。
记录一个时间间隔（即时间差）。
通常情况下，CUDA 事件被用于在主机和设备之间进行同步，或在设备内部进行同步。例如，可以在主机代码中调用 cudaEventRecord() 来记录一个事件，然后在设备代码中使用 cudaStreamWaitEvent() 等待该事件，以确保某些设备操作发生在之前记录的事件之后。又或者，可以在设备代码中记录两个事件，然后在主机代码中使用 cudaEventElapsedTime() 计算它们之间的时间差。

首先创建两个事件 event_start1 和 event_stop1，用于记录同步执行的时间；

使用 cudaEventRecord 将 event_start1 记录在 stream1 中，表示从这个时间点开始，将会执行在 stream1 中的操作；

使用 for 循环调用 add_vector 核函数，在 stream1 中执行 ntry 次，计算向量 a 和 b 的加和，存储在向量 c1 和 c2 中；

使用 cudaEventRecord 将 event_stop1 记录在 stream1 中，表示到达这个时间点，stream1 中的操作都已经完成；

使用 cudaStreamSynchronize 等待 stream1 中的所有操作执行完毕；

计算同步执行的时间 time1，并输出时间和整个操作的时间。

可以看到，这个函数中只使用了一个流，因此 add_vector 的计算是按照顺序执行的，不能充分发挥 GPU 的并行计算能力。因此，这个函数的计算时间会比异步执行的 async 函数要长

这段代码中使用了两种方法来计算代码执行的时间。

第一种方法是使用了C++标准库中的chrono库来计算代码执行的起始时间和终止时间，通过计算时间差得到代码执行的时间，这个方法在计算异步执行时比较方便，因为我们需要分别记录多个异步操作的起始时间和终止时间。

第二种方法是使用了CUDA提供的API cudaEventElapsedTime，这个API可以计算CUDA事件的时间差，用于计算CUDA事件执行的时间。在这个例子中，我们使用了这个API来计算在单个流上串行执行的时间。

3. 向量相加相乘的kernel function

__global__ void add_vector(const float* a, const float* b, float* c, int count)

    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if(index >= count) return;
    c[index] = a[index] + b[index];


__global__ void mul_vector(const float* a, const float* b, float* c, int count)

    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if(index >= count) return;
    c[index] = a[index] * b[index];

count 是用来限制线程不要访问到超出数组的地址，因为数组的长度在我们开辟的时候就已经定义好了

checkRuntime(cudaMalloc(&a, bytes));

count是num_element, byte是num_element * sizeof(float), 超出地址会访问到虚拟地址

4. 多个流的异步

void async()

    cudaEvent_t event_start1, event_stop1;
    cudaEvent_t event_start2, event_stop2;
    checkRuntime(cudaEventCreate(&event_start1));
    checkRuntime(cudaEventCreate(&event_stop1));
    checkRuntime(cudaEventCreate(&event_start2));
    checkRuntime(cudaEventCreate(&event_stop2));

    auto tic = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;
    checkRuntime(cudaEventRecord(event_start1, stream1));
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream1>>>(a, b, c1, num_element);
    checkRuntime(cudaEventRecord(event_stop1, stream1));
    
    checkRuntime(cudaEventRecord(event_start2, stream2));
    for(int i = 0; i < ntry; ++i)
        add_vector<<<grids, blocks, 0, stream2>>>(a, b, c2, num_element);
    checkRuntime(cudaEventRecord(event_stop2, stream2));

    checkRuntime(cudaStreamSynchronize(stream1));
    checkRuntime(cudaStreamSynchronize(stream2));
    auto toc = chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count() / 1000.0;

    float time1, time2;
    checkRuntime(cudaEventElapsedTime(&time1, event_start1, event_stop1));
    checkRuntime(cudaEventElapsedTime(&time2, event_start2, event_stop2));
    printf("async: time1 = %.2f ms, time2 = %.2f ms, count = %.2f ms\\n", time1, time2, toc - tic);

async: time1 = 6.97 ms, time2 = 6.94 ms, count = 9.32 ms

输出的内容中包含了在两个流上异步执行的两个内核函数的时间，分别为time1和time2，它们的值应该是相当接近的。同时，输出中还包含了整个函数执行的总时间count，可以看出相比于同步执行的情况，异步执行使得程序的总执行时间更短，效率更高。

5. 多个流之间互相等待的操作

// 这个案例主要实现多个流之间互相等待，使用event控制实现
    // 存在step1  ->  step2 \\ 
    //                      ->  step3   ->  step4
    //               stepa / 
    //
    // 这个案例中，存在流程1：step1 -> step2的流程
    //           存在流程2：stepa
    //           存在流程3：step3 -> step4，step3要求step2与stepa作为输入
    // 此时，可以让流程1使用stream1，流程2使用stream2，而流程3继续使用stream1，仅仅在stream1中加入等待（event的等待）

    // step1 = add_vector
    // step2 = mul_vector
    // step3 = add_vector
    // step4 = mul_vector
    0x0. 前言 
在上一节中，我们将Toy Dialect的部分Operation Lowering到Affine Dialect，MemRef Dialect和Standard Dialect，而toy.print操作保持不变，所以又被叫作部分Lowering。通过这个Lowering可以将Toy Dialect的Operation更底层的实现逻辑表达出来，以寻求更多的优化机会，得到更好的MLIR表达式。这一节，我们将在上一节得到的混合型MLIR表达式完全Lowering到LLVM Dialect上，然后生成LLVM IR，并且我们可以使用MLIR的JIT编译引擎来运行最终的MLIR表达式并输出计算结果。 
0x1. IR下降到LLVM Dialect 
这一小节我们将来介绍如何将上一节结束的MLIR表达式完全Lowering为LLVM Dialect，我们还是回顾一下上一节最终的MLIR表达式： 
func @main() 
  %cst = arith.constant 1.000000e+00 : f64
  %cst_0 = arith.constant 2.000000e+00 : f64
  %cst_1 = arith.constant 3.000000e+00 : f64
  %cst_2 = arith.constant 4.000000e+00 : f64
  %cst_3 = arith.constant 5.000000e+00 : f64
  %cst_4 = arith.constant 6.000000e+00 : f64

  // Allocating buffers for the inputs and outputs.
  %0 = memref.alloc() : memref<3x2xf64>
  %1 = memref.alloc() : memref<2x3xf64>

  // Initialize the input buffer with the constant values.
  affine.store %cst, %1[0, 0] : memref<2x3xf64>
  affine.store %cst_0, %1[0, 1] : memref<2x3xf64>
  affine.store %cst_1, %1[0, 2] : memref<2x3xf64>
  affine.store %cst_2, %1[1, 0] : memref<2x3xf64>
  affine.store %cst_3, %1[1, 1] : memref<2x3xf64>
  affine.store %cst_4, %1[1, 2] : memref<2x3xf64>

  affine.for %arg0 = 0 to 3 
    affine.for %arg1 = 0 to 2 
      // Load the transpose value from the input buffer.
      %2 = affine.load %1[%arg1, %arg0] : memref<2x3xf64>

      // Multiply and store into the output buffer.
      %3 = arith.mulf %2, %2 : f64
      affine.store %3, %0[%arg0, %arg1] : memref<3x2xf64>
    
  

  // Print the value held by the buffer.
  toy.print %0 : memref<3x2xf64>
  memref.dealloc %1 : memref<2x3xf64>
  memref.dealloc %0 : memref<3x2xf64>
  return

 
我们要将这个三种Dialect混合的MLIR表达式完全Lowering为LLVM Dialect，注意LLVM Dialect是MLIR的一种特殊的Dialect层次的中间表示，它并不是LLVM IR。Lowering为LLVM Dialect的整体过程可以分为如下几步： 
1. Lowering toy.print Operation 
之前部分Lowering的时候并没有对toy.print操作进行Lowering，所以这里优先将toy.print进行Lowering。我们把toy.print Lowering到一个非仿射循环嵌套，它为每个元素调用printf。Dialect转换框架支持传递Lowering，不需要直接Lowering为LLVM Dialect。通过应用传递Lowering可以应用多种模式来使得操作合法化（合法化的意思在这里指的就是完全Lowering到LLVM Dialect）。 传递Lowering在这里体现为将toy.print先Lowering到循环嵌套Dialect里面，而不是直接Lowering为LLVM Dialect。 
在Lowering过程中，printf的声明在mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp中，代码如下： 
	/// Return a symbol reference to the printf function, inserting it into the
  /// module if necessary.
  static FlatSymbolRefAttr getOrInsertPrintf(PatternRewriter &rewriter,
                                             ModuleOp module) 
    auto *context = module.getContext();
    if (module.lookupSymbol<LLVM::LLVMFuncOp>("printf"))
      return SymbolRefAttr::get(context, "printf");

    // Create a function declaration for printf, the signature is:
    //   * `i32 (i8*, ...)`
    auto llvmI32Ty = IntegerType::get(context, 32);
    auto llvmI8PtrTy = LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
    auto llvmFnType = LLVM::LLVMFunctionType::get(llvmI32Ty, llvmI8PtrTy,
                                                  /*isVarArg=*/true);

    // Insert the printf function into the body of the parent module.
    PatternRewriter::InsertionGuard insertGuard(rewriter);
    rewriter.setInsertionPointToStart(module.getBody());
    rewriter.create<LLVM::LLVMFuncOp>(module.getLoc(), "printf", llvmFnType);
    return SymbolRefAttr::get(context, "printf");
  
 
这部分代码返回了printf函数的符号引用，必要时将其插入Module。在函数中，为printf创建了函数声明，然后将printf函数插入到父Module的主体中。 
2. 确定Lowering过程需要的所有组件 
第一个需要确定的是转换目标(ConversionTarget)，对于这个Lowering我们除了顶层的Module将所有的内容都Lowering为LLVM Dialect。这里代码表达的信息和官方文档有一些出入，以最新的代码为准。 
// The first thing to define is the conversion target. This will define the
// final target for this lowering. For this lowering, we are only targeting
// the LLVM dialect.
LLVMConversionTarget target(getContext());
target.addLegalOp<ModuleOp>();
 
然后需要确定类型转换器(Type Converter)，我们现存的MLIR表达式还有MemRef类型，我们需要将其转换为LLVM的类型。为了执行这个转化，我们使用TypeConverter作为Lowering的一部分。这个转换器指定一种类型如何映射到另外一种类型。由于现存的操作中已经不存在任何Toy Dialect操作，因此使用MLIR默认的转换器就可以满足需求。定义如下： 
// During this lowering, we will also be lowering the MemRef types, that are
  // currently being operated on, to a representation in LLVM. To perform this
  // conversion we use a TypeConverter as part of the lowering. This converter
  // details how one type maps to another. This is necessary now that we will be
  // doing more complicated lowerings, involving loop region arguments.
  LLVMTypeConverter typeConverter(&getContext());
 
再然后还需要确定转换模式(Conversion Patterns)。这部分代码为： 
// Now that the conversion target has been defined, we need to provide the
  // patterns used for lowering. At this point of the compilation process, we
  // have a combination of `toy`, `affine`, and `std` operations. Luckily, there
  // are already exists a set of patterns to transform `affine` and `std`
  // dialects. These patterns lowering in multiple stages, relying on transitive
  // lowerings. Transitive lowering, or A->B->C lowering, is when multiple
  // patterns must be applied to fully transform an illegal operation into a
  // set of legal ones.
  RewritePatternSet patterns(&getContext());
  populateAffineToStdConversionPatterns(patterns);
  populateLoopToStdConversionPatterns(patterns);
  populateMemRefToLLVMConversionPatterns(typeConverter, patterns);
  populateStdToLLVMConversionPatterns(typeConverter, patterns);

  // The only remaining operation to lower from the `toy` dialect, is the
  // PrintOp.
  patterns.add<PrintOpLowering>(&getContext());
 
上面这段代码展示了为Affine Dialect，Standard Dialect以及遗留的toy.print定义匹配重写规则。首先将Affine Dialect下降到Standard Dialect，即populateAffineToStdConversionPatterns。然后将Loop(针对的是toy.print操作，它已经Lowering到了循环嵌套Dialect)下降到Standard Dialect，即populateLoopToStdConversionPatterns。最后，将Standard Dialect转换到LLVM Dialect，即populateMemRefToLLVMConversionPatterns。以及不要忘了把toy.print的Lowering模式PrintOpLowering加到patterns里面。 
3. 完全Lowering 
定义了Lowering过程需要的所有组件之后，就可以执行完全Lowering了。使用applyFullConversion(module, target, std::move(patterns))) 函数可以保证转换的结果只存在合法的操作，上一篇部分Lowering的笔记调用的是mlir::applyPartialConversion(function, target, patterns)可以对比着看一下。 
// We want to completely lower to LLVM, so we use a `FullConversion`. This
  // ensures that only legal operations will remain after the conversion.
  auto module = getOperation();
  if (failed(applyFullConversion(module, target, std::move(patterns))))
    signalPassFailure();
 
4. 将上面定义好的完全Lowering的Pass加到Pipline中 
这段代码在mlir/examples/toy/Ch6/toyc.cpp中： 
if (isLoweringToLLVM) 
  // Finish lowering the toy IR to the LLVM dialect.
  pm.addPass(mlir::toy::createLowerToLLVMPass());
 
 
这段代码在优化Pipline中添加了mlir::toy::createLowerToLLVMPass()这个完全Lowering的Pass，可以把MLIR 表达式下降为LLVM Dialect表达式。我们运行一下示例程序看下结果： 
执行下面的命令： 
cd llvm-project/build/bin
./toyc-ch6 ../../mlir/test/Examples/Toy/Ch6/llvm-lowering.mlir -emit=mlir-llvm
 
即获得了完全Lowering之后的MLIR表达式，结果比较长，这里只展示一部分。可以看到目前MLIR表达式已经完全在LLVM Dialect空间下了。 
llvm.func @free(!llvm<"i8*">)
llvm.func @printf(!llvm<"i8*">, ...) -> i32
llvm.func @malloc(i64) -> !llvm<"i8*">
llvm.func @main() 
  %0 = llvm.mlir.constant(1.000000e+00 : f64) : f64
  %1 = llvm.mlir.constant(2.000000e+00 : f64) : f64

  ...

^bb16:
  %221 = llvm.extractvalue %25[0 : index] : !llvm<" double*, i64, [2 x i64], [2 x i64] ">
  %222 = llvm.mlir.constant(0 : index) : i64
  %223 = llvm.mlir.constant(2 : index) : i64
  %224 = llvm.mul %214, %223 : i64
  %225 = llvm.add %222, %224 : i64
  %226 = llvm.mlir.constant(1 : index) : i64
  %227 = llvm.mul %219, %226 : i64
  %228 = llvm.add %225, %227 : i64
  %229 = llvm.getelementptr %221[%228] : (!llvm."double*">, i64) -> !llvm<"f64*">
  %230 = llvm.load %229 : !llvm<"double*">
  %231 = llvm.call @printf(%207, %230) : (!llvm<"i8*">, f64) -> i32
  %232 = llvm.add %219, %218 : i64
  llvm.br ^bb15(%232 : i64)

  ...

^bb18:
  %235 = llvm.extractvalue %65[0 : index] : !llvm<" double*, i64, [2 x i64], [2 x i64] ">
  %236 = llvm.bitcast %235 : !llvm<"double*"> to !llvm<"i8*">
  llvm.call @free(%236) : (!llvm<"i8*">) -> ()
  %237 = llvm.extractvalue %45[0 : index] : !llvm<" double*, i64, [2 x i64], [2 x i64] ">
  %238 = llvm.bitcast %237 : !llvm<"double*"> to !llvm<"i8*">
  llvm.call @free(%238) : (!llvm<"i8*">) -> ()
  %239 = llvm.extractvalue %25[0 : index] : !llvm<" double*, i64, [2 x i64], [2 x i64] ">
  %240 = llvm.bitcast %239 : !llvm<"double*"> to !llvm<"i8*">
  llvm.call @free(%240) : (!llvm<"i8*">) -> ()
  llvm.return

 
0x2. 代码生成以及Jit执行 
我们可以使用JIT编译引擎来运行上面得到的LLVM Dialect IR，获得推理结果。这里我们使用了mlir::ExecutionEngine基础架构来运行LLVM Dialect IR。程序位于：mlir/examples/toy/Ch6/toyc.cpp。 
int runJit(mlir::ModuleOp module) 
  // Initialize LLVM targets.
  llvm::InitializeNativeTarget();
  llvm::InitializeNativeTargetAsmPrinter();

  // Register the translation from MLIR to LLVM IR, which must happen before we
  // can JIT-compile.
  mlir::registerLLVMDialectTranslation(*module->getContext());

  // An optimization pipeline to use within the execution engine.
  auto optPipeline = mlir::makeOptimizingTransformer(
      /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0,
      /*targetMachine=*/nullptr);

  // Create an MLIR execution engine. The execution engine eagerly JIT-compiles
  // the module.
  auto maybeEngine = mlir::ExecutionEngine::create(
      module, /*llvmModuleBuilder=*/nullptr, optPipeline);
  assert(maybeEngine && "failed to construct an execution engine");
  auto &engine = maybeEngine.get();

  // Invoke the JIT-compiled function.
  auto invocationResult = engine->invokePacked("main");
  if (invocationResult) 
    llvm::errs() << "JIT invocation failed\\n";
    return -1;
  

  return 0;

 
 
 这里尤其需要注意这行：mlir::registerLLVMDialectTranslation(*module->getContext());。从代码的注释来看这个是将LLVM Dialect表达式翻译成LLVM IR，在JIT编译的时候起到缓存作用，也就是说下次执行的时候不会重复执行上面的各种MLIR表达式变换。 
 
这里创建一个MLIR执行引擎mlir::ExecutionEngine来运行表达式中的main函数。可以使用下面的命令来输出最终的计算结果： 
cd llvm-project/build/bin
./toyc-ch6 ../../mlir/test/Examples/Toy/Ch6/codegen.toy -emit=jit -opt
 
结果为： 
1.000000 16.000000 
4.000000 25.000000 
9.000000 36.000000
 
到这里，我们就将原始的MLIR表达式经过一系列Pass进行优化，以及部分Lowering到三种Dialect混合的表达式，和完全Lowering为LLVM Dialect表达式，最后翻译到LLVM IR使用MLIR的Jit执行引擎进行执行，获得了最终结果。 
另外，mlir/examples/toy/Ch6/toyc.cpp中还提供了一个dumpLLVMIR函数，可以将MLIR表达式翻译成LLVM IR表达式。然后再经过LLVM IR的优化处理。使用如下命令可以打印出生成的LLVM IR： 
$ cd llvm-project/build/bin
$ ./toyc-ch6 ../../mlir/test/Examples/Toy/Ch6/codegen.toy -emit=llvm -opt
 
0x3. 总结 
这篇文章介绍了如何将部分Lowering之后的MLIR表达式进一步完全Lowering到LLVM Dialect上，然后通过JIT编译引擎来执行代码并获得推理结果，另外还可以输出LLVM Dialect生成的LLVM IR。
以上是关于深度学习部署笔记(十五): CUDA_Run_Time_API_parallel_多流并行，以及多流之间互相同步等待的操作方式的主要内容，如果未能解决你的问题，请参考以下文章