Why does the OMP task version run slower than OMP for?

Posted: 2018-01-06 17:16:10

【Question】:

I am new to OpenMP and want to use it to solve the wave equation. The serial version is here:

#include <time.h>
#include <stdio.h>
#include <stdlib.h>   /* for malloc */
#include <omp.h>
#include <math.h>

#define GRID_SZ 3000
#define ARR_SZ (GRID_SZ * GRID_SZ)
#define PEAK_SZ 31

void sequential_update_withoutomp(double *data, double *olddata, double *newdata, double C, double K, double dt);

double *process_withoutomp() {
    double start = omp_get_wtime();
    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0/(PEAK_SZ-1.0);
    data = (double*)malloc(sizeof(double)*ARR_SZ);
    olddata = (double*)malloc(sizeof(double)*ARR_SZ);
    newdata = (double*)malloc(sizeof(double)*ARR_SZ);

    /* flat initial field */
    for(i = 0; i < ARR_SZ; i++)
        data[i] = 1.0;

    for(i = 0; i < PEAK_SZ; i++)
        linspace[i] = -1.0 + delta * i;

    for(i = 0; i < PEAK_SZ; i++)
        for(j = 0; j < PEAK_SZ; j++)
            x[i][j] = linspace[i];

    /* add a Gaussian peak near one corner of the grid */
    for(i = 0; i < PEAK_SZ; i++)
        for(j = 0; j < PEAK_SZ; j++)
            data[(i+20)*GRID_SZ+j+20] += h * exp( -5 * (pow(x[i][j], 2) + pow(x[j][i], 2)));

    for(i = 0; i < ARR_SZ; i++)
        olddata[i] = data[i];

    /* 20 time steps, cycling the three buffers */
    for(i = 0; i < 20; i++) {
        sequential_update_withoutomp( data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }

    double end = omp_get_wtime();
    printf("without omp spend: %f\n",end-start);
    return data;
}

void sequential_update_withoutomp(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    int i, j, add_i, sub_i, add_j, sub_j;
    double pot;
    for( i = 0; i < GRID_SZ; i++) {
        for( j = 0; j < GRID_SZ; j++) {
            /* clamp neighbour indices at the grid boundary */
            add_i = i+1 >= GRID_SZ ? i : i+1;
            add_j = j+1 >= GRID_SZ ? j : j+1;
            sub_i = i-1 < 0 ? 0 : i-1;
            sub_j = j-1 < 0 ? 0 : j-1;
            pot = data[add_i*GRID_SZ+j]+
                  data[sub_i*GRID_SZ+j]+
                  data[add_j+i*GRID_SZ]+
                  data[sub_j+i*GRID_SZ]-
                  4*data[i*GRID_SZ+j];
            newdata[i * GRID_SZ + j] =
                ( pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] * (2 - K * dt) ) / (2 + K * dt);
        }
    }
}

Here is the version using parallel for:

void sequential_update_withomp(double *data, double *olddata, double *newdata, double C, double K, double dt);

double *process_withomp() {
    double start = omp_get_wtime();

    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0/(PEAK_SZ-1.0);
    data = (double*)malloc(sizeof(double)*ARR_SZ);
    olddata = (double*)malloc(sizeof(double)*ARR_SZ);
    newdata = (double*)malloc(sizeof(double)*ARR_SZ);

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++)
        data[i] = 1.0;

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++) {
        linspace[i] = -1.0 + delta * i;
        for(j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++)
        for(j = 0; j < PEAK_SZ; j++)
            data[(i+20)*GRID_SZ+j+20] += h * exp( -5 * (pow(x[i][j], 2) + pow(x[j][i], 2)));

    #pragma omp barrier

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++)
        olddata[i] = data[i];

    #pragma omp barrier

    for(i = 0; i < 20; i++) {
        sequential_update_withomp( data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }

    double end = omp_get_wtime();
    printf("with omp spend: %f\n",end-start);
    return data;
}

void sequential_update_withomp(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    int i, j;
    double pot;
    #pragma omp parallel for private(i,j,pot) schedule(auto)
    for( i = 0; i < GRID_SZ; i++) {
        for( j = 0; j < GRID_SZ; j++) {
            pot = data[(i+1 >= GRID_SZ ? i : i+1)*GRID_SZ+j]+
                data[(i-1 < 0 ? 0 : i-1)*GRID_SZ+j]+
                data[(j+1 >= GRID_SZ ? j : j+1)+i*GRID_SZ]+
                data[(j-1 < 0 ? 0 : j-1)+i*GRID_SZ]
                -4*data[i*GRID_SZ+j];
            newdata[i * GRID_SZ + j] =
                (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j]
                * (2 - K * dt))
                / (2 + K * dt);
        }
    }
}

This version runs fine, but when I try to replace the parallel for with tasks, the results are correct, yet it takes far more time:

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt);

double *process_withomp1() {
    double start = omp_get_wtime();

    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0/(PEAK_SZ-1.0);
    data = (double*)malloc(sizeof(double)*ARR_SZ);
    olddata = (double*)malloc(sizeof(double)*ARR_SZ);
    newdata = (double*)malloc(sizeof(double)*ARR_SZ);

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++)
        data[i] = 1.0;

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++) {
        linspace[i] = -1.0 + delta * i;
        for(j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i,j) schedule(auto)
    for(i = 0; i < PEAK_SZ; i++)
        for(j = 0; j < PEAK_SZ; j++)
            data[(i+20)*GRID_SZ+j+20] += h * exp( -5 * (pow(x[i][j], 2) + pow(x[j][i], 2)));

    #pragma omp barrier

    #pragma omp parallel for private(i) schedule(auto)
    for(i = 0; i < ARR_SZ; i++)
        olddata[i] = data[i];

    #pragma omp barrier

    for(i = 0; i < 20; i++) {
        sequential_update_withomp1( data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }

    double end = omp_get_wtime();
    printf("with omp spend: %f\n",end-start);
    return data;
}

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    int i, j;
    double pot;
    #pragma omp parallel private(i,j,pot)
    for( i = 0; i < GRID_SZ; i++) {
        for( j = 0; j < GRID_SZ; j++) {
            #pragma omp task
            {
                pot = data[(i+1 >= GRID_SZ ? i : i+1)*GRID_SZ+j]+
                    data[(i-1 < 0 ? 0 : i-1)*GRID_SZ+j]+
                    data[(j+1 >= GRID_SZ ? j : j+1)+i*GRID_SZ]+
                    data[(j-1 < 0 ? 0 : j-1)+i*GRID_SZ]
                    -4*data[i*GRID_SZ+j];
                newdata[i * GRID_SZ + j] =
                    (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j]
                    * (2 - K * dt))
                    / (2 + K * dt);
            }
        }
    }
}

On my Mac, the serial version takes about 7.7 s and the parallel for version 3.7 s, but the task version takes 53 s.

Does anyone know what is going wrong here?

Thanks in advance.


【Answer 1】:

There are two things to consider here:

a) thread granularity, i.e. the amount of work per thread
b) the way the tasks are created

In your code, a) is too small and b) is broken.

a) In your task example, a single iteration of the inner loop is one task, whereas in the parallel for example the n iterations of the outer loop are split among the threads, i.e. each thread works on a large chunk of outer-loop iterations. With schedule(static, 1), one outer iteration would be the per-thread work size. Keep in mind that all parallelism adds overhead for synchronization, bookkeeping and the like, and that added cost must be paid for by faster parallel execution. Finding the right amount of work is crucial: you want enough chunks of work to keep everything busy, perhaps a few more to give the scheduler room to compensate load imbalance between tasks/chunks, but as few as possible to keep the overhead small. The sketch below makes this trade-off tangible.
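A minimal sketch of my own (not code from the question or the original answer): it factors one row of the stencil into a helper and lets the schedule clause decide how many rows a thread grabs at a time. The helper name update_row and the chunk size of 64 are assumptions to be tuned, not anything from the post.

#include <math.h>
#include <omp.h>

#define GRID_SZ 3000   /* as in the question */

/* Hypothetical helper: one row of the stencil update. */
static void update_row(int i, const double *data, const double *olddata,
                       double *newdata, double C, double K, double dt)
{
    for (int j = 0; j < GRID_SZ; j++) {
        double pot = data[(i+1 >= GRID_SZ ? i : i+1)*GRID_SZ + j]
                   + data[(i-1 < 0 ? 0 : i-1)*GRID_SZ + j]
                   + data[i*GRID_SZ + (j+1 >= GRID_SZ ? j : j+1)]
                   + data[i*GRID_SZ + (j-1 < 0 ? 0 : j-1)]
                   - 4*data[i*GRID_SZ + j];
        newdata[i*GRID_SZ + j] =
            (pow(C*dt, 2)*pot*2 + 4*data[i*GRID_SZ + j]
             - olddata[i*GRID_SZ + j]*(2 - K*dt)) / (2 + K*dt);
    }
}

/* Each thread takes 64 rows at a time, so scheduling bookkeeping is paid
   once per 64*GRID_SZ updates instead of once per single update. */
void update_chunked(double *data, double *olddata, double *newdata,
                    double C, double K, double dt)
{
    #pragma omp parallel for schedule(static, 64)
    for (int i = 0; i < GRID_SZ; i++)
        update_row(i, data, olddata, newdata, C, K, dt);
}

With schedule(static, 1) the same code reproduces the one-row-per-chunk case described above, which makes the chunk size easy to experiment with.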

b) Running the loop inside a plain parallel region means that every thread executes the whole loop nest and creates all of the tasks, over and over. It is like running the serial task-creation code several times in parallel.
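The usual idiom to avoid this is to let only one thread of the team run the creation loop, inside omp single, while the whole team executes the tasks. A sketch under the same assumptions as above (it reuses the hypothetical update_row helper); the fixed code right below achieves the same effect with a worksharing omp for instead:

/* Reuses the hypothetical update_row() from the sketch above. */
void update_tasks(double *data, double *olddata, double *newdata,
                  double C, double K, double dt)
{
    #pragma omp parallel
    #pragma omp single          /* one thread runs the loop and spawns the tasks */
    for (int i = 0; i < GRID_SZ; i++) {
        #pragma omp task firstprivate(i)   /* one coarse task per row */
        update_row(i, data, olddata, newdata, C, K, dt);
    }
    /* the implicit barriers at the end of single and parallel
       guarantee that all tasks have finished on return */
}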

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    // ....
    #pragma omp parallel private(i,j,pot)
    {
        // split loop among threads of parallel region
        // i.e. create tasks in parallel
        #pragma omp for
        for( i = 0; i < GRID_SZ; i++) {
            // coarse grained tasks (as in parallel for version)
            #pragma omp task
            {
                // each inner for loop is one task
                for( j = 0; j < GRID_SZ; j++) {
                    // ...
                }
            } // task
        } // parallel for
    } // parallel region
}

This gives me (2 cores x 2 hyperthreads):

serial:        4.839213
parallel for:  2.529813
task:          2.817615

Note: it does not actually make sense to use tasks here, as they only add overhead on top of a parallel for loop.
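For completeness: if tasks are wanted for such a loop anyway, OpenMP 4.5 added the taskloop construct, which chops a loop into tasks with a controllable grain size and avoids the hand-written creation loop. A sketch, assuming an OpenMP 4.5 compiler and again reusing the hypothetical update_row helper from the first sketch; the grainsize of 64 is an assumption to tune:

void update_taskloop(double *data, double *olddata, double *newdata,
                     double C, double K, double dt)
{
    #pragma omp parallel
    #pragma omp single
    #pragma omp taskloop grainsize(64)   /* roughly 64 rows per task */
    for (int i = 0; i < GRID_SZ; i++)
        update_row(i, data, olddata, newdata, C, K, dt);
}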

【Comments】:

Thanks for your help. I have fixed my code here: github.com/langker/Wave-Equation-2D/blob/master/unit_test.c But in my latest code the task version is still slower than the for version... On my Mac (2.7 GHz, Intel, Core i5), with GRID_SZ=3000 the task version is 0.3 s slower than the for version.
