Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 3
Posted cuancuancuanhao
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 3相关的知识,希望对你有一定的参考价值。
▶ 第二章,几个简单的程序
● 代码,单线程
/*
 * Single-threaded FLOPS micro-benchmark.
 *
 * Repeatedly applies fa[k] = a * fa[k] + fb[k] to the first LOOP_COUNT
 * elements of two global float arrays, MAXFLOP_ITER times, and prints the
 * total GFlop count, the elapsed wall-clock time and the resulting rate.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define SIZE            (1024*1024)
#define MAXFLOP_ITER    100000000
#define LOOP_COUNT      128
#define FLOP_PER_CALC   2       /* one multiply + one add per element update */

/* Work arrays, aligned to 64 bytes ("aligned" is the spelling GCC honours). */
float fa[SIZE] __attribute__((aligned(64)));
float fb[SIZE] __attribute__((aligned(64)));

/* Returns the current wall-clock time in seconds, built from gettimeofday(). */
double dtime(void)
{
    struct timeval mytime;

    gettimeofday(&mytime, NULL);
    return (double)(mytime.tv_sec + mytime.tv_usec * 1.0e-6);
}

int main(int argc, char *argv[])
{
    const float a = 1.1;

    printf("Initializing\n");
    for (int i = 0; i < SIZE; i++)
    {
        fa[i] = (float)i + 0.1;
        fb[i] = (float)i + 0.2;
    }

    printf("Starting Compute\n");
    double time_b, time_e;
    time_b = dtime();
    for (int j = 0; j < MAXFLOP_ITER; j++)
    {
        /* Only the first LOOP_COUNT elements are touched in the timed loop. */
        for (int k = 0; k < LOOP_COUNT; k++)
            fa[k] = a * fa[k] + fb[k];
    }
    time_e = dtime();

    /* The leading double constant keeps the whole product in floating point. */
    double gflops = 1.0e-9 * LOOP_COUNT * MAXFLOP_ITER * FLOP_PER_CALC;
    printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\n",
           gflops, time_e - time_b, gflops / (time_e - time_b));

    return 0;
}
● 输出结果
GFlops = 25.600, Secs = 1.464, GFlops per sec = 17.484
● 单核心两线程的 OpenMP
1 int main(int argc, char *argv[]) 2 { 3 const float a = 1.1; 4 int i, j, k, numthreads; // 循环变量放到外边来 5 6 omp_set_num_threads(2); // 运行时设置 OpenMP 参数 7 kmp_set_defaults("KMP_AFFINITY=compact"); 8 9 #pragma omp parallel 10 #pragma omp master 11 numthreads = omp_get_num_threads(); 12 13 printf("Initializing "); 14 #pragma omp parallel for 15 for (i = 0; i < SIZE; i++) 16 { 17 fa[i] = (float)i + 0.1; 18 fb[i] = (float)i + 0.2; 19 } 20 printf("Starting Compute on %d threads ", numthreads); 21 double time_b, time_e; 22 time_b = dtime(); 23 #pragma omp parallel for private(j, k) 24 for (i = 0; i < numthreads; i++) 25 { 26 int offset = i * LOOP_COUNT; 27 for (j = 0; j < MAXFLOP_ITER; j++) 28 { 29 for (k = 0; k < LOOP_COUNT; k++) 30 fa[k + offset] = a * fa[k + offset] + fb[k + offset]; 31 } 32 } 33 time_e = dtime(); 34 35 double gflops = 1.0e-9 * numthreads * LOOP_COUNT * MAXFLOP_ITER * FLOP_PER_CALC; 36 printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf ", gflops, time_e - time_b, gflops / (time_e - time_b)); 37 38 return 0; 39 }
● 输出结果
GFlops = 51.200, Secs = 1.464, GFlops per sec = 34.968
● 线程数、线程亲缘性调整
1 // 替换 2 omp_set_num_threads(2); 3 kmp_set_defaults("KMP_AFFINITY=compact"); 4 // 替换为 5 omp_set_num_threads(112); 6 kmp_set_defaults("KMP_AFFINITY=scatter");
● 输出结果
GFlops = 2867.200, Secs = 1.619, GFlops per sec = 1771.298
● 代码,带宽测试
/*
 * Memory-bandwidth micro-benchmark.
 *
 * Copies the global array fb into fa MAXFLOP_ITER times with an OpenMP
 * parallel loop and prints the number of gigabytes moved, the elapsed
 * wall-clock time and the resulting rate.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <omp.h>

#define REAL            double
#define SIZE            (1000*1000*64)
#define MAXFLOP_ITER    1000
#define FLOP_PER_CALC   2       /* used below as the per-element access count (one read + one write) */

/* Work arrays, aligned to 64 bytes ("aligned" is the spelling GCC honours). */
REAL fa[SIZE] __attribute__((aligned(64)));
REAL fb[SIZE] __attribute__((aligned(64)));
REAL fc[SIZE] __attribute__((aligned(64)));    /* not referenced anywhere in this listing */

/* Returns the current wall-clock time in seconds, built from gettimeofday(). */
double dtime(void)
{
    struct timeval mytime;

    gettimeofday(&mytime, NULL);
    return (double)(mytime.tv_sec + mytime.tv_usec * 1.0e-6);
}

int main(int argc, char *argv[])
{
    const REAL a = 1.1;         /* unused in this listing */
    int i, j;

    omp_set_num_threads(112);
    kmp_set_defaults("KMP_AFFINITY=scatter");   /* Intel OpenMP runtime setting */

    printf("Initializing\n");
#pragma omp parallel for
    for (i = 0; i < SIZE; i++)
    {
        fa[i] = (REAL)i + 0.1;
        fb[i] = (REAL)i + 0.2;
    }

    /* Only the master thread of the team reports the thread count. */
#pragma omp parallel
#pragma omp master
    printf("Starting BW Test on %d threads\n", omp_get_num_threads());
    double time_b, time_e;
    time_b = dtime();
    for (i = 0; i < MAXFLOP_ITER; i++)
    {
        /* A new parallel loop is started for every pass over the arrays. */
#pragma omp parallel for
        for (j = 0; j < SIZE; j++)
            fa[j] = fb[j];
    }
    time_e = dtime();
    /* The leading double constant keeps the whole product in floating point. */
    double gbytes = 1.0e-9 * MAXFLOP_ITER * SIZE * FLOP_PER_CALC * sizeof(REAL);
    printf("Gbytes = %10.3lf, Secs = %10.3lf, GBytes per sec = %10.3lf\n",
           gbytes, time_e - time_b, gbytes / (time_e - time_b));

    return 0;
}
● 输出结果
Starting BW Test on 112 threads Gbytes = 1024.000, Secs = 10.293, GBytes per sec = 99.488
● 代码,offload 模式
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 #include <sys/time.h> 5 #include <omp.h> 6 7 #define SIZE (1024*512) 8 #define MAXFLOP_ITER 100000000 9 #define LOOP_COUNT 128 10 #define FLOP_PER_CALC 2 11 12 __declspec (target(mic)) float fa[SIZE] __attribute__((align(64))); // 声明 mic 上的存储类型 13 __declspec (target(mic)) float fb[SIZE] __attribute__((align(64))); 14 15 double dtime() 16 { 17 struct timeval mytime; 18 gettimeofday(&mytime, (struct timezone*)0); 19 return (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6); 20 } 21 22 int main(int argc, char *argv[]) 23 { 24 const float a = 1.1; 25 int i, j, k, numthreads; 26 27 omp_set_num_threads(112); 28 kmp_set_defaults("KMP_AFFINITY=scatter"); 29 #pragma offload target (mic)// 声明需要使用 mic 的 offload 模式 30 #pragma omp parallel 31 #pragma omp master 32 numthreads = omp_get_num_threads(); 33 34 printf("Initializing "); 35 #pragma omp parallel for 36 for (i = 0; i<SIZE; i++) 37 { 38 fa[i] = (float)i + 0.1; 39 fb[i] = (float)i + 0.2; 40 } 41 printf("Starting Compute on %d threads ", numthreads); 42 double time_b, time_e; 43 time_b = dtime(); 44 #pragma offload target (mic)// 声明需要使用 mic 的 offload 模式 45 #pragma omp parallel for private(j, k) 46 for (i = 0; i<numthreads; i++) 47 { 48 int offset = i * LOOP_COUNT; 49 for (j = 0; j < MAXFLOP_ITER; j++) 50 { 51 #pragma vector aligned// 强制向量对齐 52 for (k = 0; k < LOOP_COUNT; k++) 53 fa[k + offset] = a * fa[k + offset] + fb[k + offset]; 54 } 55 } 56 time_e = dtime(); 57 58 double gflops = 1.0e-9 * numthreads * LOOP_COUNT * MAXFLOP_ITER * FLOP_PER_CALC; 59 printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf ", gflops, time_e - time_b, gflops / (time_e - time_b)); 60 61 return 0; 62 }
● 编译(链接)时找不到动态库,调整 /etc/ld.so.conf 解决了其中一个(liboffload.so.5),但是剩下的几个无论怎么添加修改都没法解决,报错信息如下,留个坑。注意所有相关的动态库在 /usr/local/intel/composer_xe_2015.0.090/compiler/lib/mic 和 /usr/local/intel/composer_xe_2015.0.090/compiler/lib/intel64 下各有一份,文件名相同但两者互不兼容,往外搬的时候要注意需要的是哪个版本。
x86_64-k1om-linux-ld: warning: libimf.so, needed by /usr/local/intel/composer_xe_2015.0.090/compiler/lib/mic/liboffload.so.5, not found (try using -rpath or -rpath-link)
x86_64-k1om-linux-ld: warning: libsvml.so, needed by /usr/local/intel/composer_xe_2015.0.090/compiler/lib/mic/liboffload.so.5, not found (try using -rpath or -rpath-link)
x86_64-k1om-linux-ld: warning: libirng.so, needed by /usr/local/intel/composer_xe_2015.0.090/compiler/lib/mic/liboffload.so.5, not found (try using -rpath or -rpath-link)
x86_64-k1om-linux-ld: warning: libintlc.so.5, needed by /usr/local/intel/composer_xe_2015.0.090/compiler/lib/mic/liboffload.so.5, not found (try using -rpath or -rpath-link)
以上是关于Xeon Phi 《协处理器高性能编程指南》随书代码整理 part 3的主要内容,如果未能解决你的问题,请参考以下文章
如果在 Xeon Phi 上编译时不知道循环计数,则性能下降
如何检测 Xeon Phi (Knights Landing)
我可以在 Xeon Phi (Knights Landing) 处理器上编译 Go 程序吗?