使用 PBLAS 算法进行矩阵计算 - 使用 MPI 的多线程
Posted
技术标签:
【中文标题】使用 PBLAS 算法进行矩阵计算 - 使用 MPI 的多线程【英文标题】:Matrix calculation with PBLAS algorithm - multi threads with MPI 【发布时间】:2014-01-08 03:01:24 【问题描述】:我目前正在使用 mpi 开发用于矩阵乘法的 C 代码。我已经将函数实现为在其他文件中定义的 mult 或 multadd,运行良好。
但是我的文件pblas.c
可以编译,但运行时崩溃。
我在安装了 MPI
的大学服务器上运行我的项目。
我的pblas
代码哪里错了?
/**********************************************************************
This file is just a pattern for pblas parallel multiplication
There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "commfct.h"
#include "toolsfct.h"
/* Print command-line usage on stderr and terminate with a failure status.
   (Braces restored: the web scrape had stripped them.) */
void usage()
{
  fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
  exit(1);
}
int main(int argc, char **argv)
int me,nbProc;
int ligMe,colMe;
int blockSize;
int i,j;
double t;
if (argc != 2)
usage();
blockSize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
int P = (int)sqrt(nbProc); // P = the number of rows of proc.
int Q = P; // Q = the number of columns of proc.
if ((P*Q) != nbProc)
if (me == 0)
fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
exit(1);
createGridComm(me,P,Q);
ligMe = me / Q;
colMe = me % Q;
// allocate memory for matrices
double *A,*Btmp, *B,*C,*CC;
A = (double *)malloc(blockSize*blockSize*sizeof(double));
B = (double *)malloc(blockSize*blockSize*sizeof(double));
Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
C = (double *)malloc(blockSize*blockSize*sizeof(double));
CC = (double *)malloc(blockSize*blockSize*sizeof(double));
/* fill blocks with pseudo values
NOTE : these values should not be changed so that
the check below is valid
*/
for(i=0;i<blockSize*blockSize;i++)
A[i] = 2.0+(double)me;
B[i] = 1.0+(double)colMe;
C[i] = (double)me / 10.0;
/* CAUTION : in the following, A,B C are supposed to be stored
column after column, with each column of size blockSize.
Thus A(0,0) and A(1,0) are contiguous in memory, but
A(0,0) and A(0,1) are separated by blockSize cells.
*/
t = dclock(CLOCK_S);
MPI_Status status;
//main Loop
for(i=0;i<P;i++)
/*************************************
Etape 1 et 2: Transposition column i (in step i) of B-blocks . stock in Btmp.
**************************************/
if(colMe==i)
if(ligMe==colMe)
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,ligMe,commCol);
multadd(A,B,C,blockSize);
else
int dest = colMe * Q + ligMe;
MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,dest%Q,commCol);
mult(A,Btmp,CC,blockSize);
else
int dest = colMe*Q + ligMe;
if(dest%Q == i)
MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
// Broadcast on the column
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,colMe,commCol);
multadd(A,Btmp,C,blockSize);
else
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
mult(A,Btmp,CC,blockSize);
if(colMe == i)
MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
else
MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
t = dclock(CLOCK_S) -t;
printf("timing for %d : %f sec\n",me,t);
// checking for result correctness
int correct = 1;
double sum = 0.0;
for(i=0;i<P;i++)
sum += 2.0+(ligMe*Q)+(double)i;
for(i=0;i<blockSize;i++)
for(j=0;j<blockSize;j++)
if (C[i+j*blockSize] != ((double)me/10.0 + sum*blockSize*(colMe+1.0)))
correct = 0;
if (correct != 1)
printf("multiplication result is not correct\n");
// free memory
free(A);
free(B);
free(C);
free(CC);
releaseGridComm();
MPI_Finalize();
return 0;
【问题讨论】:
你知道它崩溃在哪一行吗? 更多关于您尝试过的指导。确切的错误是什么以及它发生的位置等是需要的。如果你不让它变得更容易,在这里仅仅调试其他人的代码并不是很有趣。 【参考方案1】:问题可能来自 MPI_Send()、MPI_Recv() 或 MPI_Bcast() 中的索引。例如,在 MPI_Send() 中,目的地应调用 MPI_Recv()。在 MPI_Recv() 中,源端应该调用 MPI_Send()。否则,您将陷入僵局。代码继续运行,但它等待消息。
从你给我们的来看,我猜你有两个矩阵 A 和 B。
A0 | A1
...........
A2 | A3
要计算 C 的第一列,您需要:
A0xB0 | A1xB2
.......
A2xB0 | A3xB2
我更改了您的代码,以便:
对于每一列 i
B的i_th列转置到Btmp的i_th行
第i_行Btmp 广播到每列的Btmp。
由于问题是关于 MPI 的,我回复了一个 MPI 代码,它看起来非常接近您的代码,可以正常工作,但除了通信操作之外,它什么也没做...
/**********************************************************************
This file is just a pattern for pblas parallel multiplication
There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "mpi.h"
//#include "commfct.h"
//#include "toolsfct.h"
#define TAG_TRANSPOSE 42
/* Print command-line usage on stderr and terminate with a failure status.
   (Braces restored: the web scrape had stripped them.) */
void usage()
{
  fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
  exit(1);
}
int main(int argc, char **argv)
int me,nbProc;
int ligMe,colMe;
int blockSize;
int i,j;
double t;
if (argc != 2)
usage();
blockSize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
int P = (int)sqrt(nbProc); // P = the number of rows of proc.
int Q = P; // Q = the number of columns of proc.
if ((P*Q) != nbProc)
if (me == 0)
fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
exit(1);
//createGridComm(me,P,Q);
colMe = me / Q;
ligMe = me % Q;
MPI_Comm commCol, commLine;
//comes from http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
/* Split comm into row and column comms */
MPI_Comm_split(MPI_COMM_WORLD, ligMe, colMe, &commLine);
/* color by row, rank by column */
MPI_Comm_split(MPI_COMM_WORLD, colMe, ligMe, &commCol);
/* color by column, rank by row */
printf("[%d]:My coordinates are i j (%d,%d)\n",me,ligMe,colMe);
// allocate memory for matrices
double *A,*Btmp, *B,*C,*CC;
A = (double *)malloc(blockSize*blockSize*sizeof(double));
B = (double *)malloc(blockSize*blockSize*sizeof(double));
Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
C = (double *)malloc(blockSize*blockSize*sizeof(double));
CC = (double *)malloc(blockSize*blockSize*sizeof(double));
/* fill blocks with pseudo values
NOTE : these values should not be changed so that
the check below is valid
*/
for(i=0;i<blockSize*blockSize;i++)
A[i] = 2.0+(double)me;
B[i] = 1.0+(double)colMe;
C[i] = (double)me / 10.0;
/* CAUTION : in the following, A,B C are supposed to be stored
column after column, with each column of size blockSize.
Thus A(0,0) and A(1,0) are contiguous in memory, but
A(0,0) and A(0,1) are separated by blockSize cells.
*/
// t = dclock(CLOCK_S);
MPI_Status status;
//main Loop
for(i=0;i<Q;i++)
/*************************************
Etape 1 et 2: Transposition column i (in step i) of B-blocks . stock in Btmp.
**************************************/
if(colMe==i)
if(ligMe==colMe)
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// multadd(A,B,C,blockSize);
else
int dest = ligMe * Q + i;//transpose !
MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// mult(A,Btmp,CC,blockSize);
else
int from = i*Q + colMe;// transpose !
if(ligMe == i)
MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,from,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
// Broadcast on the column
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// multadd(A,Btmp,C,blockSize);
else
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// mult(A,Btmp,CC,blockSize);
if(colMe == i)
MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
else
MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
//t = dclock(CLOCK_S) -t;
printf("timing for %d : %f sec\n",me,t);
// checking for result correctness
int correct = 1;
double sum = 0.0;
for(i=0;i<P;i++)
sum += 2.0+(ligMe*Q)+(double)i;
for(i=0;i<blockSize;i++)
for(j=0;j<blockSize;j++)
if (C[i+j*blockSize] <0.99999*((double)me/10.0 + sum*blockSize*(colMe+1.0)) || C[i+j*blockSize] >1.00001*((double)me/10.0 + sum*blockSize*(colMe+1.0)) )
correct = 0;
if (correct != 1)
printf("multiplication result is not correct\n");
// free memory
free(A);
free(B);
free(C);
free(CC);
//releaseGridComm();
MPI_Finalize();
return 0;
注意 createGridComm(me,P,Q);我必须在http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html找到等效的东西@
我还更改了代码末尾的测试。测试双精度数之间的相等性将过于严格。不等式更适合浮点数!
我希望这会有所帮助! 再见,
弗朗西斯
【讨论】:
以上是关于使用 PBLAS 算法进行矩阵计算 - 使用 MPI 的多线程的主要内容,如果未能解决你的问题,请参考以下文章
sklearn使用投票器VotingClassifier算法构建多模型融合的硬投票器分类器(hard voting)并计算融合模型的混淆矩阵可视化混淆矩阵(confusion matrix)