使用 openmp 优化 N-queen

Posted 2023-02-19

技术标签:

【中文标题】使用 openmp 优化 N-queen【英文标题】：Optimizing N-queen with openmp 【发布时间】：2013-10-05 09:30:46 【问题描述】：

我正在学习 OPENMP 并编写了以下代码来解决 nqueens 问题。

//Full Code: https://github.com/Shafaet/Codes/blob/master/OPENMP/Parallel%20N-Queen%20problem.cpp
int n;  // Board dimension (n x n board, n queens); set by the caller before call() runs.

/**
 * Count the ways to extend a partial queen placement to a full solution.
 *
 * One queen is placed per column, left to right, by backtracking. The three
 * bit masks record which rows and diagonals are already attacked:
 *   - rowmask: bit `row` is set when that row holds a queen;
 *   - dia1:    bit `row + col` is set for an occupied diagonal;
 *   - dia2:    bit `row + n - 1 - col` is set for an occupied diagonal in
 *              the other direction.
 *
 * The highest bit used is 2*n - 2, so with a 32-bit int the masks are only
 * valid for n <= 16.
 *
 * @param col     Next column to fill; col == n means every column is filled.
 * @param rowmask Occupied rows.
 * @param dia1    Occupied diagonals indexed by row + col.
 * @param dia2    Occupied diagonals indexed by row + n - 1 - col.
 * @return Number of complete placements reachable from this state.
 */
int call(int col,int rowmask,int dia1,int dia2)
{
    if(col==n)
    {
        return 1;
    }

    int ans=0;
    for(int row=0;row<n;row++)
    {
        const int rowbit=1<<row;
        const int d1bit=1<<(row+col);
        const int d2bit=1<<((row+n-1)-col);

        // Logical && (not bitwise &) so the test short-circuits on the first conflict.
        if(!(rowmask&rowbit) && !(dia1&d1bit) && !(dia2&d2bit))
        {
            ans+=call(col+1,rowmask|rowbit,dia1|d1bit,dia2|d2bit);
        }
    }
    return ans;
}



double parallel()

    double st=omp_get_wtime();
    int ans=0;
    int i;
    int rowmask=0,dia1=0,dia2=0;
     #pragma omp parallel for reduction(+:ans) shared(i,rowmask)
    for(i=0;i<n;i++)
    
        rowmask=0;
        dia1=0,dia2=0;
        int col=0,row=i;
        ans+=call(1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
    
    printf("Found %d configuration for n=%d\n",ans,n);
    double en=omp_get_wtime();
    printf("Time taken using openmp %lf\n",en-st);
    return en-st;


double serial()


    double st=omp_get_wtime();
    int ans=0;
    int i;
    int rowmask=0,dia1=0,dia2=0;
    for(i=0;i<n;i++)
    
        rowmask=0;
        dia1=0,dia2=0;
        int col=0,row=i;
        ans+=call(1,rowmask|1<<row,dia1|(1<<(row+col)), dia2|(1<<((row+n-1)-col)));
    
    printf("Found %d configuration for n=%d\n",ans,n);
    double en=omp_get_wtime();
    printf("Time taken without openmp %lf\n",en-st);
    return en-st;


int main()

    double average=0;
    int count=0;
    for(int i=2;i<=13;i++)
    
        count++;
        n=i;

        double stime=serial();
        double ptime=parallel();
        printf("OpenMP is %lf times faster for n=%d\n",stime/ptime,n);
        average+=stime/ptime;
        puts("===============");
    
    printf("On average OpenMP is %lf times faster\n",average/count);
    return 0;

并行代码已经比普通代码快，但我想知道如何使用 openmp pragma 对其进行更多优化。我想知道为了更好的性能我应该做什么，不应该做什么。

提前致谢。

（请不要提出任何与并行编程无关的优化）

【问题讨论】：

您可以查看 RosettaCode 上的这段代码。我最初用 F77 编写它，后来将其改写为使用 OpenMP。它和你的代码一样，只使用了“并行”。但老实说，如果您不更改算法，除了在多个内核上并行运行（这一点应该已经通过并行循环做到了）之外，您还期望 OpenMP 做什么？如果您只是在学习 OpenMP，那么显然需要了解 private 和 shared。i、rowmask、dia1 和 dia2 应该是 private。因为 i 是循环迭代变量，所以它无论如何都是私有的。您在存在竞态条件的情况下把 rowmask、dia1 和 dia2 设为零，然后把它们按值传给一个函数，函数内部得到的是私有副本，因此最终结果是正确的，但这主要是碰巧。 【参考方案1】：

您的代码使用的似乎是经典的 N 皇后回溯递归算法。它并不是求解 N 皇后最快的方法，但由于简单，是练习并行编程基础时最直观的算法之一。话虽如此：正因为它很简单，除了基本的“并行”和归约之外，不要指望它能自然地用到许多高级的 OpenMP 手段。

不过，既然您是想学习并行编程，并且可能希望有更清晰、更平缓的学习曲线，这里还有另一种实现（众多可能的实现之一）：它使用相同的算法，但从教学的角度来看往往更易读、更直观：

void setQueen(int queens[], int row, int col) 
//check all previously placed rows for attacks
for(int i=0; i<row; i++) 
   // vertical attacks
   if (queens[i]==col) 
       return;
   

   // diagonal attacks
   if (abs(queens[i]-col) == (row-i) ) 
      return;
   


// column is ok, set the queen
queens[row]=col;
if(row==size-1) 
#pragma omp atomic
    nrOfSolutions++;  //Placed final queen, found a solution

else 
     // try to fill next row
     for(int i=0; i<size; i++) 
         setQueen(queens, row+1, i);
     



//Function to find all solutions for nQueens problem on size x size chessboard.
void solve() 
#pragma omp parallel for
    for(int i=0; i<size; i++) 
         // try all positions in first row
         int * queens = new int[size];  //array representing queens placed on a chess board.  Index is row position, value is column.
         setQueen(queens, 0, i);
         delete[](queens);

这段代码是 Intel Advisor XE 的示例之一（提供 C++ 和 Fortran 版本）；该示例的并行化过程在 Parallel Programming Book 的第 10 章中有非常详细的讨论（实际上，该章节只是以 N-Queens 为例，演示一般情况下如何使用工具将串行代码并行化）。

该 Advisor N-Queens 示例使用的算法与您的基本相同，但它用简单的 parallel for + atomic 组合取代了显式归约。预计这段代码的效率较低，但更“程序化”、更具“教学性”，因为它展示了“隐藏”的数据竞争。如果您下载该示例代码，实际上会发现 4 个等效的 N-Queens 并行实现，分别使用 TBB、Cilk Plus 和 OpenMP（OpenMP 版本有 C++ 和 Fortran 两种）。

【讨论】：

【参考方案2】：

我知道我参加聚会有点晚了，但您可以使用任务队列来进一步优化。（结果快了大约 7-10%）。不知道为什么。这是我正在使用的代码：

#include <cstdio>    // printf
#include <cstdlib>   // abs, atoi
#include <iomanip>   // modify std::out
#include <iostream>  // std::cout, cin, cerr ...
#include <vector>    // std::vector
#include <omp.h>

// NOTE(review): file-scope using-directive. Under C++17 the standard library
// also declares std::size, which may make an unqualified use of the global
// `size` below ambiguous — confirm with the target compiler.
using namespace std;

// Number of complete placements found so far; incremented in setQueen.
int nrOfSolutions=0;
// Board dimension (rows == columns == number of queens); set from argv[1] in main.
int size=0;

void print(int queens[]) 
  cerr << "Solution " << nrOfSolutions << endl; 
  for(int row=0; row<size; row++) 
    for(int col=0; col<size; col++) 
      if(queens[row]==col) 
  cout << "Q";
      
      else 
  cout << "-";
      
    
    cout << endl;
  


void setQueen(int queens[], int row, int col, int id) 

  for(int i=0; i<row; i++) 
    // vertical attacks
    if (queens[i]==col) 
      return;
    
    // diagonal attacks
    if (abs(queens[i]-col) == (row-i) ) 
      return;
    
  

  // column is ok, set the queen
  queens[row]=col;

  if(row==size-1) 


    // only one thread should print allowed to print at a time
    
      // increasing the solution counter is not atomic
#pragma omp critical
      nrOfSolutions++;
#ifdef _DEBUG
#pragma omp critical
      print(queens);
#endif
    

  
  else 
    // try to fill next row
    for(int i=0; i<size; i++) 
      setQueen(queens, row+1, i, id);
    
  


void solve() 
  int myid=0 ;

#pragma omp parallel
#pragma omp single
  
      for(int i=0; i<size; i++) 
/*
#ifdef _OMP //(???)
  myid = omp_get_thread_num();  
#endif
#ifdef _DEBUG
  cout << "ThreadNum: " << myid << endl ;
#endif
  */
  // try all positions in first row
  // create separate array for each recursion
  // started here
#pragma omp task
    setQueen(new int[size], 0, i, myid);
      
    


int main(int argc, char*argv[]) 

  if(argc !=2) 
    cerr << "Usage: nq-openmp-taskq boardSize.\n";
    return 0;
  

  size = atoi(argv[1]);
  cout << "Starting OpenMP Task Queue solver for size " << size << "...\n";

    double st=omp_get_wtime();
    solve();

    double en=omp_get_wtime();
    printf("Time taken using openmp %lf\n",en-st);

  cout << "Number of solutions: " << nrOfSolutions << endl;

return 0;

【讨论】：

以上是关于使用 openmp 优化 N-queen的主要内容，如果未能解决你的问题，请参考以下文章