OpenMP C ++ Data race с жевром

Question

OpenMP C ++ Data race с жевром

Я пытаюсь распараллелить цикл for в C ++ с openMP. У меня есть много матриц (класса Matrix), которые должны быть возведены в степень с помощью zheevr. Реализация дает Data Race.

Распараллеленный цикл for

/* in main */
#pragma omp parallel for shared(Aexp_omp, A, Z, W) private(j) firstprivate(idt)
for (j=0; j< nMat; ++j) {
Aexp_omp[j] = EigenMethod(A[j],-idt,&(Z[j]),W[j]);
}

Вот, Aexp_omp, A, Z все массивы класса Matrix, Точная линия кажется вторым звонком в жеевр в EigenMethod, Код выполняется должным образом, когда критическое замечание #pragma omp не закомментировано (но установка здесь критического значения pragma omp, по-видимому, лишает цели пареллизирования этого кода).

Я не понимаю, откуда происходит гонка данных, поскольку, если я правильно помню, все объявленные в цикле переменные являются частными для потока?

Любое понимание того, почему это происходит? Я ошибочно объявил pragma omp parallel for часть кода?

Большое спасибо за вашу помощь.

Изменить: полный код

Чтобы предоставить всю информацию, вот полный код (это немного долго, мои извинения). Он компилируется на моих машинах с make-файлом

omp_matrix: omp_matrix.cpp
g++ -fopenmp -lgsl -lgslcblas -llapack -lblas -lm -o omp_matrix omp_matrix.cpp

Это дает случайное поведение при беге из-за, я подозреваю, гонки данных. Код должен содержать файлы, один с основным (и некоторым дополнительным пространством имен) и классом Matrix.

/* omp_matrix.cpp */
#include <iostream>
#include <omp.h>
#include <cstdio>
#include <ctime>
#include <string>
#include <cmath>
#include <vector>
#include <complex>
#include <limits>
#include <cstdlib>

const char Trans[]  = {'N','T','C'};
const char UpLo[]   = {'U','L'};
const char Jobz[]   = {'V','N'};
const char Range[]  = {'A','V','I'};
#ifdef __cplusplus
extern "C" {
#endif

void zgemm_(const char* TransA, const char* TransB, const size_t* M, const size_t* N, const size_t* K, const std::complex<double>* alpha, const std::complex<double>* A, const size_t* lda, const std::complex<double>* B, const size_t* ldb, const std::complex<double>* beta, std::complex<double>* C, size_t* ldc);
int zheevr_(const char* jobz, const char* range, const char* uplo, const size_t* n, std::complex<double>* a, size_t* lda, double* vl, double* vu, size_t* il, size_t* iu, double* abstol, size_t* m, double* w, std::complex<double>* z, size_t* ldz, int* isuppz, std::complex<double>* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info);

#ifdef __cplusplus
}
#endif

#include "Matrix.hpp"
namespace MOs {
//Identity Matrix
template <class T>
inline void Identity(Matrix<T>& I){
size_t dim = I.GetRows();
if (dim != I.GetColumns())
exit(1);
for(size_t i=0; i<dim; i++)
for(size_t j=0; j<dim; j++)
I(i,j) = i == j ? T(1) : T(0);
return;
}

// The H.O. Annilation operator
template <class T>
inline void Destroy(Matrix<T>& a){
size_t dim = a.GetRows();
if (dim != a.GetColumns())
exit(1);
for(size_t i=0; i<dim; i++)
for(size_t j=0; j<dim; j++)
a(i,j) = j == i+1 ? std::sqrt(T(j)) : T(0);
return;
}

//Take the Hermitian conjugate of a complex Matrix
inline Matrix<std::complex<double> > Dagger(const Matrix<std::complex<double> >& A){
size_t cols = A.GetColumns(), rows= A.GetRows();
Matrix<std::complex<double> > temp(cols,rows);
for (size_t i=0; i < rows; i++)
for(size_t j=0; j < cols; j++)
temp(j,i) = conj(A(i,j));
return temp;
}

template <class T>
inline Matrix<T> TensorProduct(const Matrix<T>& A, const Matrix<T>& B){
size_t rows1 = A.GetRows(), rows2 = B.GetRows(), cols1 = A.GetColumns(), cols2 = B.GetColumns();
size_t rows_new = rows1*rows2, cols_new = cols1*cols2, n, m;
Matrix<T> temp(rows_new,cols_new);
for(size_t i=0; i<rows1; i++)
for(size_t j=0; j<cols1; j++)
for(size_t p=0; p<rows2; p++)
for(size_t q=0; q<cols2; q++){
n = i*rows2+p;
m = j*cols2+q; //  0 (0 + 1)  + 1*dimb=2 + (0 + 1 )  (j*dimb+q)
temp(n,m) = A(i,j)*B(p,q);
}
return temp;
}
}

Matrix<std::complex<double> > EigenMethod(const Matrix<std::complex<double> >& Ain,  std::complex<double> x, Matrix<std::complex<double> >* Z=NULL, double *W=NULL );

using namespace std;

int main(int argc, char const *argv[]) {
// dim is the dimension of the system
size_t  dimQ    = 2;
size_t  dim = dimQ*dimQ*dimQ;

// Define the matrices used to build the matrix to be exponentiated
Matrix<complex<double> > o(dimQ,dimQ), od(dimQ,dimQ), nq(dimQ,dimQ);
MOs::Destroy(o);
od = MOs::Dagger(o); nq=od*o;
Matrix<complex<double> > IdentQ(dimQ, dimQ), Hd(dim, dim);
MOs::Identity(IdentQ);
Hd = 0.170*MOs::TensorProduct(MOs::TensorProduct(od,IdentQ),o) + 0.124* MOs::TensorProduct(MOs::TensorProduct(IdentQ,od),o);

complex<double> idt = complex<double>(0.0,0.2);                 // time unit multiplied by i

size_t  nMat    = 2;
double* c   = new double[nMat];
c[0] = 0.134;
c[1] = -0.326;

// Decalre matrices and allocate memory for them
Matrix<complex<double> >*   A    = new Matrix<complex<double> >[nMat];
Matrix<complex<double> >*   Aexp     = new Matrix<complex<double> >[nMat];
Matrix<complex<double> >*   Aexp_omp = new Matrix<complex<double> >[nMat];
for (size_t k = 0; k < nMat; ++k)
A[k] = c[k]*Hd;

// METHOD 1: Serial. Exponentiate one matrix after another
double start_s = omp_get_wtime();
for (size_t k = 0; k < nMat; ++k)
Aexp[k] = EigenMethod(A[k],-idt);
double end_s = omp_get_wtime();

cout << "Aexp[0] = \n" << Aexp[0] << endl;
cout << "Aexp[1] = \n" << Aexp[1] << endl;

// METHOD 2: Parallel. Exponentiate all matrices at the same time
// THIS DOES NOT WORK. Data race condition in EigenMetod?
Matrix<complex<double> >* Z = new Matrix<complex<double> >[nMat];
double** W = new double*[nMat];
for (size_t k = 0; k < nMat; ++k) {
Z[k].initialize(dim,dim);
W[k] = new double[dim];
}

size_t j;
double start_p1 = omp_get_wtime( );
#pragma omp parallel for shared(Aexp_omp,A,Z,W) private(j) firstprivate(idt)
for (j=0; j< nMat; ++j) {
Aexp_omp[j] = EigenMethod(A[j],-idt,&(Z[j]),W[j]);
}
double end_p1 = omp_get_wtime( );

cout << "Done" << endl;

cout << "Aexp_omp[0] = \n" << Aexp_omp[0] << endl;
cout << "Aexp_omp[1] = \n" << Aexp_omp[1] << endl;

cout << "Serial time: " << end_s - start_s << ", parallel time method 2: " << end_p1-start_p1 << endl;

delete [] c;
delete [] A;
delete [] Aexp;
delete [] Aexp_omp;

} /* end of int main */

Matrix<complex<double> > EigenMethod(const Matrix<complex<double> >& A, complex<double> x, Matrix<complex<double> >* Z, double *W){

bool returnZ(Z!=NULL), returnW(W!=NULL);                            // flags to see if the function received valid Z and W adresses

//if (MOs::IsHermitian(A)==0) UFs::MyError("Routine ExpM::EigenMethod: The input Matrix is not Hermitian.");
size_t N = A.GetRows(), LDA=A.GetLD(), IL, IU, M, LDZ=N;
double VL, VU, ABSTOL=numeric_limits<double>::min();
complex<double>*    WORK;
double*         RWORK;
int*            IWORK;

//finding the optimal work sizes
int LWORK=-1, LRWORK=-1, LIWORK=-1, ok=0;
WORK    = new std::complex<double>[1];
RWORK   = new double[1];
IWORK   = new int[1];

// ZHEEV computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian Matrix A.
// Jobz[0] = V i.e. compute eigenvalues and eigen-vectors, Range[0] = A i.e. all eigen values will be found, UpLo[1] = L i.e. lower triangle of matrix is stored
zheevr_ (&Jobz[0],&Range[0],&UpLo[1],&N,NULL,&LDA,&VL,&VU,&IL,&IU,&ABSTOL,&M,NULL,NULL,&LDZ,NULL,WORK,&LWORK,RWORK,&LRWORK,IWORK,&LIWORK,&ok);

//Setting the optimal workspace
LWORK   = (int)real(WORK[0]);
LRWORK  = (int)RWORK[0];
LIWORK  = IWORK[0];

delete [] WORK;
delete [] RWORK;
delete [] IWORK;

//Running the algorithim
if (!returnZ) Z = new Matrix<std::complex<double> >(LDZ,LDZ);
if (!returnW) W = new double[LDA];

Matrix<std::complex<double> > temp(A);

int *ISUPPZ;
ISUPPZ  = new int[2*N];
WORK    = new std::complex<double>[LWORK];
RWORK   = new double[LRWORK];
IWORK   = new int[LIWORK];
//#pragma omp critical
//{
// THIS LINE IS DOING SOMETHING WRONG
zheevr_ (&Jobz[0],&Range[0],&UpLo[1],&N,temp.GetMat(),&LDA,&VL,&VU,&IL,&IU,&ABSTOL,&M,W,Z->GetMat(),&LDZ,ISUPPZ,WORK,&LWORK,RWORK,&LRWORK,IWORK,&LIWORK,&ok);
//}

if (ok!=0) {
cerr << "Routine EigenMethod: An error occured in geting the diagonalization of A" << endl;
exit(1);
}

//Getting the Diag*Dagger(Z), this is where the exponentiation is done using the eigenvalues
for (size_t i=0; i < N; i++)
for(size_t j=0; j < N; j++)
temp(i,j) = exp(x*W[i])*conj((*Z)(j,i));

temp = (*Z)*temp;

if(!returnW) delete [] W;
if(!returnZ) delete Z;

delete [] ISUPPZ;
delete [] WORK;
delete [] RWORK;
delete [] IWORK;

return temp;
}

Далее идет класс Matrix

#ifndef Matrix_h
#define Matrix_h

enum OutputStyle {Column, List, Array};

template <class T>
class Matrix {
//friend functions get to use the private varibles of the class as well as have different classes as inputs
template <class S>
friend std::ostream& operator << (std::ostream& output, const Matrix<S>& A){
for (size_t i=0; i<A.rows_; i++){
for (size_t j=0; j<A.cols_; j++){
output << A.mat_[j*A.rows_ +i] << "\t";
}
output << std::endl;
}
return output;
}

// overloads beta*A
template <class S1, class S2>
friend Matrix<S1> operator*(const S2& beta, const Matrix<S1>& A){
size_t rows= A.rows_, cols = A.cols_;
Matrix<S1> temp(rows,cols);
for (size_t i=0; i < rows; i++)
for(size_t j=0; j < cols; j++)
temp(i,j) = beta*A(i,j);
return temp;
}friend Matrix<std::complex<double> > operator*(const Matrix<std::complex<double> >& A, const Matrix<std::complex<double> >& B) {
static Matrix<std::complex<double> > C;
C.initialize(A.rows_,B.cols_);
std::complex<double> alpha =1.0, beta =0.0;
zgemm_(&Trans[0], &Trans[0], &A.rows_, &B.cols_, &A.cols_, &alpha, A.mat_, &A.LD_, B.mat_, &B.LD_, &beta, C.mat_, &C.LD_);
return C;
}public:
//Construtors
Matrix() : rows_(0), cols_(0), size_(0), LD_(0), outputstyle_(Array){ mat_=0;}
Matrix(size_t rows, size_t cols) : rows_(rows), cols_(cols), size_(rows*cols), LD_(rows), outputstyle_(Array), mat_(new T [size_]){}

Matrix(const Matrix<T>& rhs) : rows_(rhs.rows_), cols_(rhs.cols_), size_(rhs.size_), LD_(rows_), outputstyle_(rhs.outputstyle_), mat_(new T [size_]) {
for(size_t p = 0; p < size_; p++)
mat_[p] = rhs.mat_[p];
}

//Initialize an empty Matrix() to Matrix(size_t  rows, size_t cols)
inline void initialize(size_t  rows, size_t cols) {
if (rows_ != rows || cols_ != cols ) {
if (mat_ != 0) delete [] (mat_);
rows_   = rows;
cols_   = cols;
size_   = rows_*cols_;
LD_ = rows;
mat_    = new T[size_];
}
}

//Destructor
virtual ~Matrix()   {if (mat_ != 0) delete[] (mat_);}

//Assignment operator
inline Matrix<T>& operator=(const Matrix<T>& rhs){
if (rows_ != rhs.rows_ || cols_ != rhs.cols_ ) {
if (mat_ != 0) delete [] (mat_);
rows_=rhs.rows_;
cols_=rhs.cols_;
size_=rows_*cols_;
LD_=rhs.LD_;
mat_= new T[size_];
}
for (size_t p=0; p < size_; p++)
mat_[p] = T(rhs.mat_[p]);
return *this;
}

inline T& operator()(size_t i, size_t j){
if (i >= rows_ || j >= cols_) exit(1);
return mat_[j*rows_+i];
}

inline T operator()(size_t i, size_t j) const{
if (i >= rows_ || j >= cols_) exit(1);
return mat_[j*rows_+i];
}//overloading functions.
inline Matrix<T> operator+(const Matrix<T>& A){
if(rows_ != A.rows_ || cols_ != A.cols_)
exit(1);
Matrix<T> temp(rows_,cols_);
for (unsigned int p=0; p < size_; p++)
temp.mat_[p] = mat_[p] + A.mat_[p];
return temp;
}

//Member Functions
inline size_t GetColumns() const            { return cols_; }
inline size_t GetRows() const               { return rows_; }
inline size_t GetLD() const             { return LD_;   }
inline size_t size() const              { return size_; }
T* GetMat() const                   { return mat_;  }

protected:
size_t rows_, cols_, size_, LD_;
//rows_ and cols_ are the rows and columns of the Matrix
//size_ = rows*colums dimensions of the vector representation
//LD is the leading dimeonsion and for Column major order is in general eqaul to rows
enum OutputStyle outputstyle_;

T * mat_;

};
#endif /* Matrix_h */

2

c++lapack openmp

Решение

Другие решения

Других решений пока нет …

Источник

Accepted Answer

вы на самом деле не даете достаточно кода (или фактических симптомов), чтобы отладить это для вас. Без этого я могу только догадываться, что реализация zheerv_() использует глобальные переменные (прямо или косвенно).

Помимо запахов кода, о которых я уже упоминал в комментариях, существует проблема с переменной idt: функция EigenMethod() принимает это по ссылке, но вы передаете временный (-idt) — Я удивлен, что скомпилирован. Я думаю, что вы не измените его значение (x) внутри EigenMethod(), так что вы должны либо взять постоянные ссылки, либо (предпочтительно) просто значение.

Кстати, нет необходимости объявлять общие переменные в параллельном цикле, просто сказать

 #pragma omp parallel for
for(int j=0; j<nMat ;++j) { /* ... */ }

и все переменные, объявленные вне цикла, являются общими по умолчанию, и переменная цикла никогда не должна быть объявлена вообще (в любом случае приватный выбор не является правильным выбором).

0