@@ -109,8 +109,19 @@ void CSysSolve<ScalarType>::SolveReduced(int n, const su2matrix<ScalarType>& Hsb
109109}
110110
111111template <class ScalarType >
112- void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,
112+ void CSysSolve<ScalarType>::ModGramSchmidt(bool shared_hsbg, int i, su2matrix<ScalarType>& Hsbg,
113113 vector<CSysVector<ScalarType> >& w) const {
114+ const auto thread = omp_get_thread_num ();
115+
116+ /* --- If Hsbg is shared by multiple threads calling this function, only one
117+ * thread can write into it. If Hsbg is private, all threads need to write. ---*/
118+
119+ auto SetHsbg = [&](int row, int col, const ScalarType& value) {
120+ if (!shared_hsbg || thread == 0 ) {
121+ Hsbg (row, col) = value;
122+ }
123+ };
124+
114125 /* --- Parameter for reorthonormalization ---*/
115126
116127 const ScalarType reorth = 0.98 ;
@@ -132,28 +143,29 @@ void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,
132143
133144 for (int k = 0 ; k < i + 1 ; k++) {
134145 ScalarType prod = w[i + 1 ].dot (w[k]);
135- Hsbg (k, i) = prod;
146+ ScalarType h_ki = prod;
136147 w[i + 1 ] -= prod * w[k];
137148
138149 /* --- Check if reorthogonalization is necessary ---*/
139150
140151 if (prod * prod > thr) {
141152 prod = w[i + 1 ].dot (w[k]);
142- Hsbg (k, i) += prod;
153+ h_ki += prod;
143154 w[i + 1 ] -= prod * w[k];
144155 }
156+ SetHsbg (k, i, h_ki);
145157
146158 /* --- Update the norm and check its size ---*/
147159
148- nrm -= pow (Hsbg (k, i) , 2 );
160+ nrm -= pow (h_ki , 2 );
149161 nrm = max<ScalarType>(nrm, 0.0 );
150162 thr = nrm * reorth;
151163 }
152164
153165 /* --- Test the resulting vector ---*/
154166
155167 nrm = w[i + 1 ].norm ();
156- Hsbg (i + 1 , i) = nrm;
168+ SetHsbg (i + 1 , i, nrm) ;
157169
158170 /* --- Scale the resulting vector ---*/
159171
@@ -343,6 +355,9 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
343355 const CConfig* config) const {
344356 const bool masterRank = (SU2_MPI::GetRank () == MASTER_NODE);
345357 const bool flexible = !precond.IsIdentity ();
358+ /* --- If we call the solver outside of a parallel region, but the number of threads allows,
359+ * we still want to parallelize some of the expensive operations. ---*/
360+ const bool nestedParallel = !omp_in_parallel () && omp_get_max_threads () > 1 ;
346361
347362 /* --- Check the subspace size ---*/
348363
@@ -452,8 +467,8 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
452467
453468 /* --- Modified Gram-Schmidt orthogonalization ---*/
454469
455- SU2_OMP_PARALLEL_ (if (! omp_in_parallel () ))
456- ModGramSchmidt (i, H, W);
470+ SU2_OMP_PARALLEL_ (if (nestedParallel ))
471+ ModGramSchmidt (nestedParallel, i, H, W);
457472 END_SU2_OMP_PARALLEL
458473
459474 /* --- Apply old Givens rotations to new column of the Hessenberg matrix then generate the
@@ -482,7 +497,7 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
482497
483498 const auto & basis = flexible ? Z : W;
484499
485- SU2_OMP_PARALLEL_ (if (! omp_in_parallel () ))
500+ SU2_OMP_PARALLEL_ (if (nestedParallel ))
486501 for (unsigned long k = 0 ; k < i; k++) {
487502 x += y[k] * basis[k];
488503 }
0 commit comments