Skip to content

Commit 7f36c83

Browse files
authored
Merge pull request #2039 from su2code/hybrid_parallel_ad_perf
Hybrid Parallel AD Performance Improvements
2 parents 0e12033 + e593fb6 commit 7f36c83

10 files changed

Lines changed: 145 additions & 56 deletions

File tree

Common/include/basic_types/ad_structure.hpp

Lines changed: 35 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,11 @@ inline void RegisterInput(su2double& data, bool push_index = true) {}
7474
*/
7575
inline void RegisterOutput(su2double& data) {}
7676

77+
/*!
78+
* \brief Resize the adjoint vector, for subsequent access without bounds checking.
79+
*/
80+
inline void ResizeAdjoints() {}
81+
7782
/*!
7883
* \brief Sets the adjoint value at index to val
7984
* \param[in] index - Position in the adjoint vector.
@@ -369,11 +374,24 @@ FORCEINLINE void Reset() {
369374
}
370375
}
371376

377+
FORCEINLINE void ResizeAdjoints() { AD::getTape().resizeAdjointVector(); }
378+
372379
FORCEINLINE void SetIndex(int& index, const su2double& data) { index = data.getIdentifier(); }
373380

374-
FORCEINLINE void SetDerivative(int index, const double val) { AD::getTape().setGradient(index, val); }
381+
// WARNING: For performance reasons, this method does not perform bounds checking.
382+
// When using it, please ensure sufficient adjoint vector size by a call to AD::ResizeAdjoints().
383+
FORCEINLINE void SetDerivative(int index, const double val) {
384+
using BoundsChecking = codi::GradientAccessTapeInterface<su2double::Gradient, su2double::Identifier>::BoundsChecking;
385+
AD::getTape().setGradient(index, val, BoundsChecking::False);
386+
}
375387

376-
FORCEINLINE double GetDerivative(int index) { return AD::getTape().getGradient(index); }
388+
// WARNING: For performance reasons, this method does not perform bounds checking.
389+
// If called after tape evaluations, the adjoints should exist.
390+
// Otherwise, please ensure sufficient adjoint vector size by a call to AD::ResizeAdjoints().
391+
FORCEINLINE double GetDerivative(int index) {
392+
using BoundsChecking = codi::GradientAccessTapeInterface<su2double::Gradient, su2double::Identifier>::BoundsChecking;
393+
return AD::getTape().getGradient(index, BoundsChecking::False);
394+
}
377395

378396
FORCEINLINE bool IsIdentifierActive(su2double const& value) {
379397
return getTape().isIdentifierActive(value.getIdentifier());
@@ -523,26 +541,14 @@ FORCEINLINE void delete_handler(void* handler) {
523541

524542
FORCEINLINE bool BeginPassive() {
525543
if (AD::getTape().isActive()) {
526-
StopRecording();
544+
AD::getTape().setPassive();
527545
return true;
528546
}
529547
return false;
530548
}
531549

532550
FORCEINLINE void EndPassive(bool wasActive) {
533-
if (wasActive) StartRecording();
534-
}
535-
536-
FORCEINLINE bool PausePreaccumulation() {
537-
const auto current = PreaccEnabled;
538-
if (!current) return false;
539-
SU2_OMP_SAFE_GLOBAL_ACCESS(PreaccEnabled = false;)
540-
return true;
541-
}
542-
543-
FORCEINLINE void ResumePreaccumulation(bool wasActive) {
544-
if (!wasActive) return;
545-
SU2_OMP_SAFE_GLOBAL_ACCESS(PreaccEnabled = true;)
551+
if (wasActive) AD::getTape().setActive();
546552
}
547553

548554
FORCEINLINE void StartNoSharedReading() {
@@ -558,6 +564,19 @@ FORCEINLINE void EndNoSharedReading() {
558564
opdi::logic->addReverseBarrier();
559565
#endif
560566
}
567+
568+
FORCEINLINE bool PausePreaccumulation() {
569+
const auto current = PreaccEnabled;
570+
if (!current) return false;
571+
SU2_OMP_SAFE_GLOBAL_ACCESS(PreaccEnabled = false;)
572+
return true;
573+
}
574+
575+
FORCEINLINE void ResumePreaccumulation(bool wasActive) {
576+
if (!wasActive) return;
577+
SU2_OMP_SAFE_GLOBAL_ACCESS(PreaccEnabled = true;)
578+
}
579+
561580
#endif // CODI_REVERSE_TYPE
562581

563582
void Initialize();

Common/include/code_config.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ FORCEINLINE Out su2staticcast_p(In ptr) {
9696
#include "codi/tools/data/externalFunctionUserData.hpp"
9797

9898
#if defined(HAVE_OMP)
99-
using su2double = codi::RealReverseIndexOpenMP;
99+
using su2double = codi::RealReverseIndexOpenMPGen<double, double>;
100100
#else
101101
#if defined(CODI_INDEX_TAPE)
102102
using su2double = codi::RealReverseIndex;

Common/include/parallelization/omp_structure.hpp

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -188,12 +188,14 @@ void omp_finalize();
188188
* thread, with all threads and memory views synchronized both beforehand and afterwards.
189189
*/
190190

191-
#define BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS \
192-
SU2_OMP_BARRIER \
191+
#define BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS \
192+
SU2_OMP_BARRIER \
193+
if (omp_in_parallel()) AD::StartNoSharedReading(); \
193194
SU2_OMP_MASTER
194195

195-
#define END_SU2_OMP_SAFE_GLOBAL_ACCESS \
196-
END_SU2_OMP_MASTER \
196+
#define END_SU2_OMP_SAFE_GLOBAL_ACCESS \
197+
END_SU2_OMP_MASTER \
198+
if (omp_in_parallel()) AD::EndNoSharedReading(); \
197199
SU2_OMP_BARRIER
198200

199201
#define SU2_OMP_SAFE_GLOBAL_ACCESS(...) BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS{__VA_ARGS__} END_SU2_OMP_SAFE_GLOBAL_ACCESS

Common/src/linear_algebra/CSysSolve.cpp

Lines changed: 94 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -192,7 +192,7 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType>&
192192
const CPreconditioner<ScalarType>& precond, ScalarType tol,
193193
unsigned long m, ScalarType& residual, bool monitoring,
194194
const CConfig* config) const {
195-
const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
195+
const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
196196
ScalarType norm_r = 0.0, norm0 = 0.0;
197197
unsigned long i = 0;
198198

@@ -241,16 +241,22 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType>&
241241
if (tol_type == LinearToleranceType::RELATIVE) norm0 = norm_r;
242242

243243
if ((norm_r < tol * norm0) || (norm_r < eps)) {
244-
if (master && (lin_sol_mode != LINEAR_SOLVER_MODE::MESH_DEFORM))
244+
if (masterRank && (lin_sol_mode != LINEAR_SOLVER_MODE::MESH_DEFORM)) {
245+
SU2_OMP_MASTER
245246
cout << "CSysSolve::ConjugateGradient(): system solved by initial guess." << endl;
247+
END_SU2_OMP_MASTER
248+
}
246249
return 0;
247250
}
248251

249252
/*--- Output header information including initial residual ---*/
250253

251-
if (monitoring && master) {
252-
WriteHeader("CG", tol, norm_r);
253-
WriteHistory(i, norm_r / norm0);
254+
if (monitoring && masterRank) {
255+
SU2_OMP_MASTER {
256+
WriteHeader("CG", tol, norm_r);
257+
WriteHistory(i, norm_r / norm0);
258+
}
259+
END_SU2_OMP_MASTER
254260
}
255261
}
256262

@@ -281,7 +287,11 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType>&
281287

282288
norm_r = r.norm();
283289
if (norm_r < tol * norm0) break;
284-
if (((monitoring) && (master)) && ((i + 1) % monitorFreq == 0)) WriteHistory(i + 1, norm_r / norm0);
290+
if (((monitoring) && (masterRank)) && ((i + 1) % monitorFreq == 0)) {
291+
SU2_OMP_MASTER
292+
WriteHistory(i + 1, norm_r / norm0);
293+
END_SU2_OMP_MASTER
294+
}
285295
}
286296

287297
precond(r, z);
@@ -300,16 +310,22 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType>&
300310
/*--- Recalculate final residual (this should be optional) ---*/
301311

302312
if ((monitoring) && (config->GetComm_Level() == COMM_FULL)) {
303-
if (master) WriteFinalResidual("CG", i, norm_r / norm0);
313+
if (masterRank) {
314+
SU2_OMP_MASTER
315+
WriteFinalResidual("CG", i, norm_r / norm0);
316+
END_SU2_OMP_MASTER
317+
}
304318

305319
if (recomputeRes) {
306320
mat_vec(x, A_x);
307321
r = b - A_x;
308322
ScalarType true_res = r.norm();
309323

310324
if (fabs(true_res - norm_r) > tol * 10.0) {
311-
if (master) {
325+
if (masterRank) {
326+
SU2_OMP_MASTER
312327
WriteWarning(norm_r, true_res, tol);
328+
END_SU2_OMP_MASTER
313329
}
314330
}
315331
}
@@ -325,7 +341,7 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
325341
const CPreconditioner<ScalarType>& precond, ScalarType tol,
326342
unsigned long m, ScalarType& residual, bool monitoring,
327343
const CConfig* config) const {
328-
const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
344+
const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
329345
const bool flexible = !precond.IsIdentity();
330346

331347
/*--- Check the subspace size ---*/
@@ -386,7 +402,11 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
386402
if ((beta < tol * norm0) || (beta < eps)) {
387403
/*--- System is already solved ---*/
388404

389-
if (master) cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
405+
if (masterRank) {
406+
SU2_OMP_MASTER
407+
cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
408+
END_SU2_OMP_MASTER
409+
}
390410
residual = beta;
391411
return 0;
392412
}
@@ -403,9 +423,12 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
403423
/*--- Output header information including initial residual ---*/
404424

405425
unsigned long i = 0;
406-
if ((monitoring) && (master)) {
407-
WriteHeader("FGMRES", tol, beta);
408-
WriteHistory(i, beta / norm0);
426+
if ((monitoring) && (masterRank)) {
427+
SU2_OMP_MASTER {
428+
WriteHeader("FGMRES", tol, beta);
429+
WriteHistory(i, beta / norm0);
430+
}
431+
END_SU2_OMP_MASTER
409432
}
410433

411434
/*--- Loop over all search directions ---*/
@@ -444,7 +467,11 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
444467

445468
/*--- Output the relative residual if necessary ---*/
446469

447-
if ((((monitoring) && (master)) && ((i + 1) % monitorFreq == 0)) && (master)) WriteHistory(i + 1, beta / norm0);
470+
if ((((monitoring) && (masterRank)) && ((i + 1) % monitorFreq == 0))) {
471+
SU2_OMP_MASTER
472+
WriteHistory(i + 1, beta / norm0);
473+
END_SU2_OMP_MASTER
474+
}
448475
}
449476

450477
/*--- Solve the least-squares system and update solution ---*/
@@ -460,16 +487,22 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
460487
/*--- Recalculate final (neg.) residual (this should be optional) ---*/
461488

462489
if ((monitoring) && (config->GetComm_Level() == COMM_FULL)) {
463-
if (master) WriteFinalResidual("FGMRES", i, beta / norm0);
490+
if (masterRank) {
491+
SU2_OMP_MASTER
492+
WriteFinalResidual("FGMRES", i, beta / norm0);
493+
END_SU2_OMP_MASTER
494+
}
464495

465496
if (recomputeRes) {
466497
mat_vec(x, W[0]);
467498
W[0] -= b;
468499
ScalarType res = W[0].norm();
469500

470501
if (fabs(res - beta) > tol * 10) {
471-
if (master) {
502+
if (masterRank) {
503+
SU2_OMP_MASTER
472504
WriteWarning(beta, res, tol);
505+
END_SU2_OMP_MASTER
473506
}
474507
}
475508
}
@@ -511,7 +544,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
511544
const CPreconditioner<ScalarType>& precond, ScalarType tol,
512545
unsigned long m, ScalarType& residual, bool monitoring,
513546
const CConfig* config) const {
514-
const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
547+
const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
515548
ScalarType norm_r = 0.0, norm0 = 0.0;
516549
unsigned long i = 0;
517550

@@ -561,15 +594,22 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
561594
if (tol_type == LinearToleranceType::RELATIVE) norm0 = norm_r;
562595

563596
if ((norm_r < tol * norm0) || (norm_r < eps)) {
564-
if (master) cout << "CSysSolve::BCGSTAB(): system solved by initial guess." << endl;
597+
if (masterRank) {
598+
SU2_OMP_MASTER
599+
cout << "CSysSolve::BCGSTAB(): system solved by initial guess." << endl;
600+
END_SU2_OMP_MASTER
601+
}
565602
return 0;
566603
}
567604

568605
/*--- Output header information including initial residual ---*/
569606

570-
if ((monitoring) && (master)) {
571-
WriteHeader("BCGSTAB", tol, norm_r);
572-
WriteHistory(i, norm_r / norm0);
607+
if ((monitoring) && (masterRank)) {
608+
SU2_OMP_MASTER {
609+
WriteHeader("BCGSTAB", tol, norm_r);
610+
WriteHistory(i, norm_r / norm0);
611+
}
612+
END_SU2_OMP_MASTER
573613
}
574614
}
575615

@@ -637,22 +677,32 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
637677

638678
norm_r = r.norm();
639679
if (norm_r < tol * norm0) break;
640-
if (((monitoring) && (master)) && ((i + 1) % monitorFreq == 0) && (master)) WriteHistory(i + 1, norm_r / norm0);
680+
if (((monitoring) && (masterRank)) && ((i + 1) % monitorFreq == 0)) {
681+
SU2_OMP_MASTER
682+
WriteHistory(i + 1, norm_r / norm0);
683+
END_SU2_OMP_MASTER
684+
}
641685
}
642686
}
643687

644688
/*--- Recalculate final residual (this should be optional) ---*/
645689

646690
if ((monitoring) && (config->GetComm_Level() == COMM_FULL)) {
647-
if (master) WriteFinalResidual("BCGSTAB", i, norm_r / norm0);
691+
if (masterRank) {
692+
SU2_OMP_MASTER
693+
WriteFinalResidual("BCGSTAB", i, norm_r / norm0);
694+
END_SU2_OMP_MASTER
695+
}
648696

649697
if (recomputeRes) {
650698
mat_vec(x, A_x);
651699
r = b - A_x;
652700
ScalarType true_res = r.norm();
653701

654-
if ((fabs(true_res - norm_r) > tol * 10.0) && (master)) {
702+
if ((fabs(true_res - norm_r) > tol * 10.0) && (masterRank)) {
703+
SU2_OMP_MASTER
655704
WriteWarning(norm_r, true_res, tol);
705+
END_SU2_OMP_MASTER
656706
}
657707
}
658708
}
@@ -667,7 +717,7 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
667717
const CPreconditioner<ScalarType>& precond, ScalarType tol,
668718
unsigned long m, ScalarType& residual, bool monitoring,
669719
const CConfig* config) const {
670-
const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
720+
const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
671721
const bool fix_iter_mode = tol < eps;
672722
ScalarType norm_r = 0.0, norm0 = 0.0;
673723
unsigned long i = 0;
@@ -717,15 +767,22 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
717767
if (tol_type == LinearToleranceType::RELATIVE) norm0 = norm_r;
718768

719769
if ((norm_r < tol * norm0) || (norm_r < eps)) {
720-
if (master) cout << "CSysSolve::Smoother_LinSolver(): system solved by initial guess." << endl;
770+
if (masterRank) {
771+
SU2_OMP_MASTER
772+
cout << "CSysSolve::Smoother_LinSolver(): system solved by initial guess." << endl;
773+
END_SU2_OMP_MASTER
774+
}
721775
return 0;
722776
}
723777

724778
/*--- Output header information including initial residual. ---*/
725779

726-
if ((monitoring) && (master)) {
727-
WriteHeader("Smoother", tol, norm_r);
728-
WriteHistory(i, norm_r / norm0);
780+
if ((monitoring) && (masterRank)) {
781+
SU2_OMP_MASTER {
782+
WriteHeader("Smoother", tol, norm_r);
783+
WriteHistory(i, norm_r / norm0);
784+
}
785+
END_SU2_OMP_MASTER
729786
}
730787
}
731788

@@ -759,14 +816,20 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
759816
if (!fix_iter_mode && config->GetComm_Level() == COMM_FULL) {
760817
norm_r = r.norm();
761818
if (norm_r < tol * norm0) break;
762-
if (((monitoring) && (master)) && ((i + 1) % monitorFreq == 0)) WriteHistory(i + 1, norm_r / norm0);
819+
if (((monitoring) && (masterRank)) && ((i + 1) % monitorFreq == 0)) {
820+
SU2_OMP_MASTER
821+
WriteHistory(i + 1, norm_r / norm0);
822+
END_SU2_OMP_MASTER
823+
}
763824
}
764825
}
765826

766827
if (fix_iter_mode) norm_r = r.norm();
767828

768-
if ((monitoring) && (master) && (config->GetComm_Level() == COMM_FULL)) {
829+
if ((monitoring) && (masterRank) && (config->GetComm_Level() == COMM_FULL)) {
830+
SU2_OMP_MASTER
769831
WriteFinalResidual("Smoother", i, norm_r / norm0);
832+
END_SU2_OMP_MASTER
770833
}
771834

772835
residual = norm_r / norm0;

0 commit comments

Comments
 (0)