Skip to content

Commit 2c9fbb6

Browse files
authored
Merge pull request #2167 from su2code/adaptive_edge_color_group_size
Adaptive Edge Color Group Size
2 parents 8b89e85 + 339d745 commit 2c9fbb6

9 files changed

Lines changed: 120 additions & 6 deletions

File tree

Common/include/CConfig.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,6 +1173,7 @@ class CConfig {
11731173
string caseName; /*!< \brief Name of the current case */
11741174

11751175
unsigned long edgeColorGroupSize; /*!< \brief Size of the edge groups colored for OpenMP parallelization of edge loops. */
1176+
bool edgeColoringRelaxDiscAdj; /*!< \brief Allow fallback to smaller edge color group sizes and use more colors for the discrete adjoint. */
11761177

11771178
INLET_SPANWISE_INTERP Kind_InletInterpolationFunction; /*!< \brief Type of spanwise interpolation function to use for the inlet face. */
11781179
INLET_INTERP_TYPE Kind_Inlet_InterpolationType; /*!< \brief Type of spanwise interpolation data to use for the inlet face. */
@@ -9660,6 +9661,11 @@ class CConfig {
96609661
*/
96619662
unsigned long GetEdgeColoringGroupSize(void) const { return edgeColorGroupSize; }
96629663

9664+
/*!
9665+
* \brief Check if the discrete adjoint is allowed to relax the coloring, that is, allow smaller edge color group sizes and allow more colors.
* \return The stored value of the EDGE_COLORING_RELAX_DISC_ADJ option.
9666+
*/
9667+
bool GetEdgeColoringRelaxDiscAdj() const { return edgeColoringRelaxDiscAdj; }
9668+
96639669
/*!
96649670
* \brief Get the ParMETIS load balancing tolerance.
96659671
*/

Common/include/geometry/CGeometry.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1720,10 +1720,14 @@ class CGeometry {
17201720
/*!
17211721
* \brief Get the edge coloring.
17221722
* \note This method computes the coloring if that has not been done yet.
1723+
* \note Can be instructed to determine and use the maximum edge color group size between 1 and
1724+
* CGeometry::edgeColorGroupSize that yields a coloring that is at least as efficient as #COLORING_EFF_THRESH.
17231725
* \param[out] efficiency - optional output of the coloring efficiency.
1726+
* \param[in] maximizeEdgeColorGroupSize - use the maximum edge color group size that gives an efficient coloring.
17241727
* \return Reference to the coloring.
17251728
*/
1726-
const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr);
1729+
const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr,
1730+
bool maximizeEdgeColorGroupSize = false);
17271731

17281732
/*!
17291733
* \brief Force the natural (sequential) edge coloring.

Common/include/toolboxes/graph_toolbox.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ T createNaturalColoring(Index_t numInnerIndexes) {
484484
* \param[out] indexColor - Optional, vector with colors given to the outer indices.
485485
* \return Coloring in the same type of the input pattern.
486486
*/
487-
template <typename Color_t = char, size_t MaxColors = 64, size_t MaxMB = 128, class T>
487+
template <typename Color_t = unsigned char, size_t MaxColors = 255, size_t MaxMB = 128, class T>
488488
T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors = false,
489489
std::vector<Color_t>* indexColor = nullptr) {
490490
static_assert(std::is_integral<Color_t>::value, "");

Common/src/CConfig.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2924,6 +2924,9 @@ void CConfig::SetConfig_Options() {
29242924
/* DESCRIPTION: Size of the edge groups colored for thread parallel edge loops (0 forces the reducer strategy). */
29252925
addUnsignedLongOption("EDGE_COLORING_GROUP_SIZE", edgeColorGroupSize, 512);
29262926

2927+
/* DESCRIPTION: Allow fallback to smaller edge color group sizes for the discrete adjoint and allow more colors. */
2928+
addBoolOption("EDGE_COLORING_RELAX_DISC_ADJ", edgeColoringRelaxDiscAdj, true);
2929+
29272930
/*--- options that are used for libROM ---*/
29282931
/*!\par CONFIG_CATEGORY:libROM options \ingroup Config*/
29292932

Common/src/geometry/CGeometry.cpp

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3609,7 +3609,7 @@ const su2vector<unsigned long>& CGeometry::GetTransposeSparsePatternMap(Connecti
36093609
return pattern.transposePtr();
36103610
}
36113611

3612-
const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficiency) {
3612+
const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficiency, bool maximizeEdgeColorGroupSize) {
36133613
/*--- Check for dry run mode with dummy geometry. ---*/
36143614
if (nEdge == 0) return edgeColoring;
36153615

@@ -3637,7 +3637,60 @@ const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(su2double* efficien
36373637

36383638
/*--- Color the edges. ---*/
36393639
constexpr bool balanceColors = true;
3640-
edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
3640+
3641+
/*--- If requested, find an efficient coloring with maximum color group size (up to edgeColorGroupSize). ---*/
3642+
if (maximizeEdgeColorGroupSize) {
3643+
auto upperEdgeColorGroupSize = edgeColorGroupSize + 1; /* upper bound that is deemed too large */
3644+
auto nextEdgeColorGroupSize = edgeColorGroupSize; /* next value that we are going to try */
3645+
auto lowerEdgeColorGroupSize = 1ul; /* lower bound that is known to work */
3646+
3647+
bool admissibleColoring = false; /* keep track of whether the last tested coloring is admissible */
3648+
3649+
while (true) {
3650+
edgeColoring = colorSparsePattern(pattern, nextEdgeColorGroupSize, balanceColors);
3651+
3652+
/*--- If the coloring fails, reduce the color group size. ---*/
3653+
if (edgeColoring.empty()) {
3654+
upperEdgeColorGroupSize = nextEdgeColorGroupSize;
3655+
admissibleColoring = false;
3656+
}
3657+
/*--- If the coloring succeeds, check the efficiency. ---*/
3658+
else {
3659+
const su2double currentEfficiency =
3660+
coloringEfficiency(edgeColoring, omp_get_max_threads(), nextEdgeColorGroupSize);
3661+
3662+
/*--- If the coloring is not efficient, reduce the color group size. ---*/
3663+
if (currentEfficiency < COLORING_EFF_THRESH) {
3664+
upperEdgeColorGroupSize = nextEdgeColorGroupSize;
3665+
admissibleColoring = false;
3666+
}
3667+
/*--- Otherwise, enlarge the color group size. ---*/
3668+
else {
3669+
lowerEdgeColorGroupSize = nextEdgeColorGroupSize;
3670+
admissibleColoring = true;
3671+
}
3672+
}
3673+
3674+
const auto increment = (upperEdgeColorGroupSize - lowerEdgeColorGroupSize) / 2;
3675+
nextEdgeColorGroupSize = lowerEdgeColorGroupSize + increment;
3676+
3677+
/*--- Terminating condition. ---*/
3678+
if (increment == 0) {
3679+
break;
3680+
}
3681+
}
3682+
3683+
edgeColorGroupSize = nextEdgeColorGroupSize;
3684+
3685+
/*--- If the last tested coloring was not admissible, recompute the final coloring. ---*/
3686+
if (!admissibleColoring) {
3687+
edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
3688+
}
3689+
}
3690+
/*--- No adaptivity. ---*/
3691+
else {
3692+
edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize, balanceColors);
3693+
}
36413694

36423695
/*--- If the coloring fails use the natural coloring. This is a
36433696
* "soft" failure as this "bad" coloring should be detected

SU2_CFD/include/solvers/CFVMFlowSolverBase.inl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,16 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
288288
* sum the fluxes for each cell and set the diagonal of the system matrix. ---*/
289289

290290
su2double parallelEff = 1.0;
291+
292+
#ifdef CODI_REVERSE_TYPE
293+
/*--- For the discrete adjoint, the reducer strategy is costly. Prefer coloring, possibly with reduced edge color
294+
* group size. Find the maximum edge color group size that yields an efficient coloring. Also, allow larger numbers
295+
* of colors. ---*/
296+
const bool relax = config.GetEdgeColoringRelaxDiscAdj();
297+
const auto& coloring = geometry.GetEdgeColoring(&parallelEff, relax);
298+
#else
291299
const auto& coloring = geometry.GetEdgeColoring(&parallelEff);
300+
#endif
292301

293302
/*--- The decision to use the strategy is local to each rank. ---*/
294303
ReducerStrategy = parallelEff < COLORING_EFF_THRESH;
@@ -324,6 +333,29 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
324333
<< "\n The memory usage of the discrete adjoint solver is higher when using the fallback."
325334
#endif
326335
<< endl;
336+
} else {
337+
if (SU2_MPI::GetRank() == MASTER_NODE) {
338+
cout << "All ranks use edge coloring." << endl;
339+
}
340+
}
341+
342+
const su2double coloredParallelEff = ReducerStrategy ? 1.0 : parallelEff;
343+
su2double minColoredParallelEff = 1.0;
344+
SU2_MPI::Reduce(&coloredParallelEff, &minColoredParallelEff, 1, MPI_DOUBLE, MPI_MIN, MASTER_NODE, SU2_MPI::GetComm());
345+
346+
const unsigned long coloredNumColors = ReducerStrategy ? 0 : coloring.getOuterSize();
347+
unsigned long maxColoredNumColors = 0;
348+
SU2_MPI::Reduce(&coloredNumColors, &maxColoredNumColors, 1, MPI_UNSIGNED_LONG, MPI_MAX, MASTER_NODE, SU2_MPI::GetComm());
349+
350+
const unsigned long coloredEdgeColorGroupSize = ReducerStrategy ? 1 << 30 : geometry.GetEdgeColorGroupSize();
351+
unsigned long minColoredEdgeColorGroupSize = 1 << 30;
352+
SU2_MPI::Reduce(&coloredEdgeColorGroupSize, &minColoredEdgeColorGroupSize, 1, MPI_UNSIGNED_LONG, MPI_MIN, MASTER_NODE, SU2_MPI::GetComm());
353+
354+
if (SU2_MPI::GetRank() == MASTER_NODE && numRanksUsingReducer != SU2_MPI::GetSize()) {
355+
cout << "Among the ranks that use edge coloring,\n"
356+
<< " the minimum efficiency is " << minColoredParallelEff << ",\n"
357+
<< " the maximum number of colors is " << maxColoredNumColors << ",\n"
358+
<< " the minimum edge color group size is " << minColoredEdgeColorGroupSize << "." << endl;
327359
}
328360
}
329361

SU2_CFD/include/solvers/CScalarSolver.inl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,15 @@ CScalarSolver<VariableType>::CScalarSolver(CGeometry* geometry, CConfig* config,
4646
#ifdef HAVE_OMP
4747
/*--- Get the edge coloring, see notes in CEulerSolver's constructor. ---*/
4848
su2double parallelEff = 1.0;
49+
#ifdef CODI_REVERSE_TYPE
50+
/*--- For the discrete adjoint, the reducer strategy is costly. Prefer coloring, possibly with reduced edge color
51+
* group size. Find the maximum edge color group size that yields an efficient coloring. Also, allow larger numbers
52+
* of colors. ---*/
53+
const bool relax = config->GetEdgeColoringRelaxDiscAdj();
54+
const auto& coloring = geometry->GetEdgeColoring(&parallelEff, relax);
55+
#else
4956
const auto& coloring = geometry->GetEdgeColoring(&parallelEff);
57+
#endif
5058

5159
ReducerStrategy = parallelEff < COLORING_EFF_THRESH;
5260

TestCases/hybrid_regression_AD.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def main():
242242
pywrapper_FEA_AD_FlowLoad.test_vals_aarch64 = [-0.131745, -0.553214, -0.000364, -0.003101]
243243
pywrapper_FEA_AD_FlowLoad.command = TestCase.Command(exec = "python", param = "run_adjoint.py --parallel -f")
244244
pywrapper_FEA_AD_FlowLoad.timeout = 1600
245-
pywrapper_FEA_AD_FlowLoad.tol = 5e-3
245+
pywrapper_FEA_AD_FlowLoad.tol = 1e-2
246246
pywrapper_FEA_AD_FlowLoad.new_output = False
247247
pywrapper_FEA_AD_FlowLoad.enabled_with_tsan = False
248248
test_list.append(pywrapper_FEA_AD_FlowLoad)
@@ -257,7 +257,7 @@ def main():
257257
pywrapper_CFD_AD_MeshDisp.test_vals_aarch64 = [30.000000, -2.516536, 1.386443, 0.000000]
258258
pywrapper_CFD_AD_MeshDisp.command = TestCase.Command(exec = "python", param = "run_adjoint.py --parallel -f")
259259
pywrapper_CFD_AD_MeshDisp.timeout = 1600
260-
pywrapper_CFD_AD_MeshDisp.tol = 1e-3
260+
pywrapper_CFD_AD_MeshDisp.tol = 1e-2
261261
pywrapper_CFD_AD_MeshDisp.new_output = False
262262
pywrapper_CFD_AD_MeshDisp.enabled_with_tsan = False
263263
test_list.append(pywrapper_CFD_AD_MeshDisp)

config_template.cfg

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2140,6 +2140,14 @@ UQ_DELTA_B= 1.0
21402140
% The optimum value/strategy is case-dependent.
21412141
EDGE_COLORING_GROUP_SIZE= 512
21422142
%
2143+
% Coloring tends to perform better for the discrete adjoint than reductions because
2144+
% it uses less memory and enables the shared reading optimization for color loops.
2145+
% This option allows an automatic fallback to smaller edge color group sizes on ranks
2146+
% where the requested edge color group size is not efficient. Specifically, the largest
2147+
% edge color group size up to EDGE_COLORING_GROUP_SIZE is chosen that is at least
2148+
% 0.875 efficient. Also, this option allows using more colors, up to 255 instead of up to 64.
2149+
EDGE_COLORING_RELAX_DISC_ADJ= YES
2150+
%
21432151
% Independent "threads per MPI rank" setting for LU-SGS and ILU preconditioners.
21442152
% For problems where time is spent mostly in the solution of linear systems (e.g. elasticity,
21452153
% very high CFL central schemes), AND, if the memory bandwidth of the machine is saturated

0 commit comments

Comments
 (0)