@@ -53,6 +53,8 @@ class CQuasiNewtonDriver {
5353 static_assert (std::is_floating_point<Scalar>::value," " );
5454
5555private:
56+ using MPI_Wrapper = typename SelectMPIWrapper<Scalar>::W;
57+
5658 enum : size_t {BLOCK_SIZE = 1024 }; /* !< \brief Loop tiling parameter. */
5759 std::vector<su2matrix<Scalar> > X, R; /* !< \brief Input and residual history of the FP. */
5860 su2matrix<Scalar> corr; /* !< \brief Correction to the natural FP result. */
@@ -70,13 +72,11 @@ class CQuasiNewtonDriver {
7072 }
7173
7274 void computeNormalEquations () {
73- /* --- Lower triangular packed storage. ---*/
74- mat.resize (iSample*(iSample+1 )/2 ) = Scalar (0 );
75- rhs.resize (iSample) = Scalar (0 );
76-
7775 /* --- Size for the dot products. ---*/
7876 const auto end = std::min<Index>(nPtDomain,corr.rows ())*corr.cols ();
7977
78+ mat = Scalar (0 ); rhs = Scalar (0 );
79+
8080 /* --- Tiled part of the loop. ---*/
8181 Index begin = 0 ;
8282 while (end-begin >= BLOCK_SIZE) {
@@ -94,13 +94,13 @@ class CQuasiNewtonDriver {
9494 const auto type = (sizeof (Scalar) < sizeof (double ))? MPI_FLOAT : MPI_DOUBLE;
9595
9696 su2vector<Scalar> tmp (mat.size ());
97- SelectMPIWrapper<Scalar>:: W:: Allreduce (mat.data (), tmp.data (), mat. size () ,
98- type, MPI_SUM, MPI_COMM_WORLD);
97+ MPI_Wrapper:: Allreduce (mat.data (), tmp.data (), iSample*(iSample+ 1 )/ 2 ,
98+ type, MPI_SUM, MPI_COMM_WORLD);
9999 mat = std::move (tmp);
100100
101- sol = rhs;
102- SelectMPIWrapper<Scalar>:: W::Allreduce (sol. data (), rhs. data (), rhs. size (),
103- type, MPI_SUM, MPI_COMM_WORLD );
101+ MPI_Wrapper::Allreduce (rhs. data (), sol. data (), iSample,
102+ type, MPI_SUM, MPI_COMM_WORLD);
103+ std::swap (rhs, sol );
104104 }
105105 }
106106
@@ -115,9 +115,11 @@ class CQuasiNewtonDriver {
115115 for (Index i = 0 ; i < iSample; ++i) {
116116 const auto ri1 = R[i+1 ].data () + start;
117117 const auto ri0 = R[i].data () + start;
118- for (Index j = 0 ; j <= i; ++j) {
118+ /* --- Off-diagonal coefficients. ---*/
119+ for (Index j = 0 ; j < i; ++j) {
119120 const auto rj1 = R[j+1 ].data () + start;
120121 const auto rj0 = R[j].data () + start;
122+ /* --- Sum of partial sums to reduce trunc. error. ---*/
121123 Scalar sum = 0 ;
122124 SU2_OMP_SIMD
123125 for (Index k = 0 ; k < blkSize; ++k) {
@@ -126,13 +128,16 @@ class CQuasiNewtonDriver {
126128 const auto iCoeff = i*(i+1 )/2 + j;
127129 mat (iCoeff) += sum;
128130 }
131+ /* --- Diagonal coeff and residual fused. ---*/
132+ Scalar diag = 0 , res = 0 ;
129133 const auto r = R[iSample].data () + start;
130- Scalar sum = 0 ;
131134 SU2_OMP_SIMD
132135 for (Index k = 0 ; k < blkSize; ++k) {
133- sum += (ri1[k]-ri0[k]) * r[k];
136+ diag += pow (ri1[k]-ri0[k], 2 );
137+ res += (ri1[k]-ri0[k]) * r[k];
134138 }
135- vec (i) -= sum;
139+ mat (i*(i+3 )/2 ) += diag;
140+ vec (i) -= res;
136141 }
137142 }
138143
@@ -165,6 +170,10 @@ class CQuasiNewtonDriver {
165170 R.emplace_back (npt,nvar);
166171 }
167172 X[0 ] = Scalar (0 );
173+ /* --- Lower triangular packed storage. ---*/
174+ mat.resize (nsample*(nsample-1 )/2 );
175+ rhs.resize (nsample-1 );
176+ sol.resize (nsample-1 );
168177 }
169178
170179 /* ! \brief Size of the object, the number of samples. */
@@ -211,11 +220,10 @@ class CQuasiNewtonDriver {
211220 for (Index j = 0 ; j <= i; ++j)
212221 pseudoInv (i,j) = mat (k++);
213222 pseudoInv.Invert (true );
214- sol.resize (iSample);
215223 pseudoInv.MatVecMult (rhs.data (), sol.data ());
216224
217225 /* --- Compute correction, cleared before for less trunc. error. ---*/
218- for (Index k = 0 ; k < sol. size () ; ++k) {
226+ for (Index k = 0 ; k < iSample ; ++k) {
219227 const auto x1 = X[k+1 ].data ();
220228 const auto r1 = R[k+1 ].data ();
221229 const auto x0 = X[k].data ();
0 commit comments