@@ -79,30 +79,26 @@ void parSum(double* gpuWfc, double* gpuParSum, Grid &par){
7979 dim3 thread_tmp = threads;
8080 int pass = 0 ;
8181
82+ set_eq<<<par.grid, par.threads>>> (gpuWfc, gpuParSum);
83+
8284 dim3 grid = par.grid ;
8385 while ((double )grid_tmp.x /threads.x > 1.0 ){
84- if (pass == 0 ){
85- multipass<<<block,threads,threads.x*sizeof (double2 )>>> (&gpuWfc[0 ],
86- &gpuParSum[0 ],pass);
87- }
88- else {
89- multipass<<<block,thread_tmp,thread_tmp.x*sizeof (double2 )>>> (
90- &gpuParSum[0 ],&gpuParSum[0 ],pass);
91- }
86+ multipass<<<block,thread_tmp,thread_tmp.x*sizeof (double )>>> (
87+ &gpuParSum[0 ],&gpuParSum[0 ]);
9288 grid_tmp.x /= threads.x ;
9389 block = (int ) ceil ((double )grid_tmp.x /threads.x );
9490 pass++;
9591 // std::cout << grid_tmp.x << '\n';
9692 }
9793 thread_tmp = grid_tmp.x ;
9894 multipass<<<1 ,thread_tmp,thread_tmp.x*sizeof (double2 )>>> (&gpuParSum[0 ],
99- &gpuParSum[0 ], pass );
95+ &gpuParSum[0 ]);
10096}
10197
10298/*
10399 * Used to perform parallel summation on WFC for normalisation.
104100 */
105- void parSum (double2 * gpuWfc, double2 * gpuParSum, Grid &par){
101+ void parSum (double2 * gpuWfc, Grid &par){
106102 // May need to add double l
107103 int dimnum = par.ival (" dimnum" );
108104 double dx = par.dval (" dx" );
@@ -131,8 +127,8 @@ void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
131127 dim3 thread_tmp = threads;
132128 int pass = 0 ;
133129
134- cufftDoubleComplex *density;
135- cudaMalloc ((void **) &density, sizeof (double2 )*gsize);
130+ double *density;
131+ cudaMalloc ((void **) &density, sizeof (double )*gsize);
136132
137133 complexMagnitudeSquared<<<par.grid, par.threads>>> (gpuWfc, density);
138134
@@ -144,32 +140,26 @@ void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
144140*/
145141 dim3 grid = par.grid ;
146142 while ((double )grid_tmp.x /threads.x > 1.0 ){
147- if (pass == 0 ){
148- multipass<<<block,threads,threads.x*sizeof (double2 )>>> (&density[0 ],
149- &gpuParSum[0 ],pass);
150- }
151- else {
152- multipass<<<block,thread_tmp,thread_tmp.x*sizeof (double2 )>>> (
153- &gpuParSum[0 ],&gpuParSum[0 ],pass);
154- }
143+ multipass<<<block,threads,threads.x*sizeof (double )>>> (&density[0 ],
144+ &density[0 ]);
155145 grid_tmp.x /= threads.x ;
156146 block = (int ) ceil ((double )grid_tmp.x /threads.x );
157147 pass++;
158- // std::cout << grid_tmp.x << '\n';
148+ // std::cout << pass << '\t' << grid_tmp.x << '\n';
159149 }
160150 thread_tmp = grid_tmp.x ;
161- multipass<<<1 ,thread_tmp,thread_tmp.x*sizeof (double2 )>>> (&gpuParSum [0 ],
162- &gpuParSum [0 ], pass );
151+ multipass<<<1 ,thread_tmp,thread_tmp.x*sizeof (double )>>> (&density [0 ],
152+ &density [0 ]);
163153
164- // Writing out in the parSum Function (not recommended, for debugging)
165154/*
166- double2 *sum;
167- sum = (cufftDoubleComplex *) malloc(sizeof(cufftDoubleComplex)*gsize / threads.x);
168- cudaMemcpy(sum,gpuParSum,sizeof(cufftDoubleComplex)*gsize/threads.x,
155+ // Writing out in the parSum Function (not recommended, for debugging)
156+ double *sum;
157+ sum = (double *) malloc(sizeof(double)*gsize);
158+ cudaMemcpy(sum,density,sizeof(double)*gsize,
169159 cudaMemcpyDeviceToHost);
170- std::cout << sqrt(( sum[0].x + sum[0].y)*dg ) << '\n';
160+ std::cout << ( sum[0]) << '\n';
171161*/
172- scalarDiv_wfcNorm<<<grid,threads>>> (gpuWfc, dg, gpuParSum , gpuWfc);
162+ scalarDiv_wfcNorm<<<par. grid,par. threads>>> (gpuWfc, dg, density , gpuWfc);
173163
174164 cudaFree (density);
175165}
@@ -304,7 +294,6 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
304294
305295 cudaMalloc ((void **) &energy_gpu, sizeof (double2 )*gSize );
306296 cudaMalloc ((void **) &tmp_wfc, sizeof (double2 )*gSize );
307- cudaMalloc ((void **) &op, sizeof (double2 )*gSize );
308297
309298
310299 for (int i=0 ; i < gSize ; ++i){
@@ -327,8 +316,10 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
327316 energyCalc<<<grid,threads>>> (tmp_wfc, op, dt, energy_gpu, gState ,op_space,
328317 0.5 *sqrt (omegaZ/mass), gDenConst );
329318 result = cufftExecZ2Z ( plan, energy_gpu, energy_gpu, CUFFT_INVERSE );
319+ result = cufftExecZ2Z ( plan, tmp_wfc, tmp_wfc, CUFFT_INVERSE );
330320
331321 scalarMult<<<grid,threads>>> (energy_gpu, renorm_factor, energy_gpu);
322+ scalarMult<<<grid,threads>>> (tmp_wfc, renorm_factor, tmp_wfc);
332323
333324 if (corotating){
334325 op_space = 0 ;
@@ -339,7 +330,7 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
339330
340331 op = par.cufftDoubleComplexval (" V_gpu" );
341332
342- energyCalc<<<grid,threads>>> (gpuWfc , op, dt, energy_gpu, gState ,op_space,
333+ energyCalc<<<grid,threads>>> (tmp_wfc , op, dt, energy_gpu, gState ,op_space,
343334 0.5 *sqrt (omegaZ/mass), gDenConst );
344335
345336 err=cudaMemcpy (energy, energy_gpu,
@@ -356,7 +347,6 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
356347
357348 cudaFree (energy_gpu);
358349 cudaFree (tmp_wfc);
359- cudaFree (op);
360350 free (energy);
361351 return out*dx*dy*dz;
362352
0 commit comments