Skip to content

Commit 01b2a22

Browse files
committed
fixing parSum
1 parent 1b8cdc5 commit 01b2a22

9 files changed

Lines changed: 93 additions & 116 deletions

File tree

include/evolution.h

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,6 @@
4646
* arg2 = String data for name of function called. Prints value to stdout.
4747
*/
4848

49-
// UPDATE LIST LATER
50-
/**
51-
* @brief performs real or imaginary time evolution
52-
* @ingroup data
53-
* @param result Function result code of CUDA operation
54-
* @param c Descriptor of CUDA operation
55-
* @return 0 for success. See CUDA failure codes in cuda.h for other values.
56-
*/
57-
void evolve_2d(Grid &par,
58-
cufftDoubleComplex *gpuParSum, int numSteps,
59-
unsigned int gstate,
60-
std::string buffer);
61-
6249
// UPDATE LIST LATER
6350
/**
6451
* @brief performs real or imaginary time evolution
@@ -68,7 +55,7 @@ void evolve_2d(Grid &par,
6855
* @return 0 for success. See CUDA failure codes in cuda.h for other values
6956
*/
7057
void evolve(Grid &par,
71-
cufftDoubleComplex *gpuParSum, int numSteps,
58+
int numSteps,
7259
unsigned int gstate,
7360
std::string buffer);
7461

include/kernels.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ __global__ void scalarDiv2D(double2*, double2*);
211211
* @param dr Smallest area element of grid (dx*dy)
212212
* @param pSum GPU array used to store intermediate results during parallel summation
213213
*/
214-
__global__ void scalarDiv_wfcNorm(double2* in, double dr, double2* pSum, double2* out);
214+
__global__ void scalarDiv_wfcNorm(double2* in, double dr, double* pSum, double2* out);
215215

216216
//##############################################################################
217217

@@ -238,7 +238,7 @@ __global__ void thread_test(double* input, double* output);
238238
* @param pass Number of passes performed by routine
239239
*/
240240
__global__ void multipass(double2* input, double2* output, int pass);
241-
__global__ void multipass(double* input, double* output, int pass);
241+
__global__ void multipass(double* input, double* output);
242242

243243
//##############################################################################
244244

include/split_op.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ int isError(int result, char* c); //Checks to see if an error has occurred.
6363
* @param threads Number of CUDA threads for operation
6464
* @return 0 for success. See CUDA failure codes in cuda.h for other values.
6565
*/
66-
void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par);
67-
void parSum(double* gpuWfc, double* gpuParSum, Grid &par);
66+
void parSum(double2* gpuWfc, Grid &par);
67+
void parSum(double* gpuWfc, double *gpuParSum, Grid &par);
6868

6969
/**
7070
* @brief Creates the optical lattice to match the vortex lattice constant

src/evolution.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
#include "../include/vortex_3d.h"
33

44
void evolve(Grid &par,
5-
double2* gpuParSum, int numSteps,
6-
unsigned int gstate,
7-
std::string buffer){
5+
int numSteps,
6+
unsigned int gstate,
7+
std::string buffer){
88

99
// Re-establishing variables from parsed Grid class
1010
std::string data_dir = par.sval("data_dir");
@@ -875,7 +875,7 @@ void evolve(Grid &par,
875875
}
876876

877877
if(gstate==0){
878-
parSum(gpuWfc, gpuParSum, par);
878+
parSum(gpuWfc, par);
879879
}
880880
}
881881

src/init.cu

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@ int init(Grid &par){
4444
cufftDoubleComplex *EV_opt;
4545
cufftDoubleComplex *wfc_backup;
4646
cufftDoubleComplex *EappliedField;
47-
double2 *par_sum;
48-
cudaMalloc((void**) &par_sum, sizeof(double2)*gSize);
4947

5048
std::cout << "gSize is: " << gSize << '\n';
5149
cufftResult result;
@@ -318,7 +316,6 @@ int init(Grid &par){
318316
par.store("V_opt", V_opt);
319317
par.store("wfc_backup", wfc_backup);
320318
par.store("EappliedField", EappliedField);
321-
par.store("par_sum", par_sum);
322319

323320
par.store("result", result);
324321
par.store("plan_1d", plan_1d);
@@ -464,24 +461,26 @@ void set_variables(Grid &par, bool ev_type){
464461
cufftDoubleComplex *EpAx = par.cufftDoubleComplexval("EpAx");
465462
cufftDoubleComplex *EpAy = nullptr;
466463
cufftDoubleComplex *EpAz = nullptr;
467-
if(!par.bval("Ax_time")){
468-
err=cudaMemcpy(pAx_gpu, EpAx, sizeof(cufftDoubleComplex)*gsize,
464+
if (!par.bval("K_time")){
465+
err=cudaMemcpy(K_gpu, EK, sizeof(cufftDoubleComplex)*gsize,
469466
cudaMemcpyHostToDevice);
470467
if(err!=cudaSuccess){
471-
std::cout << "ERROR: Could not copy pAx_gpu to device" << '\n';
468+
std::cout << "ERROR: Could not copy K_gpu to device" << '\n';
472469
exit(1);
473470
}
474-
par.store("pAx_gpu", pAx_gpu);
471+
par.store("K_gpu", K_gpu);
475472
}
476-
if (!par.bval("K_time")){
477-
err=cudaMemcpy(K_gpu, EK, sizeof(cufftDoubleComplex)*gsize,
473+
if(!par.bval("Ax_time")){
474+
err=cudaMemcpy(pAx_gpu, EpAx, sizeof(cufftDoubleComplex)*gsize,
478475
cudaMemcpyHostToDevice);
479476
if(err!=cudaSuccess){
480-
std::cout << "ERROR: Could not copy K_gpu to device" << '\n';
477+
std::cout << "ERROR: Could not copy pAx_gpu to device" << '\n';
478+
std::cout << err << '\n';
481479
exit(1);
482480
}
483-
par.store("K_gpu", K_gpu);
481+
par.store("pAx_gpu", pAx_gpu);
484482
}
483+
485484
if (!par.bval("V_time")){
486485
err=cudaMemcpy(V_gpu, EV, sizeof(cufftDoubleComplex)*gsize,
487486
cudaMemcpyHostToDevice);
@@ -574,7 +573,6 @@ int main(int argc, char **argv){
574573

575574
init(par);
576575

577-
cufftDoubleComplex *par_sum = par.cufftDoubleComplexval("par_sum");
578576
int gsteps = par.ival("gsteps");
579577
int esteps = par.ival("esteps");
580578
std::string data_dir = par.sval("data_dir");
@@ -588,13 +586,13 @@ int main(int argc, char **argv){
588586
std::cout << "Imaginary-time evolution started..." << '\n';
589587
set_variables(par, 0);
590588

591-
evolve(par, par_sum, gsteps, 0, buffer);
589+
evolve(par, gsteps, 0, buffer);
592590
}
593591

594592
if(esteps > 0){
595593
std::cout << "real-time evolution started..." << '\n';
596594
set_variables(par, 1);
597-
evolve(par, par_sum, esteps, 1, buffer);
595+
evolve(par, esteps, 1, buffer);
598596
}
599597

600598
std::cout << "done evolving" << '\n';

src/kernels.cu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -355,10 +355,10 @@ __global__ void scalarMult(double2* in, double factor, double2* out){
355355
/**
356356
* As above, but normalises for wfc
357357
*/
358-
__global__ void scalarDiv_wfcNorm(double2* in, double dr, double2* pSum, double2* out){
358+
__global__ void scalarDiv_wfcNorm(double2* in, double dr, double* pSum, double2* out){
359359
unsigned int gid = getGid3d3d();
360360
double2 result;
361-
double norm = sqrt((pSum[0].x + pSum[0].y)*dr);
361+
double norm = sqrt((pSum[0])*dr);
362362
result.x = (in[gid].x/norm);
363363
result.y = (in[gid].y/norm);
364364
out[gid] = result;
@@ -439,7 +439,7 @@ __global__ void multipass(double2* input, double2* output, int pass){
439439
/**
440440
* Routine for parallel summation. Can be looped over from host.
441441
*/
442-
__global__ void multipass(double* input, double* output, int pass){
442+
__global__ void multipass(double* input, double* output){
443443
unsigned int tid = threadIdx.x + threadIdx.y*blockDim.x
444444
+ threadIdx.z * blockDim.x * blockDim.y;
445445
unsigned int bid = blockIdx.x + blockIdx.y * gridDim.x
@@ -453,6 +453,7 @@ __global__ void multipass(double* input, double* output, int pass){
453453
extern __shared__ double sdatad[];
454454
sdatad[tid] = input[gid];
455455
__syncthreads();
456+
456457
for(int i = blockDim.x>>1; i > 0; i>>=1){
457458
if(tid < i){
458459
sdatad[tid] += sdatad[tid + i];

src/split_op.cu

Lines changed: 22 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -79,30 +79,26 @@ void parSum(double* gpuWfc, double* gpuParSum, Grid &par){
7979
dim3 thread_tmp = threads;
8080
int pass = 0;
8181

82+
set_eq<<<par.grid, par.threads>>>(gpuWfc, gpuParSum);
83+
8284
dim3 grid = par.grid;
8385
while((double)grid_tmp.x/threads.x > 1.0){
84-
if(pass == 0){
85-
multipass<<<block,threads,threads.x*sizeof(double2)>>>(&gpuWfc[0],
86-
&gpuParSum[0],pass);
87-
}
88-
else{
89-
multipass<<<block,thread_tmp,thread_tmp.x*sizeof(double2)>>>(
90-
&gpuParSum[0],&gpuParSum[0],pass);
91-
}
86+
multipass<<<block,thread_tmp,thread_tmp.x*sizeof(double)>>>(
87+
&gpuParSum[0],&gpuParSum[0]);
9288
grid_tmp.x /= threads.x;
9389
block = (int) ceil((double)grid_tmp.x/threads.x);
9490
pass++;
9591
//std::cout << grid_tmp.x << '\n';
9692
}
9793
thread_tmp = grid_tmp.x;
9894
multipass<<<1,thread_tmp,thread_tmp.x*sizeof(double2)>>>(&gpuParSum[0],
99-
&gpuParSum[0], pass);
95+
&gpuParSum[0]);
10096
}
10197

10298
/*
10399
* Used to perform parallel summation on WFC for normalisation.
104100
*/
105-
void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
101+
void parSum(double2* gpuWfc, Grid &par){
106102
// May need to add double l
107103
int dimnum = par.ival("dimnum");
108104
double dx = par.dval("dx");
@@ -131,8 +127,8 @@ void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
131127
dim3 thread_tmp = threads;
132128
int pass = 0;
133129

134-
cufftDoubleComplex *density;
135-
cudaMalloc((void**) &density, sizeof(double2)*gsize);
130+
double *density;
131+
cudaMalloc((void**) &density, sizeof(double)*gsize);
136132

137133
complexMagnitudeSquared<<<par.grid, par.threads>>>(gpuWfc, density);
138134

@@ -144,32 +140,26 @@ void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
144140
*/
145141
dim3 grid = par.grid;
146142
while((double)grid_tmp.x/threads.x > 1.0){
147-
if(pass == 0){
148-
multipass<<<block,threads,threads.x*sizeof(double2)>>>(&density[0],
149-
&gpuParSum[0],pass);
150-
}
151-
else{
152-
multipass<<<block,thread_tmp,thread_tmp.x*sizeof(double2)>>>(
153-
&gpuParSum[0],&gpuParSum[0],pass);
154-
}
143+
multipass<<<block,threads,threads.x*sizeof(double)>>>(&density[0],
144+
&density[0]);
155145
grid_tmp.x /= threads.x;
156146
block = (int) ceil((double)grid_tmp.x/threads.x);
157147
pass++;
158-
//std::cout << grid_tmp.x << '\n';
148+
//std::cout << pass << '\t' << grid_tmp.x << '\n';
159149
}
160150
thread_tmp = grid_tmp.x;
161-
multipass<<<1,thread_tmp,thread_tmp.x*sizeof(double2)>>>(&gpuParSum[0],
162-
&gpuParSum[0], pass);
151+
multipass<<<1,thread_tmp,thread_tmp.x*sizeof(double)>>>(&density[0],
152+
&density[0]);
163153

164-
// Writing out in the parSum Function (not recommended, for debugging)
165154
/*
166-
double2 *sum;
167-
sum = (cufftDoubleComplex *) malloc(sizeof(cufftDoubleComplex)*gsize / threads.x);
168-
cudaMemcpy(sum,gpuParSum,sizeof(cufftDoubleComplex)*gsize/threads.x,
155+
// Writing out in the parSum Function (not recommended, for debugging)
156+
double *sum;
157+
sum = (double *) malloc(sizeof(double)*gsize);
158+
cudaMemcpy(sum,density,sizeof(double)*gsize,
169159
cudaMemcpyDeviceToHost);
170-
std::cout << sqrt((sum[0].x + sum[0].y)*dg) << '\n';
160+
std::cout << (sum[0]) << '\n';
171161
*/
172-
scalarDiv_wfcNorm<<<grid,threads>>>(gpuWfc, dg, gpuParSum, gpuWfc);
162+
scalarDiv_wfcNorm<<<par.grid,par.threads>>>(gpuWfc, dg, density, gpuWfc);
173163

174164
cudaFree(density);
175165
}
@@ -304,7 +294,6 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
304294

305295
cudaMalloc((void**) &energy_gpu, sizeof(double2)*gSize);
306296
cudaMalloc((void**) &tmp_wfc, sizeof(double2)*gSize);
307-
cudaMalloc((void**) &op, sizeof(double2)*gSize);
308297

309298

310299
for (int i=0; i < gSize; ++i){
@@ -327,8 +316,10 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
327316
energyCalc<<<grid,threads>>>(tmp_wfc, op, dt, energy_gpu, gState,op_space,
328317
0.5*sqrt(omegaZ/mass), gDenConst);
329318
result = cufftExecZ2Z( plan, energy_gpu, energy_gpu, CUFFT_INVERSE );
319+
result = cufftExecZ2Z( plan, tmp_wfc, tmp_wfc, CUFFT_INVERSE );
330320

331321
scalarMult<<<grid,threads>>>(energy_gpu, renorm_factor, energy_gpu);
322+
scalarMult<<<grid,threads>>>(tmp_wfc, renorm_factor, tmp_wfc);
332323

333324
if (corotating){
334325
op_space = 0;
@@ -339,7 +330,7 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
339330

340331
op = par.cufftDoubleComplexval("V_gpu");
341332

342-
energyCalc<<<grid,threads>>>(gpuWfc, op, dt, energy_gpu, gState,op_space,
333+
energyCalc<<<grid,threads>>>(tmp_wfc, op, dt, energy_gpu, gState,op_space,
343334
0.5*sqrt(omegaZ/mass), gDenConst);
344335

345336
err=cudaMemcpy(energy, energy_gpu,
@@ -356,7 +347,6 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
356347

357348
cudaFree(energy_gpu);
358349
cudaFree(tmp_wfc);
359-
cudaFree(op);
360350
free(energy);
361351
return out*dx*dy*dz;
362352

0 commit comments

Comments
 (0)