Fix parSum: reduce the density in a local double buffer and drop the gpuParSum argument from evolve()

leios · leios · commit 01b2a227cc7c · 2018-07-31T17:31:02.000+09:00
diff --git a/include/evolution.h b/include/evolution.h
@@ -46,19 +46,6 @@
  * arg2 = String data for name of function called. Prints value to stdout.
  */
 
-// UPDATE LIST LATER
- /**
- * @brief	performs real or imaginary time evolution
- * @ingroup	data
- * @param	result Function result code of CUDA operation
- * @param	c Descriptor of CUDA operation
- * @return	0 for success. See CUDA failure codes in cuda.h for other values.
- */
-void evolve_2d(Grid &par,
-            cufftDoubleComplex *gpuParSum, int numSteps,
-            unsigned int gstate,
-            std::string buffer);
-
 // UPDATE LIST LATER
  /**
  * @brief       performs real or imaginary time evolution
@@ -68,7 +55,7 @@ void evolve_2d(Grid &par,
  * @return      0 for success. See CUDA failure codes in cuda.h for other values
  */
 void evolve(Grid &par,
-            cufftDoubleComplex *gpuParSum, int numSteps,
+            int numSteps,
             unsigned int gstate,
             std::string buffer);
 
diff --git a/include/kernels.h b/include/kernels.h
@@ -211,7 +211,7 @@ __global__ void scalarDiv2D(double2*, double2*);
 * @param	dr Smallest area element of grid (dx*dy)
 * @param	pSum GPU array used to store intermediate results during parallel summation
 */
-__global__ void scalarDiv_wfcNorm(double2* in, double dr, double2* pSum, double2* out);
+__global__ void scalarDiv_wfcNorm(double2* in, double dr, double* pSum, double2* out);
 
 //##############################################################################
 
@@ -238,7 +238,7 @@ __global__ void thread_test(double* input, double* output);
 * @param	pass Number of passes performed by routine
 */
 __global__ void multipass(double2* input, double2* output, int pass);
-__global__ void multipass(double* input, double* output, int pass);
+__global__ void multipass(double* input, double* output);
 
 //##############################################################################
 
diff --git a/include/split_op.h b/include/split_op.h
@@ -63,8 +63,8 @@ int isError(int result, char* c); //Checks to see if an error has occurred.
 * @param	threads Number of CUDA threads for operation
 * @return	0 for success. See CUDA failure codes in cuda.h for other values.
 */
-void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par);
-void parSum(double* gpuWfc, double* gpuParSum, Grid &par);
+void parSum(double2* gpuWfc, Grid &par);
+void parSum(double* gpuWfc, double *gpuParSum, Grid &par);
 
 /**
 * @brief	Creates the optical lattice to match the vortex lattice constant
diff --git a/src/evolution.cu b/src/evolution.cu
@@ -2,9 +2,9 @@
 #include "../include/vortex_3d.h"
 
 void evolve(Grid &par,
-               double2* gpuParSum, int numSteps,
-               unsigned int gstate,
-               std::string buffer){
+            int numSteps,
+            unsigned int gstate,
+            std::string buffer){
 
     // Re-establishing variables from parsed Grid class
     std::string data_dir = par.sval("data_dir");
@@ -875,7 +875,7 @@ void evolve(Grid &par,
         }
 
         if(gstate==0){
-            parSum(gpuWfc, gpuParSum, par);
+            parSum(gpuWfc, par);
         }
     }
 
diff --git a/src/init.cu b/src/init.cu
@@ -44,8 +44,6 @@ int init(Grid &par){
     cufftDoubleComplex *EV_opt;
     cufftDoubleComplex *wfc_backup;
     cufftDoubleComplex *EappliedField;
-    double2 *par_sum;
-    cudaMalloc((void**) &par_sum, sizeof(double2)*gSize);
 
     std::cout << "gSize is: " << gSize << '\n';
     cufftResult result;
@@ -318,7 +316,6 @@ int init(Grid &par){
     par.store("V_opt", V_opt);
     par.store("wfc_backup", wfc_backup);
     par.store("EappliedField", EappliedField);
-    par.store("par_sum", par_sum);
 
     par.store("result", result);
     par.store("plan_1d", plan_1d);
@@ -464,24 +461,26 @@ void set_variables(Grid &par, bool ev_type){
         cufftDoubleComplex *EpAx = par.cufftDoubleComplexval("EpAx");
         cufftDoubleComplex *EpAy = nullptr;
         cufftDoubleComplex *EpAz = nullptr;
-        if(!par.bval("Ax_time")){
-            err=cudaMemcpy(pAx_gpu, EpAx, sizeof(cufftDoubleComplex)*gsize,
+        if (!par.bval("K_time")){
+            err=cudaMemcpy(K_gpu, EK, sizeof(cufftDoubleComplex)*gsize,
                            cudaMemcpyHostToDevice);
             if(err!=cudaSuccess){
-                std::cout << "ERROR: Could not copy pAx_gpu to device" << '\n';
+                std::cout << "ERROR: Could not copy K_gpu to device" << '\n';
                 exit(1);
             }
-            par.store("pAx_gpu", pAx_gpu);
+            par.store("K_gpu", K_gpu);
         }
-        if (!par.bval("K_time")){
-            err=cudaMemcpy(K_gpu, EK, sizeof(cufftDoubleComplex)*gsize,
+        if(!par.bval("Ax_time")){
+            err=cudaMemcpy(pAx_gpu, EpAx, sizeof(cufftDoubleComplex)*gsize,
                            cudaMemcpyHostToDevice);
             if(err!=cudaSuccess){
-                std::cout << "ERROR: Could not copy K_gpu to device" << '\n';
+                std::cout << "ERROR: Could not copy pAx_gpu to device" << '\n';
+                std::cout << err << '\n';
                 exit(1);
             }
-            par.store("K_gpu", K_gpu);
+            par.store("pAx_gpu", pAx_gpu);
         }
+
         if (!par.bval("V_time")){
             err=cudaMemcpy(V_gpu, EV, sizeof(cufftDoubleComplex)*gsize,
                            cudaMemcpyHostToDevice);
@@ -574,7 +573,6 @@ int main(int argc, char **argv){
 
     init(par);
 
-    cufftDoubleComplex *par_sum = par.cufftDoubleComplexval("par_sum");
     int gsteps = par.ival("gsteps");
     int esteps = par.ival("esteps");
     std::string data_dir = par.sval("data_dir");
@@ -588,13 +586,13 @@ int main(int argc, char **argv){
         std::cout << "Imaginary-time evolution started..." << '\n';
         set_variables(par, 0);
 
-        evolve(par, par_sum, gsteps, 0, buffer);
+        evolve(par, gsteps, 0, buffer);
     }
 
     if(esteps > 0){
         std::cout << "real-time evolution started..." << '\n';
         set_variables(par, 1);
-        evolve(par, par_sum, esteps, 1, buffer);
+        evolve(par, esteps, 1, buffer);
     }
 
     std::cout << "done evolving" << '\n';
diff --git a/src/kernels.cu b/src/kernels.cu
@@ -355,10 +355,10 @@ __global__ void scalarMult(double2* in, double factor, double2* out){
 /**
  * As above, but normalises for wfc
  */
-__global__ void scalarDiv_wfcNorm(double2* in, double dr, double2* pSum, double2* out){
+__global__ void scalarDiv_wfcNorm(double2* in, double dr, double* pSum, double2* out){
     unsigned int gid = getGid3d3d();
     double2 result;
-    double norm = sqrt((pSum[0].x + pSum[0].y)*dr);
+    double norm = sqrt((pSum[0])*dr);
     result.x = (in[gid].x/norm);
     result.y = (in[gid].y/norm);
     out[gid] = result;
@@ -439,7 +439,7 @@ __global__ void multipass(double2* input, double2* output, int pass){
 /**
  * Routine for parallel summation. Can be looped over from host.
  */
-__global__ void multipass(double* input, double* output, int pass){
+__global__ void multipass(double* input, double* output){
     unsigned int tid = threadIdx.x + threadIdx.y*blockDim.x
                        + threadIdx.z * blockDim.x * blockDim.y;
     unsigned int bid = blockIdx.x + blockIdx.y * gridDim.x
@@ -453,6 +453,7 @@ __global__ void multipass(double* input, double* output, int pass){
     extern __shared__ double sdatad[];
     sdatad[tid] = input[gid];
     __syncthreads();
+
     for(int i = blockDim.x>>1; i > 0; i>>=1){
         if(tid < i){
             sdatad[tid] += sdatad[tid + i];
diff --git a/src/split_op.cu b/src/split_op.cu
@@ -79,30 +79,26 @@ void parSum(double* gpuWfc, double* gpuParSum, Grid &par){
     dim3 thread_tmp = threads;
     int pass = 0;
 
+    set_eq<<<par.grid, par.threads>>>(gpuWfc, gpuParSum);
+
     dim3 grid = par.grid;
     while((double)grid_tmp.x/threads.x > 1.0){
-        if(pass == 0){
-            multipass<<<block,threads,threads.x*sizeof(double2)>>>(&gpuWfc[0],
-                &gpuParSum[0],pass);
-        }
-        else{
-            multipass<<<block,thread_tmp,thread_tmp.x*sizeof(double2)>>>(
-                &gpuParSum[0],&gpuParSum[0],pass);
-        }
+        multipass<<<block,thread_tmp,thread_tmp.x*sizeof(double)>>>(
+            &gpuParSum[0],&gpuParSum[0]);
         grid_tmp.x /= threads.x;
         block = (int) ceil((double)grid_tmp.x/threads.x);
         pass++;
         //std::cout << grid_tmp.x << '\n';
     }
     thread_tmp = grid_tmp.x;
     multipass<<<1,thread_tmp,thread_tmp.x*sizeof(double2)>>>(&gpuParSum[0],
-                                                           &gpuParSum[0], pass);
+                                                           &gpuParSum[0]);
 }
 
 /*
  * Used to perform parallel summation on WFC for normalisation.
  */
-void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
+void parSum(double2* gpuWfc, Grid &par){
     // May need to add double l
     int dimnum = par.ival("dimnum");
     double dx = par.dval("dx");
@@ -131,8 +127,8 @@ void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
     dim3 thread_tmp = threads;
     int pass = 0;
 
-    cufftDoubleComplex *density;
-    cudaMalloc((void**) &density, sizeof(double2)*gsize);
+    double *density;
+    cudaMalloc((void**) &density, sizeof(double)*gsize);
 
     complexMagnitudeSquared<<<par.grid, par.threads>>>(gpuWfc, density);
 
@@ -144,32 +140,26 @@ void parSum(double2* gpuWfc, double2* gpuParSum, Grid &par){
 */
     dim3 grid = par.grid;
     while((double)grid_tmp.x/threads.x > 1.0){
-        if(pass == 0){
-            multipass<<<block,threads,threads.x*sizeof(double2)>>>(&density[0],
-                &gpuParSum[0],pass);
-        }
-        else{
-            multipass<<<block,thread_tmp,thread_tmp.x*sizeof(double2)>>>(
-                &gpuParSum[0],&gpuParSum[0],pass);
-        }
+        multipass<<<block,threads,threads.x*sizeof(double)>>>(&density[0],
+                                                              &density[0]);
         grid_tmp.x /= threads.x;
         block = (int) ceil((double)grid_tmp.x/threads.x);
         pass++;
-        //std::cout << grid_tmp.x << '\n';
+        //std::cout << pass << '\t' << grid_tmp.x << '\n';
     }
     thread_tmp = grid_tmp.x;
-    multipass<<<1,thread_tmp,thread_tmp.x*sizeof(double2)>>>(&gpuParSum[0],
-                                                           &gpuParSum[0], pass);
+    multipass<<<1,thread_tmp,thread_tmp.x*sizeof(double)>>>(&density[0],
+                                                            &density[0]);
 
-    // Writing out in the parSum Function (not recommended, for debugging)
 /*
-    double2 *sum;
-    sum = (cufftDoubleComplex *) malloc(sizeof(cufftDoubleComplex)*gsize / threads.x);
-    cudaMemcpy(sum,gpuParSum,sizeof(cufftDoubleComplex)*gsize/threads.x,
+    // Writing out in the parSum Function (not recommended, for debugging)
+    double *sum;
+    sum = (double *) malloc(sizeof(double)*gsize);
+    cudaMemcpy(sum,density,sizeof(double)*gsize,
                cudaMemcpyDeviceToHost);
-    std::cout << sqrt((sum[0].x + sum[0].y)*dg) << '\n';
+    std::cout << (sum[0]) << '\n';
 */
-    scalarDiv_wfcNorm<<<grid,threads>>>(gpuWfc, dg, gpuParSum, gpuWfc);
+    scalarDiv_wfcNorm<<<par.grid,par.threads>>>(gpuWfc, dg, density, gpuWfc);
 
     cudaFree(density);
 }
@@ -304,7 +294,6 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
 
     cudaMalloc((void**) &energy_gpu, sizeof(double2)*gSize);
     cudaMalloc((void**) &tmp_wfc, sizeof(double2)*gSize);
-    cudaMalloc((void**) &op, sizeof(double2)*gSize);
 
 
     for (int i=0; i < gSize; ++i){
@@ -327,8 +316,10 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
     energyCalc<<<grid,threads>>>(tmp_wfc, op, dt, energy_gpu, gState,op_space,
                                  0.5*sqrt(omegaZ/mass), gDenConst);
     result = cufftExecZ2Z( plan, energy_gpu, energy_gpu, CUFFT_INVERSE );
+    result = cufftExecZ2Z( plan, tmp_wfc, tmp_wfc, CUFFT_INVERSE );
 
     scalarMult<<<grid,threads>>>(energy_gpu, renorm_factor, energy_gpu);
+    scalarMult<<<grid,threads>>>(tmp_wfc, renorm_factor, tmp_wfc);
 
     if (corotating){
         op_space = 0;
@@ -339,7 +330,7 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
 
     op = par.cufftDoubleComplexval("V_gpu");
 
-    energyCalc<<<grid,threads>>>(gpuWfc, op, dt, energy_gpu, gState,op_space,
+    energyCalc<<<grid,threads>>>(tmp_wfc, op, dt, energy_gpu, gState,op_space,
                                  0.5*sqrt(omegaZ/mass), gDenConst);
 
     err=cudaMemcpy(energy, energy_gpu, 
@@ -356,7 +347,6 @@ double energy_angmom(double2 *gpuWfc, int gState, Grid &par){
 
     cudaFree(energy_gpu);
     cudaFree(tmp_wfc);
-    cudaFree(op);
     free(energy);
     return out*dx*dy*dz;
 
diff --git a/src/unit_test.cu b/src/unit_test.cu
diff --git a/src/vortex_3d.cu b/src/vortex_3d.cu

Original file line number	Diff line number	Diff line change
`@@ -2,9 +2,9 @@`
`2`	`2`	`#include "../include/vortex_3d.h"`
`3`	`3`
`4`	`4`	`void evolve(Grid &par,`
`5`		`- double2* gpuParSum, int numSteps,`
`6`		`- unsigned int gstate,`
`7`		`- std::string buffer){`
	`5`	`+ int numSteps,`
	`6`	`+ unsigned int gstate,`
	`7`	`+ std::string buffer){`
`8`	`8`
`9`	`9`	`// Re-establishing variables from parsed Grid class`
`10`	`10`	`std::string data_dir = par.sval("data_dir");`
`@@ -875,7 +875,7 @@ void evolve(Grid &par,`
`875`	`875`	`}`
`876`	`876`
`877`	`877`	`if(gstate==0){`
`878`		`- parSum(gpuWfc, gpuParSum, par);`
	`878`	`+ parSum(gpuWfc, par);`
`879`	`879`	`}`
`880`	`880`	`}`
`881`	`881`