From 4f9c2b53e426a681af570b95dac8e68aa327b12a Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Mon, 20 Jun 2016 13:15:19 -0400
Subject: [PATCH] Refactor MPI only and MPI+GPU code.

---
 puremd_rc_1003/PG-PuReMD/src/neighbors.c    |   2 +-
 puremd_rc_1003/PG-PuReMD/src/parallelreax.c | 142 +++++++++++++-------
 puremd_rc_1003/PG-PuReMD/src/tool_box.c     |   2 +-
 3 files changed, 93 insertions(+), 53 deletions(-)

diff --git a/puremd_rc_1003/PG-PuReMD/src/neighbors.c b/puremd_rc_1003/PG-PuReMD/src/neighbors.c
index df1c998e..25a4fd65 100644
--- a/puremd_rc_1003/PG-PuReMD/src/neighbors.c
+++ b/puremd_rc_1003/PG-PuReMD/src/neighbors.c
@@ -240,7 +240,7 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
       }
 	}
 
-	fprintf (stderr, "Total numner of host neighbors: %d \n", num_far);
+	fprintf (stderr, "Total number of host neighbors: %d \n", num_far);
   
 #if defined(DEBUG_FOCUS)
   fprintf( stderr, "p%d: estimate nbrs done - num_far=%d\n", 
diff --git a/puremd_rc_1003/PG-PuReMD/src/parallelreax.c b/puremd_rc_1003/PG-PuReMD/src/parallelreax.c
index e7c700b2..9691d40e 100644
--- a/puremd_rc_1003/PG-PuReMD/src/parallelreax.c
+++ b/puremd_rc_1003/PG-PuReMD/src/parallelreax.c
@@ -177,8 +177,9 @@ int main( int argc, char* argv[] )
   real t_start = 0, t_elapsed;
   real t_begin, t_end;
 
-  /* Remove this debug information later */
 #ifdef HAVE_CUDA
+
+  /* Remove this debug information later */
 #if defined(__CUDA_DEBUG_LOG__)
   fprintf (stderr, " Size of LR Lookup table %d \n", sizeof (LR_lookup_table) );
 #endif
@@ -197,7 +198,6 @@ int main( int argc, char* argv[] )
   fprintf (stderr, " matvec threads per atom: %d \n",  MATVEC_KER_THREADS_PER_ROW);
 
   fprintf (stderr, " General block size: %d \n",  DEF_BLOCK_SIZE);
-#endif
 #endif
 
   /* allocated main datastructures */
@@ -221,7 +221,6 @@ int main( int argc, char* argv[] )
     smalloc( sizeof(output_controls), "out_control" );
   mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" );
 
-#ifdef HAVE_CUDA
   /* allocate the cuda auxiliary data structures */
   dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" );
 
@@ -230,7 +229,6 @@ int main( int argc, char* argv[] )
     dev_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
     dev_lists[i]->allocated = 0;
   }
-#endif
 
   /* Initialize member variables */
   system->init_thblist = FALSE;
@@ -243,7 +241,6 @@ int main( int argc, char* argv[] )
   system->wsize = control->nprocs;
   system->global_offset = (int *)scalloc(system->wsize+1,sizeof(int),"global_offset");
 
-#ifdef HAVE_CUDA
   /* setup the CUDA Device for this process can be on the same machine
 	* or on a different machine, for now use the rank to compute the device
 	* This will only work on a single machine with 2 GPUs*/
@@ -252,7 +249,6 @@ int main( int argc, char* argv[] )
   //Cleanup_Cuda_Environment ();
   print_device_mem_usage ();
   //fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id);
-#endif
 
   /* read system description files */
   Read_System( argv[1], argv[2], argv[3], system, control, 
@@ -262,32 +258,23 @@ int main( int argc, char* argv[] )
   MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
-#ifdef HAVE_CUDA
   /* init the blocks sizes for cuda kernels */
   init_blocks (system);
-#endif
 
   /* measure total simulation time after input is read */
   if( system->my_rank == MASTER_NODE )
     t_start = Get_Time( );
 
-
   /* initialize datastructures */
-#ifdef HAVE_CUDA
   Cuda_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
 #if defined(__CUDA_DEBUG__)
   Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
 #endif
-#else
-  Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
-#endif
 
-#ifdef HAVE_CUDA
 print_device_mem_usage ();
 
   /* init the blocks sizes for cuda kernels */
   init_blocks (system);
-#endif
 
 #if defined(DEBUG)
   fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
@@ -295,31 +282,25 @@ print_device_mem_usage ();
 #endif
   //END OF FIRST STEP
 
+  fprintf(stderr, "===>HERE\n");
   // compute f_0 
   Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
-#ifdef HAVE_CUDA
   Sync_Atoms ( system );
   Sync_Grid (&system->my_grid, &system->d_my_grid);
   init_blocks (system);
 #if defined(__CUDA_DENUG_LOG__)
   fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank );
-#endif
 #endif
 
 	//Second step
-#ifdef HAVE_CUDA
   Cuda_Reset ( system, control, data, workspace, lists );
 #if defined(__CUDA_DEBUG__)
   Reset( system, control, data, workspace, lists );
 #endif
    //fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
-#else
-  Reset( system, control, data, workspace, lists );
-#endif
 
 
 	//Third Step
-#ifdef HAVE_CUDA
   Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
 #if defined(__CUDA_DEBUG__)
   Generate_Neighbor_Lists( system, data, workspace, lists );
@@ -327,13 +308,9 @@ print_device_mem_usage ();
   #if defined(__CUDA_DENUG_LOG__)
   fprintf (stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank );
   #endif
-#else
-  Generate_Neighbor_Lists( system, data, workspace, lists );
-#endif
 
 
 	//Fourth Step
-#ifdef HAVE_CUDA
 #if defined(__CUDA_DEBUG__)
 	fprintf (stderr, " Host Compute Forces begin.... \n");
   Compute_Forces( system, control, data, workspace,  
@@ -344,13 +321,8 @@ print_device_mem_usage ();
   #if defined(__CUDA_DENUG_LOG__)
   fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank );
   #endif
-#else
-  Compute_Forces( system, control, data, workspace,  
-		  lists, out_control, mpi_data );
-#endif
 
 
-#ifdef HAVE_CUDA
 #if defined (__CUDA_DEBUG__)
   Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 #endif
@@ -358,11 +330,7 @@ print_device_mem_usage ();
   #if defined(__CUDA_DENUG_LOG__)
   fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank);
   #endif
-#else
-  Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-#endif
 
-#ifdef HAVE_CUDA
 #if defined(__CUDA_DEBUG__)
 	validate_device (system, data, workspace, lists);
 #endif
@@ -371,16 +339,10 @@ print_device_mem_usage ();
   Output_Results( system, control, data, lists, out_control, mpi_data );
   //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
 #endif
-#else
-  Output_Results( system, control, data, lists, out_control, mpi_data );
-  //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
-#endif
 
-#ifdef HAVE_CUDA
 #if defined(DEBUG)
   fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
   MPI_Barrier( MPI_COMM_WORLD );
-#endif
 #endif
 
   // start the simulation 
@@ -390,7 +352,6 @@ print_device_mem_usage ();
 
 		//t_begin = Get_Time ();
     
-#ifdef HAVE_CUDA
 #if defined(__CUDA_DEBUG__)
     Evolve( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
@@ -407,18 +368,10 @@ print_device_mem_usage ();
 
 	//t_end = Get_Timing_Info (t_begin);
 	 //fprintf (stderr, " Post Evolve time: %f \n", t_end);
-#else
-    Evolve( system, control, data, workspace, lists, out_control, mpi_data );
-    Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
-#endif
 
-#ifdef HAVE_CUDA
 #if !defined(__CUDA_DEBUG__)
   Output_Results( system, control, data, lists, out_control, mpi_data );
 #endif
-#else
-  Output_Results( system, control, data, lists, out_control, mpi_data );
-#endif
 
     //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
 
@@ -437,11 +390,98 @@ print_device_mem_usage ();
 
   }
 
-#ifdef HAVE_CUDA
   //vaildate the results in debug mode
 #if defined(__CUDA_DEBUG__)
 	validate_device (system, data, workspace, lists);
 #endif
+
+#else
+
+  /* allocated main datastructures */
+  system = (reax_system *) 
+    smalloc( sizeof(reax_system), "system" );
+  control = (control_params *) 
+    smalloc( sizeof(control_params), "control" );
+  data = (simulation_data *) 
+    smalloc( sizeof(simulation_data), "data" );
+  workspace = (storage *) 
+    smalloc( sizeof(storage), "workspace" );
+  lists = (reax_list **) 
+    smalloc( LIST_N * sizeof(reax_list*), "lists" );
+  for( i = 0; i < LIST_N; ++i ) {
+    lists[i] = (reax_list *) 
+      smalloc( sizeof(reax_list), "lists[i]" );
+    lists[i]->allocated = 0;
+  }
+  out_control = (output_controls *) 
+    smalloc( sizeof(output_controls), "out_control" );
+  mpi_data = (mpi_datatypes *) 
+    smalloc( sizeof(mpi_datatypes), "mpi_data" );
+
+  /* setup the parallel environment */
+  MPI_Init( &argc, &argv );
+  MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) );
+  MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) );
+  system->wsize = control->nprocs;
+  system->global_offset = (int*)
+    scalloc( system->wsize+1, sizeof(int), "global_offset" );
+
+  /* read system description files */
+  Read_System( argv[1], argv[2], argv[3], system, control, 
+	       data, workspace, out_control, mpi_data );
+#if defined(DEBUG)
+  fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
+  MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+  /* measure total simulation time after input is read */
+  if( system->my_rank == MASTER_NODE )
+    t_start = Get_Time( );
+
+  /* initialize datastructures */
+  Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
+#if defined(DEBUG)
+  fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
+  MPI_Barrier( mpi_data->world );
+#endif
+
+  /* compute f_0 */
+  Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
+  Reset( system, control, data, workspace, lists );
+  Generate_Neighbor_Lists( system, data, workspace, lists );
+  Compute_Forces( system, control, data, workspace,  
+		  lists, out_control, mpi_data );
+  Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+  Output_Results( system, control, data, lists, out_control, mpi_data );
+#if defined(DEBUG)
+  fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
+  MPI_Barrier( mpi_data->world );
+#endif
+
+  /* start the simulation */
+  for( ++data->step; data->step <= control->nsteps; data->step++ ) {      
+    if( control->T_mode )
+      Temperature_Control( control, data );
+    
+    Evolve( system, control, data, workspace, lists, out_control, mpi_data );
+    Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
+    Output_Results( system, control, data, lists, out_control, mpi_data );
+    //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
+
+    /* dump restart info */
+    if( out_control->restart_freq && 
+	(data->step-data->prev_steps) % out_control->restart_freq == 0 ) {
+      if( out_control->restart_format == WRITE_ASCII )
+	Write_Restart( system, control, data, out_control, mpi_data );
+      else if( out_control->restart_format == WRITE_BINARY )
+	Write_Binary_Restart( system, control, data, out_control, mpi_data );
+    }
+#if defined(DEBUG)     
+    fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
+    MPI_Barrier( mpi_data->world );
+#endif
+  }
+
 #endif
   
   /* end of the simulation, write total simulation time */ 
diff --git a/puremd_rc_1003/PG-PuReMD/src/tool_box.c b/puremd_rc_1003/PG-PuReMD/src/tool_box.c
index 8c0caae0..2e4c790b 100644
--- a/puremd_rc_1003/PG-PuReMD/src/tool_box.c
+++ b/puremd_rc_1003/PG-PuReMD/src/tool_box.c
@@ -398,7 +398,7 @@ int Tokenize( char* s, char*** tok )
 
 /***************** taken from lammps ************************/
 /* safe malloc */
-void *smalloc( long n, char *name )
+void* smalloc( long n, char *name )
 {
   void *ptr;
 
-- 
GitLab