From 4f9c2b53e426a681af570b95dac8e68aa327b12a Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Mon, 20 Jun 2016 13:15:19 -0400 Subject: [PATCH] Refactor MPI only and MPI+GPU code. --- puremd_rc_1003/PG-PuReMD/src/neighbors.c | 2 +- puremd_rc_1003/PG-PuReMD/src/parallelreax.c | 142 +++++++++++++------- puremd_rc_1003/PG-PuReMD/src/tool_box.c | 2 +- 3 files changed, 93 insertions(+), 53 deletions(-) diff --git a/puremd_rc_1003/PG-PuReMD/src/neighbors.c b/puremd_rc_1003/PG-PuReMD/src/neighbors.c index df1c998e..25a4fd65 100644 --- a/puremd_rc_1003/PG-PuReMD/src/neighbors.c +++ b/puremd_rc_1003/PG-PuReMD/src/neighbors.c @@ -240,7 +240,7 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists ) } } - fprintf (stderr, "Total numner of host neighbors: %d \n", num_far); + fprintf (stderr, "Total number of host neighbors: %d \n", num_far); #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d: estimate nbrs done - num_far=%d\n", diff --git a/puremd_rc_1003/PG-PuReMD/src/parallelreax.c b/puremd_rc_1003/PG-PuReMD/src/parallelreax.c index e7c700b2..9691d40e 100644 --- a/puremd_rc_1003/PG-PuReMD/src/parallelreax.c +++ b/puremd_rc_1003/PG-PuReMD/src/parallelreax.c @@ -177,8 +177,9 @@ int main( int argc, char* argv[] ) real t_start = 0, t_elapsed; real t_begin, t_end; - /* Remove this debug information later */ #ifdef HAVE_CUDA + + /* Remove this debug information later */ #if defined(__CUDA_DEBUG_LOG__) fprintf (stderr, " Size of LR Lookup table %d \n", sizeof (LR_lookup_table) ); #endif @@ -197,7 +198,6 @@ int main( int argc, char* argv[] ) fprintf (stderr, " matvec threads per atom: %d \n", MATVEC_KER_THREADS_PER_ROW); fprintf (stderr, " General block size: %d \n", DEF_BLOCK_SIZE); -#endif #endif /* allocated main datastructures */ @@ -221,7 +221,6 @@ int main( int argc, char* argv[] ) smalloc( sizeof(output_controls), "out_control" ); mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" ); -#ifdef HAVE_CUDA /* allocate the cuda auxiliary data structures */ dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" ); @@ -230,7 +229,6 @@ int main( int argc, char* argv[] ) dev_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" ); dev_lists[i]->allocated = 0; } -#endif /* Initialize member variables */ system->init_thblist = FALSE; @@ -243,7 +241,6 @@ int main( int argc, char* argv[] ) system->wsize = control->nprocs; system->global_offset = (int *)scalloc(system->wsize+1,sizeof(int),"global_offset"); -#ifdef HAVE_CUDA /* setup the CUDA Device for this process can be on the same machine * or on a different machine, for now use the rank to compute the device * This will only work on a single machine with 2 GPUs*/ @@ -252,7 +249,6 @@ int main( int argc, char* argv[] ) //Cleanup_Cuda_Environment (); print_device_mem_usage (); //fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id); -#endif /* read system description files */ Read_System( argv[1], argv[2], argv[3], system, control, @@ -262,32 +258,23 @@ int main( int argc, char* argv[] ) MPI_Barrier( MPI_COMM_WORLD ); #endif -#ifdef HAVE_CUDA /* init the blocks sizes for cuda kernels */ init_blocks (system); -#endif /* measure total simulation time after input is read */ if( system->my_rank == MASTER_NODE ) t_start = Get_Time( ); - /* initialize datastructures */ -#ifdef HAVE_CUDA Cuda_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); #if defined(__CUDA_DEBUG__) Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); #endif -#else - Initialize( system, control, data, workspace, lists, out_control, mpi_data ); -#endif -#ifdef HAVE_CUDA print_device_mem_usage (); /* init the blocks sizes for cuda kernels */ init_blocks (system); -#endif #if defined(DEBUG) fprintf( stderr, "p%d: initializated data structures\n", system->my_rank ); @@ -295,31 +282,25 @@ print_device_mem_usage (); #endif //END OF FIRST STEP + fprintf(stderr, "===>HERE\n"); // compute f_0 Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 ); -#ifdef HAVE_CUDA Sync_Atoms ( system ); Sync_Grid (&system->my_grid, &system->d_my_grid); init_blocks (system); #if defined(__CUDA_DENUG_LOG__) fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank ); -#endif #endif //Second step -#ifdef HAVE_CUDA Cuda_Reset ( system, control, data, workspace, lists ); #if defined(__CUDA_DEBUG__) Reset( system, control, data, workspace, lists ); #endif //fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank ); -#else - Reset( system, control, data, workspace, lists ); -#endif //Third Step -#ifdef HAVE_CUDA Cuda_Generate_Neighbor_Lists( system, data, workspace, lists ); #if defined(__CUDA_DEBUG__) Generate_Neighbor_Lists( system, data, workspace, lists ); @@ -327,13 +308,9 @@ print_device_mem_usage (); #if defined(__CUDA_DENUG_LOG__) fprintf (stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank ); #endif -#else - Generate_Neighbor_Lists( system, data, workspace, lists ); -#endif //Fourth Step -#ifdef HAVE_CUDA #if defined(__CUDA_DEBUG__) fprintf (stderr, " Host Compute Forces begin.... \n"); Compute_Forces( system, control, data, workspace, @@ -344,13 +321,8 @@ print_device_mem_usage (); #if defined(__CUDA_DENUG_LOG__) fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank ); #endif -#else - Compute_Forces( system, control, data, workspace, - lists, out_control, mpi_data ); -#endif -#ifdef HAVE_CUDA #if defined (__CUDA_DEBUG__) Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); #endif @@ -358,11 +330,7 @@ print_device_mem_usage (); #if defined(__CUDA_DENUG_LOG__) fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank); #endif -#else - Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); -#endif -#ifdef HAVE_CUDA #if defined(__CUDA_DEBUG__) validate_device (system, data, workspace, lists); #endif @@ -371,16 +339,10 @@ print_device_mem_usage (); Output_Results( system, control, data, lists, out_control, mpi_data ); //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank); #endif -#else - Output_Results( system, control, data, lists, out_control, mpi_data ); - //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank); -#endif -#ifdef HAVE_CUDA #if defined(DEBUG) fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank ); MPI_Barrier( MPI_COMM_WORLD ); -#endif #endif // start the simulation @@ -390,7 +352,6 @@ print_device_mem_usage (); //t_begin = Get_Time (); -#ifdef HAVE_CUDA #if defined(__CUDA_DEBUG__) Evolve( system, control, data, workspace, lists, out_control, mpi_data ); #endif @@ -407,18 +368,10 @@ print_device_mem_usage (); //t_end = Get_Timing_Info (t_begin); //fprintf (stderr, " Post Evolve time: %f \n", t_end); -#else - Evolve( system, control, data, workspace, lists, out_control, mpi_data ); - Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); -#endif -#ifdef HAVE_CUDA #if !defined(__CUDA_DEBUG__) Output_Results( system, control, data, lists, out_control, mpi_data ); #endif -#else - Output_Results( system, control, data, lists, out_control, mpi_data ); -#endif //Analysis(system, control, data, workspace, lists, out_control, mpi_data); @@ -437,11 +390,98 @@ print_device_mem_usage (); } -#ifdef HAVE_CUDA //vaildate the results in debug mode #if defined(__CUDA_DEBUG__) validate_device (system, data, workspace, lists); #endif + +#else + + /* allocated main datastructures */ + system = (reax_system *) + smalloc( sizeof(reax_system), "system" ); + control = (control_params *) + smalloc( sizeof(control_params), "control" ); + data = (simulation_data *) + smalloc( sizeof(simulation_data), "data" ); + workspace = (storage *) + smalloc( sizeof(storage), "workspace" ); + lists = (reax_list **) + smalloc( LIST_N * sizeof(reax_list*), "lists" ); + for( i = 0; i < LIST_N; ++i ) { + lists[i] = (reax_list *) + smalloc( sizeof(reax_list), "lists[i]" ); + lists[i]->allocated = 0; + } + out_control = (output_controls *) + smalloc( sizeof(output_controls), "out_control" ); + mpi_data = (mpi_datatypes *) + smalloc( sizeof(mpi_datatypes), "mpi_data" ); + + /* setup the parallel environment */ + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) ); + MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) ); + system->wsize = control->nprocs; + system->global_offset = (int*) + scalloc( system->wsize+1, sizeof(int), "global_offset" ); + + /* read system description files */ + Read_System( argv[1], argv[2], argv[3], system, control, + data, workspace, out_control, mpi_data ); +#if defined(DEBUG) + fprintf( stderr, "p%d: read simulation info\n", system->my_rank ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + + /* measure total simulation time after input is read */ + if( system->my_rank == MASTER_NODE ) + t_start = Get_Time( ); + + /* initialize datastructures */ + Initialize( system, control, data, workspace, lists, out_control, mpi_data ); +#if defined(DEBUG) + fprintf( stderr, "p%d: initializated data structures\n", system->my_rank ); + MPI_Barrier( mpi_data->world ); +#endif + + /* compute f_0 */ + Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 ); + Reset( system, control, data, workspace, lists ); + Generate_Neighbor_Lists( system, data, workspace, lists ); + Compute_Forces( system, control, data, workspace, + lists, out_control, mpi_data ); + Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); + Output_Results( system, control, data, lists, out_control, mpi_data ); +#if defined(DEBUG) + fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank ); + MPI_Barrier( mpi_data->world ); +#endif + + /* start the simulation */ + for( ++data->step; data->step <= control->nsteps; data->step++ ) { + if( control->T_mode ) + Temperature_Control( control, data ); + + Evolve( system, control, data, workspace, lists, out_control, mpi_data ); + Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); + Output_Results( system, control, data, lists, out_control, mpi_data ); + //Analysis(system, control, data, workspace, lists, out_control, mpi_data); + + /* dump restart info */ + if( out_control->restart_freq && + (data->step-data->prev_steps) % out_control->restart_freq == 0 ) { + if( out_control->restart_format == WRITE_ASCII ) + Write_Restart( system, control, data, out_control, mpi_data ); + else if( out_control->restart_format == WRITE_BINARY ) + Write_Binary_Restart( system, control, data, out_control, mpi_data ); + } +#if defined(DEBUG) + fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step ); + MPI_Barrier( mpi_data->world ); +#endif + } + #endif /* end of the simulation, write total simulation time */ diff --git a/puremd_rc_1003/PG-PuReMD/src/tool_box.c b/puremd_rc_1003/PG-PuReMD/src/tool_box.c index 8c0caae0..2e4c790b 100644 --- a/puremd_rc_1003/PG-PuReMD/src/tool_box.c +++ b/puremd_rc_1003/PG-PuReMD/src/tool_box.c @@ -398,7 +398,7 @@ int Tokenize( char* s, char*** tok ) /***************** taken from lammps ************************/ /* safe malloc */ -void *smalloc( long n, char *name ) +void* smalloc( long n, char *name ) { void *ptr; -- GitLab