From e9e6455b8a3a21df87ce96105ab3bf31f31b282d Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Wed, 16 Sep 2020 15:13:21 -0400
Subject: [PATCH] PG-PuReMD: switch timers to use CUDA events and add
 additional timers for kernels to match MPI code. Other code cleanup.

---
 PG-PuReMD/src/box.c                    |  10 +-
 PG-PuReMD/src/cuda/cuda_environment.cu |  12 +--
 PG-PuReMD/src/cuda/cuda_forces.cu      | 135 ++++++++++++++++++++++---
 PG-PuReMD/src/cuda/cuda_init_md.cu     |   7 +-
 PG-PuReMD/src/cuda/cuda_neighbors.cu   |  29 +++++-
 PG-PuReMD/src/io_tools.c               |  75 ++++++++------
 PG-PuReMD/src/puremd.c                 |   2 +-
 PG-PuReMD/src/restart.c                |   4 +-
 8 files changed, 208 insertions(+), 66 deletions(-)

diff --git a/PG-PuReMD/src/box.c b/PG-PuReMD/src/box.c
index effc6a90..a223eebe 100644
--- a/PG-PuReMD/src/box.c
+++ b/PG-PuReMD/src/box.c
@@ -287,17 +287,21 @@ void Setup_Boundary_Cutoffs( reax_system * const system, control_params * const
 void Setup_Environment( reax_system * const system, control_params * const control,
         mpi_datatypes * const mpi_data )
 {
+    int ret;
     ivec periodic = {1, 1, 1};
 #if defined(DEBUG_FOCUS)
     char temp[100];
 #endif
 
     /* initialize communicator - 3D mesh with wrap-arounds = 3D torus */
-    MPI_Cart_create( MPI_COMM_WORLD, 3, control->procs_by_dim, periodic, 1,
+    ret = MPI_Cart_create( MPI_COMM_WORLD, 3, control->procs_by_dim, periodic, 1,
             &mpi_data->comm_mesh3D );
-    MPI_Comm_rank( mpi_data->comm_mesh3D, &system->my_rank );
-    MPI_Cart_coords( mpi_data->comm_mesh3D, system->my_rank, 3,
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+    ret = MPI_Comm_rank( mpi_data->comm_mesh3D, &system->my_rank );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
+    ret = MPI_Cart_coords( mpi_data->comm_mesh3D, system->my_rank, 3,
             system->my_coords );
+    Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     Setup_Boundary_Cutoffs( system, control );
     Setup_My_Box( system, control );
diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu
index 0445189f..1e11bc32 100644
--- a/PG-PuReMD/src/cuda/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda/cuda_environment.cu
@@ -20,11 +20,11 @@ extern "C" void Cuda_Setup_Environment( int rank, int nprocs, int gpus_per_node
 {
 
     int deviceCount;
-    cudaError_t flag;
+    cudaError_t ret;
     
-    flag = cudaGetDeviceCount( &deviceCount );
+    ret = cudaGetDeviceCount( &deviceCount );
 
-    if ( flag != cudaSuccess || deviceCount < 1 )
+    if ( ret != cudaSuccess || deviceCount < 1 )
     {
         fprintf( stderr, "[ERROR] no CUDA capable device(s) found. Terminating...\n" );
         exit( CANNOT_INITIALIZE );
@@ -38,15 +38,15 @@ extern "C" void Cuda_Setup_Environment( int rank, int nprocs, int gpus_per_node
 
     /* assign the GPU for each process */
     //TODO: handle condition where # CPU procs > # GPUs
-    flag = cudaSetDevice( rank % gpus_per_node );
+    ret = cudaSetDevice( rank % gpus_per_node );
 
-    if ( flag == cudaErrorInvalidDevice )
+    if ( ret == cudaErrorInvalidDevice )
     {
         fprintf( stderr, "[ERROR] invalid CUDA device ID set (%d). Terminating...\n",
               rank % gpus_per_node );
         exit( CANNOT_INITIALIZE );
     }
-    else if ( flag == cudaErrorDeviceAlreadyInUse )
+    else if ( ret == cudaErrorDeviceAlreadyInUse )
     {
         fprintf( stderr, "[ERROR] CUDA device with specified ID already in use (%d). Terminating...\n",
                 rank % gpus_per_node );
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 6e18ff81..3bdb4d3e 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1474,9 +1474,13 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     int renbr, blocks, ret, realloc_bonds, realloc_hbonds, realloc_cm;
     static int dist_done = FALSE, cm_done = FALSE, bonds_done = FALSE;
 #if defined(LOG_PERFORMANCE)
-    double time;
+    float time_elapsed;
+    cudaEvent_t time_event[4];
     
-    time = Get_Time( );
+    for ( int i = 0; i < 4; ++i )
+    {
+        cudaEventCreate( &time_event[i] );
+    }
 #endif
 
     renbr = (data->step - data->prev_steps) % control->reneighbor == 0 ? TRUE : FALSE;
@@ -1498,6 +1502,10 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
             "Cuda_Init_Forces::d_realloc_cm_entries" );
 
+#if defined(LOG_PERFORMANCE)
+    cudaEventRecord( time_event[0] );
+#endif
+
     if ( renbr == FALSE && dist_done == FALSE )
     {
         k_init_distance <<< control->blocks_n, control->block_size_n >>>
@@ -1508,7 +1516,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     }
 
 #if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_dist );
+    cudaEventRecord( time_event[1] );
 #endif
 
     blocks = workspace->d_workspace->H.n_max / DEF_BLOCK_SIZE
@@ -1561,7 +1569,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     }
 
 #if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_cm );
+    cudaEventRecord( time_event[2] );
 #endif
 
     if ( bonds_done == FALSE )
@@ -1578,7 +1586,7 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     }
 
 #if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_bond );
+    cudaEventRecord( time_event[3] );
 #endif
 
     /* check reallocation flags on device */
@@ -1589,6 +1597,42 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
     copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
 
+#if defined(LOG_PERFORMANCE)
+    if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[0] );
+    }
+
+    if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[1] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); 
+    data->timing.init_dist += (real) (time_elapsed / 1000.0);
+
+    if ( cudaEventQuery( time_event[2] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[2] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[1], time_event[2] ); 
+    data->timing.init_cm += (real) (time_elapsed / 1000.0);
+
+    if ( cudaEventQuery( time_event[3] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[3] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[2], time_event[3] ); 
+    data->timing.init_bond += (real) (time_elapsed / 1000.0);
+    
+    for ( int i = 0; i < 4; ++i )
+    {
+        cudaEventDestroy( time_event[i] );
+    }
+#endif
+
     ret = (realloc_cm == FALSE && realloc_bonds == FALSE && realloc_hbonds == FALSE)
         ? SUCCESS : FAILURE;
 
@@ -1996,9 +2040,13 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
     int charge_flag, ret;
     static int init_forces_done = FALSE;
 #if defined(LOG_PERFORMANCE)
-    real time;
-
-    time = Get_Time( );
+    float time_elapsed;
+    cudaEvent_t time_event[6];
+    
+    for ( int i = 0; i < 6; ++i )
+    {
+        cudaEventCreate( &time_event[i] );
+    }
 #endif
 
     ret = SUCCESS;
@@ -2013,6 +2061,10 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
         charge_flag = FALSE;
     }
 
+#if defined(LOG_PERFORMANCE)
+    cudaEventRecord( time_event[0] );
+#endif
+
     if ( init_forces_done == FALSE )
     {
         if ( charge_flag == TRUE )
@@ -2033,18 +2085,18 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
     }
 
 #if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.init_forces );
+    cudaEventRecord( time_event[1] );
 #endif
 
     if ( ret == SUCCESS )
     {
         ret = Cuda_Compute_Bonded_Forces( system, control, data,
                 workspace, lists, out_control );
+    }
 
 #if defined(LOG_PERFORMANCE)
-        Update_Timing_Info( &time, &data->timing.bonded );
+    cudaEventRecord( time_event[2] );
 #endif
-    }
 
     if ( ret == SUCCESS )
     {
@@ -2055,24 +2107,79 @@ extern "C" int Cuda_Compute_Forces( reax_system *system, control_params *control
         }
 
 #if defined(LOG_PERFORMANCE)
-        Update_Timing_Info( &time, &data->timing.cm );
+        cudaEventRecord( time_event[3] );
 #endif
 
         Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
                 lists, out_control, mpi_data );
 
 #if defined(LOG_PERFORMANCE)
-        Update_Timing_Info( &time, &data->timing.nonb );
+        cudaEventRecord( time_event[4] );
 #endif
 
         Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
 
 #if defined(LOG_PERFORMANCE)
-        Update_Timing_Info( &time, &data->timing.bonded );
+        cudaEventRecord( time_event[5] );
 #endif
 
         init_forces_done = FALSE;
     }
 
+#if defined(LOG_PERFORMANCE)
+    if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[0] );
+    }
+
+    if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[1] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); 
+    data->timing.init_forces += (real) (time_elapsed / 1000.0);
+
+    if ( cudaEventQuery( time_event[2] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[2] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[1], time_event[2] ); 
+    data->timing.bonded += (real) (time_elapsed / 1000.0);
+
+    if ( ret == SUCCESS )
+    {
+        if ( cudaEventQuery( time_event[3] ) != cudaSuccess ) 
+        {
+            cudaEventSynchronize( time_event[3] );
+        }
+
+        cudaEventElapsedTime( &time_elapsed, time_event[2], time_event[3] ); 
+        data->timing.cm += (real) (time_elapsed / 1000.0);
+
+        if ( cudaEventQuery( time_event[4] ) != cudaSuccess ) 
+        {
+            cudaEventSynchronize( time_event[4] );
+        }
+
+        cudaEventElapsedTime( &time_elapsed, time_event[3], time_event[4] ); 
+        data->timing.nonb += (real) (time_elapsed / 1000.0);
+
+        if ( cudaEventQuery( time_event[5] ) != cudaSuccess ) 
+        {
+            cudaEventSynchronize( time_event[5] );
+        }
+
+        cudaEventElapsedTime( &time_elapsed, time_event[4], time_event[5] ); 
+        data->timing.bonded += (real) (time_elapsed / 1000.0);
+    }
+    
+    for ( int i = 0; i < 6; ++i )
+    {
+        cudaEventDestroy( time_event[i] );
+    }
+#endif
+
     return ret;
 }
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index 1ba48522..12262073 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -24,8 +24,7 @@
 
 
 static void Cuda_Init_System( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        mpi_datatypes *mpi_data )
+        simulation_data *data, storage *workspace, mpi_datatypes *mpi_data )
 {
     Setup_New_Grid( system, control, MPI_COMM_WORLD );
 
@@ -110,8 +109,8 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
         control->virial = 0;
         if ( !control->restart || (control->restart && control->random_vel) )
         {
-            data->therm.G_xi = control->Tau_T *
-                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
+            data->therm.G_xi = control->Tau_T
+                * (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
             data->therm.v_xi = data->therm.G_xi * control->dt;
             data->therm.v_xi_old = 0;
             data->therm.xi = 0;
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index d937b351..e94cd26b 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -567,9 +567,13 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 {
     int blocks, ret, ret_far_nbr;
 #if defined(LOG_PERFORMANCE)
-    double time;
+    float time_elapsed;
+    cudaEvent_t time_event[2];
     
-    time = Get_Time( );
+    for ( int i = 0; i < 2; ++i )
+    {
+        cudaEventCreate( &time_event[i] );
+    }
 #endif
 
     /* reset reallocation flag on device */
@@ -582,6 +586,10 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
     blocks = (system->N / NBRS_BLOCK_SIZE) +
         ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
 
+#if defined(LOG_PERFORMANCE)
+    cudaEventRecord( time_event[0] );
+#endif
+
     k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->my_ext_box,
           system->d_my_grid, *(lists[FAR_NBRS]),
@@ -599,6 +607,10 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 //              *(lists[FAR_NBRS]), system->n, system->N );
 //    cudaCheckError( );
 
+#if defined(LOG_PERFORMANCE)
+    cudaEventRecord( time_event[1] );
+#endif
+
     /* check reallocation flag on device */
     copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
             cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
@@ -607,7 +619,18 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
     workspace->d_workspace->realloc.far_nbrs = ret_far_nbr;
 
 #if defined(LOG_PERFORMANCE)
-    Update_Timing_Info( &time, &data->timing.nbrs );
+    if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[0] );
+    }
+
+    if ( cudaEventQuery( time_event[1] ) != cudaSuccess ) 
+    {
+        cudaEventSynchronize( time_event[1] );
+    }
+
+    cudaEventElapsedTime( &time_elapsed, time_event[0], time_event[1] ); 
+    data->timing.nbrs += (real) (time_elapsed / 1000.0);
 #endif
 
     return ret;
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index d97655b7..488581ed 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -110,9 +110,17 @@ void Init_Output_Files( reax_system *system, control_params *control,
             out_control->log = sfopen( temp, "w",
                     "Init_Output_Controls::output_control->log" );
 
-            fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n",
-                     "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
-                     "charges", "siters", "retries" );
+//            fprintf( out_control->log, "%6s%8s%8s%8s%8s%8s%8s%8s%8s%8s\n",
+//                     "step", "total", "comm", "nbrs", "init", "bonded", "nonb",
+//                     "charges", "siters", "retries" );
+
+            fprintf( out_control->log, "%6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+                     "step", "total", "comm", "nbrs", "init",
+                     "init_dist", "init_cm", "init_bond",
+                     "bonded", "nonb",
+                     "cm", "cm_sort", "cm_iters", "cm_p_comp", "cm_p_app",
+                     "cm_comm", "cm_allr", "cm_spmv", "cm_vec_ops", "cm_orthog", "cm_t_solve",
+                     "retries" );
 
 #if defined(DEBUG)
             fflush( out_control->log );
@@ -1321,38 +1329,39 @@ void Output_Results( reax_system *system, control_params *control,
                     denom = 1.0;
                 }
 
-                fprintf( out_control->log,
-                        "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n",
-                        data->step, t_elapsed * denom, data->timing.comm * denom,
-                        data->timing.nbrs * denom, data->timing.init_forces * denom,
-                        data->timing.bonded * denom, data->timing.nonb * denom,
-                        data->timing.cm * denom,
-                        (int) (data->timing.cm_solver_iters * denom),
-                        data->timing.num_retries );
-
 //                fprintf( out_control->log,
-//                        "%6d %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
-//                        data->step,
-//                        t_elapsed * denom,
-//                        data->timing.comm * denom,
-//                        data->timing.nbrs * denom,
-//                        data->timing.init_forces * denom,
-//                        data->timing.init_dist * denom,
-//                        data->timing.init_cm * denom,
-//                        data->timing.init_bond * denom,
-//                        data->timing.bonded * denom,
-//                        (data->timing.nonb + data->timing.cm) * denom,
+//                        "%6d%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8.3f%8d%8d\n",
+//                        data->step, t_elapsed * denom, data->timing.comm * denom,
+//                        data->timing.nbrs * denom, data->timing.init_forces * denom,
+//                        data->timing.bonded * denom, data->timing.nonb * denom,
 //                        data->timing.cm * denom,
-//                        data->timing.cm_sort * denom,
-//                        (double) (data->timing.cm_solver_iters * denom),
-//                        data->timing.cm_solver_pre_comp * denom,
-//                        data->timing.cm_solver_pre_app * denom,
-//                        data->timing.cm_solver_comm * denom,
-//                        data->timing.cm_solver_allreduce * denom,
-//                        data->timing.cm_solver_spmv * denom,
-//                        data->timing.cm_solver_vector_ops * denom,
-//                        data->timing.cm_solver_orthog * denom,
-//                        data->timing.cm_solver_tri_solve * denom );
+//                        (int) (data->timing.cm_solver_iters * denom),
+//                        data->timing.num_retries );
+
+                fprintf( out_control->log,
+                        "%6d %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.2f\n",
+                        data->step,
+                        t_elapsed * denom,
+                        data->timing.comm * denom,
+                        data->timing.nbrs * denom,
+                        data->timing.init_forces * denom,
+                        data->timing.init_dist * denom,
+                        data->timing.init_cm * denom,
+                        data->timing.init_bond * denom,
+                        data->timing.bonded * denom,
+                        (data->timing.nonb + data->timing.cm) * denom,
+                        data->timing.cm * denom,
+                        data->timing.cm_sort * denom,
+                        (double) (data->timing.cm_solver_iters * denom),
+                        data->timing.cm_solver_pre_comp * denom,
+                        data->timing.cm_solver_pre_app * denom,
+                        data->timing.cm_solver_comm * denom,
+                        data->timing.cm_solver_allreduce * denom,
+                        data->timing.cm_solver_spmv * denom,
+                        data->timing.cm_solver_vector_ops * denom,
+                        data->timing.cm_solver_orthog * denom,
+                        data->timing.cm_solver_tri_solve * denom,
+                        data->timing.num_retries * denom );
 
 #if defined(DEBUG)
                 fflush( out_control->log );
diff --git a/PG-PuReMD/src/puremd.c b/PG-PuReMD/src/puremd.c
index 7b253903..5b660346 100644
--- a/PG-PuReMD/src/puremd.c
+++ b/PG-PuReMD/src/puremd.c
@@ -303,7 +303,7 @@ int simulate( const void * const handle )
         {
             ret = SUCCESS;
 
-            if ( control->T_mode && retries == 0 )
+            if ( control->T_mode > 0 && retries == 0 )
             {
                 Temperature_Control( control, data );
             }
diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c
index ef932e36..67b06b04 100644
--- a/PG-PuReMD/src/restart.c
+++ b/PG-PuReMD/src/restart.c
@@ -74,7 +74,7 @@ void Write_Binary_Restart_File( reax_system *system, control_params *control,
     /* fill in the buffers */
     for ( i = 0 ; i < system->n; ++i )
     {
-        p_atom = &(system->my_atoms[i]);
+        p_atom = &system->my_atoms[i];
         buffer[i].orig_id = p_atom->orig_id;
         buffer[i].type = p_atom->type;
         strncpy( buffer[i].name, p_atom->name, sizeof(buffer[i].name) - 1 );
@@ -97,7 +97,7 @@ void Write_Binary_Restart_File( reax_system *system, control_params *control,
         {
             if ( i != MASTER_NODE )
             {
-                ret = MPI_Recv( buffer + top, system->bigN - top, mpi_data->restart_atom_type,
+                ret = MPI_Recv( &buffer[top], system->bigN - top, mpi_data->restart_atom_type,
                           i, np * RESTART_ATOMS + i, MPI_COMM_WORLD, &status );
                 Check_MPI_Error( ret, __FILE__, __LINE__ );
                 MPI_Get_count( &status, mpi_data->restart_atom_type, &cnt );
-- 
GitLab