diff --git a/PuReMD/src/init_md.c b/PuReMD/src/init_md.c index 0d9bcc4006e4d6e3aa51dea91700bbb4b391a877..ef991d27c87620bc272f6c50c6e82e387819b756 100644 --- a/PuReMD/src/init_md.c +++ b/PuReMD/src/init_md.c @@ -290,6 +290,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control, data->timing.cm = ZERO; data->timing.cm_sort_mat_rows = ZERO; data->timing.cm_solver_comm = ZERO; + data->timing.cm_solver_allreduce = ZERO; data->timing.cm_solver_pre_comp = ZERO; data->timing.cm_solver_pre_app = ZERO; data->timing.cm_solver_iters = 0; @@ -363,6 +364,7 @@ int Init_Simulation_Data( reax_system *system, control_params *control, data->timing.cm = ZERO; data->timing.cm_sort_mat_rows = ZERO; data->timing.cm_solver_comm = ZERO; + data->timing.cm_solver_allreduce = ZERO; data->timing.cm_solver_pre_comp = ZERO; data->timing.cm_solver_pre_app = ZERO; data->timing.cm_solver_iters = 0; diff --git a/PuReMD/src/io_tools.c b/PuReMD/src/io_tools.c index 122b4e4eb0c73e6c29b2400e761a90577e5f4698..ef7a76c73d24496aad99cef2d7b60693fa9c98b8 100644 --- a/PuReMD/src/io_tools.c +++ b/PuReMD/src/io_tools.c @@ -113,9 +113,9 @@ int Init_Output_Files( reax_system *system, control_params *control, sprintf( temp, "%s.log", control->sim_name ); if ( (out_control->log = fopen( temp, "w" )) != NULL ) { - fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", - "step", "total", "comm", "neighbors", "init", "bonded", - "nonbonded", "CM", "CM Sort", "S iters", "S comm", "Pre Comp", "Pre App", + fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", + "step", "total", "comm", "neighbors", "init", "bonded", "nonbonded", + "CM", "CM Sort", "S iters", "Pre Comp", "Pre App", "S comm", "S allr", "S spmv", "S vec ops", "S orthog", "S tsolve" ); fflush( out_control->log ); } @@ -1267,7 +1267,7 @@ void Output_Results( reax_system *system, control_params *control, denom = 1.0 / out_control->energy_update_freq; else denom = 1; - fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n", + fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n", data->step, t_elapsed * denom, data->timing.comm * denom, @@ -1278,9 +1278,10 @@ void Output_Results( reax_system *system, control_params *control, data->timing.cm * denom, data->timing.cm_sort_mat_rows * denom, (double)( data->timing.cm_solver_iters * denom), - data->timing.cm_solver_comm * denom, data->timing.cm_solver_pre_comp * denom, data->timing.cm_solver_pre_app * denom, + data->timing.cm_solver_comm * denom, + data->timing.cm_solver_allreduce * denom, data->timing.cm_solver_spmv * denom, data->timing.cm_solver_vector_ops * denom, data->timing.cm_solver_orthog * denom, @@ -1295,9 +1296,10 @@ void Output_Results( reax_system *system, control_params *control, data->timing.nonb = 0; data->timing.cm = ZERO; data->timing.cm_sort_mat_rows = ZERO; - data->timing.cm_solver_comm = ZERO; data->timing.cm_solver_pre_comp = ZERO; data->timing.cm_solver_pre_app = ZERO; + data->timing.cm_solver_comm = ZERO; + data->timing.cm_solver_allreduce = ZERO; data->timing.cm_solver_iters = 0; data->timing.cm_solver_spmv = ZERO; data->timing.cm_solver_vector_ops = ZERO; diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c index 34b3f8e9b38157abd5797d9b33beace90a6df492..2d09f7daf4f570b4d52e7641cabc558dc3361d92 100644 --- a/PuReMD/src/linear_solvers.c +++ b/PuReMD/src/linear_solvers.c @@ -387,7 +387,7 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm ); t_comm += Get_Timing_Info( t_start ); - // int nnz = 0; uncomment to check the nnz's in the sparsity pattern + //int nnz = 0; //uncomment to check the nnz's in the sparsity pattern /* build entries of that pattern*/ for ( i = 0; i < A->n; ++i ) @@ -509,6 +509,8 @@ real sparse_approx_inverse(reax_system *system, simulation_data *data, storage * comm = mpi_data->comm_mesh3D; out_bufs = mpi_data->out_buffers; + //fprintf(stderr, "Before dist call, p%d\n", system->my_rank ); + /* use a Dist-like approach to send the row information */ for ( d = 0; d < 3; ++d) { @@ -654,8 +656,8 @@ real sparse_approx_inverse(reax_system *system, simulation_data *data, storage * } t_start = Get_Time( ); - MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm ); - MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm ); + MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm ); + MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm ); t_comm += Get_Timing_Info( t_start ); } @@ -711,6 +713,9 @@ real sparse_approx_inverse(reax_system *system, simulation_data *data, storage * } } } + + //fprintf(stderr, "After dist call, p%d\n", system->my_rank ); + //fflush(stderr); X = (int *) malloc( sizeof(int) * (system->bigN + 1) ); pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) ); @@ -1188,13 +1193,14 @@ int CG( reax_system *system, control_params *control, simulation_data *data, int i, j, scale; real tmp, alpha, beta, b_norm; real sig_old, sig_new; - real t_start, t_pa, t_spmv, t_vops, t_comm; - real total_pa, total_spmv, total_vops, total_comm; + real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce; + real total_pa, total_spmv, total_vops, total_comm, total_allreduce; t_pa = 0.0; t_spmv = 0.0; t_vops = 0.0; t_comm = 0.0; + t_allreduce = 0.0; t_start = Get_Time( ); scale = sizeof(real) / sizeof(void); @@ -1260,6 +1266,10 @@ int CG( reax_system *system, control_params *control, simulation_data *data, t_start = Get_Time( ); tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world); + //TODO: all_Reduce time + t_allreduce += Get_Timing_Info ( t_start ); + + t_start = Get_Time( ); alpha = sig_new / tmp; Vector_Add( x, alpha, workspace->d, system->n ); Vector_Add( workspace->r, -alpha, workspace->q, system->n ); @@ -1290,6 +1300,10 @@ int CG( reax_system *system, control_params *control, simulation_data *data, t_start = Get_Time( ); sig_old = sig_new; sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world); + //TODO all_reduce time + t_allreduce += Get_Timing_Info( t_start ); + + t_start = Get_Time( ); beta = sig_new / sig_old; Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n ); t_vops += Get_Timing_Info( t_start ); @@ -1299,6 +1313,7 @@ int CG( reax_system *system, control_params *control, simulation_data *data, MPI_Reduce(&t_spmv, &total_spmv, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); MPI_Reduce(&t_vops, &total_vops, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); MPI_Reduce(&t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); + MPI_Reduce(&t_allreduce, &total_allreduce, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world); if( system->my_rank == MASTER_NODE ) { @@ -1306,6 +1321,7 @@ int CG( reax_system *system, control_params *control, simulation_data *data, data->timing.cm_solver_spmv += total_spmv / nprocs; data->timing.cm_solver_vector_ops += total_vops / nprocs; data->timing.cm_solver_comm += total_comm / nprocs; + data->timing.cm_solver_allreduce += total_allreduce / nprocs; } MPI_Barrier(mpi_data->world); diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h index f9933ffe9b3181c2555dd6708af2614a1608e506..56a8c1c9841d0f6ea67f74d364277ff9a768f5d3 100644 --- a/PuReMD/src/reax_types.h +++ b/PuReMD/src/reax_types.h @@ -766,6 +766,8 @@ typedef struct /**/ real cm_solver_comm; /**/ + real cm_solver_allreduce; + /**/ real cm_solver_pre_comp; /**/ real cm_solver_pre_app; // update CG()