diff --git a/PuReMD/src/allocate.c b/PuReMD/src/allocate.c
index 32b68ec1fadf394c57159f3ed1cb61265754b189..b2c9aa5b14e096cca696033083de3d1f804d57ab 100644
--- a/PuReMD/src/allocate.c
+++ b/PuReMD/src/allocate.c
@@ -202,7 +202,8 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
         sfree( workspace->w, "w" );
     }
 
-    if ( control->cm_solver_type == CG_S )
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S )
     {
         sfree( workspace->r2, "r2" );
         sfree( workspace->d2, "d2" );
@@ -210,6 +211,15 @@ void DeAllocate_Workspace( control_params *control, storage *workspace )
         sfree( workspace->p2, "p2" );
     }
 
+    if ( control->cm_solver_type == PIPECG_S )
+    {
+        sfree( workspace->m2, "m2" );
+        sfree( workspace->n2, "n2" );
+        sfree( workspace->u2, "u2" );
+        sfree( workspace->w2, "w2" );
+        sfree( workspace->z2, "z2" );
+    }
+
     /* integrator */
     // sfree( workspace->f_old );
     sfree( workspace->v_const, "v_const" );
@@ -363,7 +373,8 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->w = scalloc( total_cap, sizeof(real), "w", comm );
     }
 
-    if ( control->cm_solver_type == CG_S )
+    if ( control->cm_solver_type == CG_S
+            || control->cm_solver_type == PIPECG_S )
    {
         workspace->d2 = scalloc( total_cap, sizeof(rvec2), "d2", comm );
         workspace->r2 = scalloc( total_cap, sizeof(rvec2), "r2", comm );
@@ -371,6 +382,15 @@ int Allocate_Workspace( reax_system *system, control_params *control,
         workspace->q2 = scalloc( total_cap, sizeof(rvec2), "q2", comm );
     }
 
+    if ( control->cm_solver_type == PIPECG_S )
+    {
+        workspace->m2 = scalloc( total_cap, sizeof(rvec2), "m2", comm );
+        workspace->n2 = scalloc( total_cap, sizeof(rvec2), "n2", comm );
+        workspace->u2 = scalloc( total_cap, sizeof(rvec2), "u2", comm );
+        workspace->w2 = scalloc( total_cap, sizeof(rvec2), "w2", comm );
+        workspace->z2 = scalloc( total_cap, sizeof(rvec2), "z2", comm );
+    }
+
     /* integrator storage */
     workspace->v_const = smalloc( local_rvec, "v_const", comm );
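The five new PIPECG vectors above are allocated with scalloc (zero-filling) rather than smalloc, which matters for the solver's first iteration: recurrences such as z2 = n2 + beta * z2 are evaluated with beta == 0.0 before z2 has ever been written, and 0.0 * x only yields 0.0 for finite x, so uninitialized memory that happened to hold a NaN would poison the run. A minimal standalone sketch of that failure mode, assuming IEEE-754 doubles (illustrative only, not PuReMD code):

#include <math.h>
#include <stdio.h>

int main( void )
{
    double garbage = nan( "" );  /* stand-in for uninitialized memory */
    double zeroed = 0.0;         /* what a zero-filling allocator guarantees */

    /* first-iteration update z = n + beta * z evaluated with beta == 0.0 */
    printf( "n + 0.0 * garbage = %f\n", 1.0 + 0.0 * garbage );  /* nan */
    printf( "n + 0.0 * zeroed  = %f\n", 1.0 + 0.0 * zeroed );   /* 1.000000 */

    return 0;
}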
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 0fcb83e68abef650acfae13b84c76322c522f430..939b702bf4ff6f716de299fe5f0543cd9a911a72 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -423,8 +423,11 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st
             "setup_sparse_approx_inverse::bucketlist_local", MPI_COMM_WORLD );
     dspls = smalloc( sizeof(int) * nprocs,
             "setup_sparse_approx_inverse::dspls", MPI_COMM_WORLD );
-    pivotlist = smalloc( sizeof(real) * (nprocs - 1),
-            "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD );
+    if ( nprocs > 1 )
+    {
+        pivotlist = smalloc( sizeof(real) * (nprocs - 1),
+                "setup_sparse_approx_inverse::pivotlist", MPI_COMM_WORLD );
+    }
     samplelist_local = smalloc( sizeof(real) * s_local,
             "setup_sparse_approx_inverse::samplelist_local", MPI_COMM_WORLD );
     if ( system->my_rank == MASTER_NODE )
@@ -690,7 +693,10 @@ real setup_sparse_approx_inverse( reax_system *system, simulation_data *data, st
     sfree( dspls_local, "setup_sparse_approx_inverse::displs_local" );
     sfree( bucketlist_local, "setup_sparse_approx_inverse::bucketlist_local" );
     sfree( dspls, "setup_sparse_approx_inverse::dspls" );
-    sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" );
+    if ( nprocs > 1 )
+    {
+        sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" );
+    }
     sfree( samplelist_local, "setup_sparse_approx_inverse::samplelist_local" );
     if ( system->my_rank == MASTER_NODE )
     {
@@ -1745,29 +1751,6 @@ int dual_CG( reax_system *system, control_params *control, simulation_data *data
     b_norm[0] = sqrt( redux[4] );
     b_norm[1] = sqrt( redux[5] );
 
-    /*// norm of b
-    my_sum[0] = my_sum[1] = 0;
-    for ( j = 0; j < n; ++j )
-    {
-        my_sum[0] += SQR( b[j][0] );
-        my_sum[1] += SQR( b[j][1] );
-    }
-
-    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-    b_norm[0] = sqrt( norm_sqr[0] );
-    b_norm[1] = sqrt( norm_sqr[1] );
-
-    // dot product: r.d
-    my_dot[0] = my_dot[1] = 0;
-    for ( j = 0; j < n; ++j )
-    {
-        my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-        my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-    }
-
-    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );*/
-
     for ( i = 0; i < control->cm_solver_max_iters; ++i )
     {
         if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
@@ -1915,7 +1898,7 @@ int dual_CG( reax_system *system, control_params *control, simulation_data *data
         MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
     }
 
-    // continue to solve the system that is not converged yet
+    // continue to solve the system that has not converged yet
     if ( norm[0] / b_norm[0] > tol )
     {
         for ( j = 0; j < system->n; ++j )
@@ -2157,6 +2140,446 @@ int CG( reax_system *system, control_params *control, simulation_data *data,
 }
 
 
+/* Pipelined Preconditioned Conjugate Gradient Method
+ *
+ * References:
+ * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
+ *    P. Ghysels and W. Vanroose, Parallel Computing, 2014.
+ * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
+ *    Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
+ *    for High Performance Computing, Networking, Storage and Analysis, 2016.
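+ *
+ * Outline of one iteration as implemented below (summary in terms of this
+ * file's variables rather than the papers' notation): a single non-blocking
+ * MPI_Iallreduce reduces all local dot products at once, and the
+ * preconditioner application (m2 <- M^{-1} w2) plus one matrix-vector
+ * product (n2 <- H m2) are computed while that reduction is in flight, so
+ * the global synchronization is hidden behind local work.  MPI_Wait then
+ * yields delta = (w, u) and gamma = (r, u), from which
+ * beta = gamma_new / gamma_old and
+ * alpha = gamma_new / (delta - beta * gamma_new / alpha_old).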
+ */
+int dual_PIPECG( reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, sparse_matrix *H, rvec2 *b,
+        real tol, rvec2 *x, mpi_datatypes* mpi_data )
+{
+    int i, j;
+    rvec2 alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
+    real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
+    real timings[5], redux[8];
+    MPI_Request req;
+
+    t_pa = 0.0;
+    t_spmv = 0.0;
+    t_vops = 0.0;
+    t_comm = 0.0;
+    t_allreduce = 0.0;
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, x, workspace->u2, H->NT );
+#else
+    dual_Sparse_MatVec( H, x, workspace->u2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
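+    /* with a symmetric half-stored matrix, the local SpMV above leaves
+     * partial sums on ghost atoms, so Coll accumulates them back onto their
+     * owner processes (always needed under NEUTRAL_TERRITORY); the same
+     * Dist / SpMV / Coll pattern recurs around every matrix-vector product
+     * in this routine */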
+
+    t_start = MPI_Wtime( );
+    //Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->u, system->n );
+    for ( j = 0; j < system->n; ++j )
+    {
+        workspace->r2[j][0] = b[j][0] - workspace->u2[j][0];
+        workspace->r2[j][1] = b[j][1] - workspace->u2[j][1];
+    }
+    t_vops += MPI_Wtime( ) - t_start;
+
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        //Vector_Copy( workspace->u, workspace->r, system->n );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u2[j][0] = workspace->r2[j][0];
+            workspace->u2[j][1] = workspace->r2[j][1];
+        }
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+            workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, H->NT );
+#else
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->r2, workspace->u2, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, workspace->u2, workspace->w2, H->NT );
+#else
+    dual_Sparse_MatVec( H, workspace->u2, workspace->w2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
+
+    t_start = MPI_Wtime( );
+    //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+    //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+    //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+    //redux[3] = Dot_local( b, b, system->n );
+    for ( j = 0; j < 8; ++j )
+    {
+        redux[j] = 0.0;
+    }
+    for ( j = 0; j < system->n; ++j )
+    {
+        redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
+        redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
+
+        redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
+        redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
+
+        redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
+        redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
+
+        redux[6] += b[j][0] * b[j][0];
+        redux[7] += b[j][1] * b[j][1];
+    }
+    t_vops += MPI_Wtime( ) - t_start;
+
+    MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
+
+    /* pre-conditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        //Vector_Copy( workspace->m, workspace->w, system->n );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m2[j][0] = workspace->w2[j][0];
+            workspace->m2[j][1] = workspace->w2[j][1];
+        }
+    }
+    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+    {
+        t_start = MPI_Wtime( );
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
+            workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
+        }
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+    else if ( control->cm_solver_pre_comp_type == SAI_PC )
+    {
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
+#else
+        dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
+#endif
+        t_pa += MPI_Wtime( ) - t_start;
+    }
+
+    t_start = MPI_Wtime( );
+    Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+    t_comm += MPI_Wtime( ) - t_start;
+
+    t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+    dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT );
+#else
+    dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N );
+#endif
+    t_spmv += MPI_Wtime( ) - t_start;
+
+    if ( H->format == SYM_HALF_MATRIX )
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#if defined(NEUTRAL_TERRITORY)
+    else
+    {
+        t_start = MPI_Wtime( );
+        Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+    }
+#endif
+
+    t_start = MPI_Wtime( );
+    MPI_Wait( &req, MPI_STATUS_IGNORE );
+    t_allreduce += MPI_Wtime( ) - t_start;
+    delta[0] = redux[0];
+    delta[1] = redux[1];
+    gamma_new[0] = redux[2];
+    gamma_new[1] = redux[3];
+    norm[0] = sqrt( redux[4] );
+    norm[1] = sqrt( redux[5] );
+    b_norm[0] = sqrt( redux[6] );
+    b_norm[1] = sqrt( redux[7] );
+
+    for ( i = 0; i < control->cm_solver_max_iters; ++i )
+    {
+        if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        if ( i > 0 )
+        {
+            beta[0] = gamma_new[0] / gamma_old[0];
+            beta[1] = gamma_new[1] / gamma_old[1];
+            alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
+            alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
+        }
+        else
+        {
+            beta[0] = 0.0;
+            beta[1] = 0.0;
+            alpha[0] = gamma_new[0] / delta[0];
+            alpha[1] = gamma_new[1] / delta[1];
+        }
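+
+        /* the eight vector recurrences (z2, q2, p2, d2, x, u2, w2, r2) and
+         * the partial dot products for the next reduction are fused into a
+         * single pass over the local rows to limit memory traffic */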
+        t_start = MPI_Wtime( );
+        //Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
+        //Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
+        //Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
+        //Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
+        //Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
+        //Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
+        //Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
+        //Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
+        //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
+        //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
+        //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
+        for ( j = 0; j < 6; ++j )
+        {
+            redux[j] = 0.0;
+        }
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0];
+            workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1];
+
+            workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0];
+            workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1];
+
+            workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0];
+            workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1];
+
+            workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0];
+            workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1];
+
+            x[j][0] += alpha[0] * workspace->p2[j][0];
+            x[j][1] += alpha[1] * workspace->p2[j][1];
+
+            workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0];
+            workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1];
+
+            workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0];
+            workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1];
+
+            workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0];
+            workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1];
+
+            redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
+            redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
+
+            redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
+            redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
+
+            redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
+            redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
+        }
+        t_vops += MPI_Wtime( ) - t_start;
+
+        MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
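+
+        /* overlap window: the preconditioner application and SpMV below
+         * proceed while the reduction above is in flight; the matching
+         * MPI_Wait at the bottom of the loop retrieves the global sums */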
+        /* pre-conditioning */
+        if ( control->cm_solver_pre_comp_type == NONE_PC )
+        {
+            //Vector_Copy( workspace->m, workspace->w, system->n );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m2[j][0] = workspace->w2[j][0];
+                workspace->m2[j][1] = workspace->w2[j][1];
+            }
+        }
+        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
+        {
+            t_start = MPI_Wtime( );
+            for ( j = 0; j < system->n; ++j )
+            {
+                workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
+                workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
+            }
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+        else if ( control->cm_solver_pre_comp_type == SAI_PC )
+        {
+            t_start = MPI_Wtime( );
+            Dist( system, mpi_data, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+
+            t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+            dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
+#else
+            dual_Sparse_MatVec( workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
+#endif
+            t_pa += MPI_Wtime( ) - t_start;
+        }
+
+        t_start = MPI_Wtime( );
+        Dist( system, mpi_data, workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+        t_comm += MPI_Wtime( ) - t_start;
+
+        t_start = MPI_Wtime( );
+#if defined(NEUTRAL_TERRITORY)
+        dual_Sparse_MatVec( H, workspace->m2, workspace->n2, H->NT );
+#else
+        dual_Sparse_MatVec( H, workspace->m2, workspace->n2, system->N );
+#endif
+        t_spmv += MPI_Wtime( ) - t_start;
+
+        if ( H->format == SYM_HALF_MATRIX )
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#if defined(NEUTRAL_TERRITORY)
+        else
+        {
+            t_start = MPI_Wtime( );
+            Coll( system, mpi_data, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
+            t_comm += MPI_Wtime( ) - t_start;
+        }
+#endif
+
+        gamma_old[0] = gamma_new[0];
+        gamma_old[1] = gamma_new[1];
+
+        t_start = MPI_Wtime( );
+        MPI_Wait( &req, MPI_STATUS_IGNORE );
+        t_allreduce += MPI_Wtime( ) - t_start;
+        delta[0] = redux[0];
+        delta[1] = redux[1];
+        gamma_new[0] = redux[2];
+        gamma_new[1] = redux[3];
+        norm[0] = sqrt( redux[4] );
+        norm[1] = sqrt( redux[5] );
+    }
+
+    timings[0] = t_pa;
+    timings[1] = t_spmv;
+    timings[2] = t_vops;
+    timings[3] = t_comm;
+    timings[4] = t_allreduce;
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+
+        data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
+        data->timing.cm_solver_spmv += timings[1] / control->nprocs;
+        data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
+        data->timing.cm_solver_comm += timings[3] / control->nprocs;
+        data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
+    }
+    else
+    {
+        MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
+    }
+
+    // continue to solve the system that has not converged yet
+    if ( norm[0] / b_norm[0] > tol )
+    {
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->s[j] = workspace->x[j][0];
+        }
+
+        i += PIPECG( system, control, data, workspace,
+                H, workspace->b_s, tol, workspace->s, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][0] = workspace->s[j];
+        }
+    }
+    else if ( norm[1] / b_norm[1] > tol )
+    {
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->t[j] = workspace->x[j][1];
+        }
+
+        i += PIPECG( system, control, data, workspace,
+                H, workspace->b_t, tol, workspace->t, mpi_data );
+
+        for ( j = 0; j < system->n; ++j )
+        {
+            workspace->x[j][1] = workspace->t[j];
+        }
+    }
+
+    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
+    {
+        fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
+        return i;
+    }
+
+    return i;
+}
+
+
 /* Pipelined Preconditioned Conjugate Gradient Method
  *
  * References:
  * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
  *    P. Ghysels and W. Vanroose, Parallel Computing, 2014.
  * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
  *    Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
  *    for High Performance Computing, Networking, Storage and Analysis, 2016.
@@ -2647,7 +3070,6 @@ int PIPECR( reax_system *system, control_params *control, simulation_data *data,
         t_comm += MPI_Wtime( ) - t_start;
     }
 #endif
-
     t_start = MPI_Wtime( );
     MPI_Wait( &req, MPI_STATUS_IGNORE );
     t_allreduce += MPI_Wtime( ) - t_start;
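What distinguishes dual_PIPECG from dual_CG is visible in the loop structure above: the MPI_Iallreduce is posted as soon as the local dot products are formed, and the preconditioner application plus one SpMV execute before the matching MPI_Wait. The same pattern reduced to a self-contained toy program, assuming an MPI-3 library (local_work, LOCAL_N, and the data below are placeholders rather than PuReMD code):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

enum { LOCAL_N = 1 << 20 };

/* placeholder for local work that does not depend on the reduction result,
 * i.e., the preconditioner application and SpMV in the real solver */
static void local_work( double *v, int n )
{
    int i;

    for ( i = 0; i < n; ++i )
    {
        v[i] = 0.5 * v[i] + 1.0;
    }
}

int main( int argc, char *argv[] )
{
    int i, rank;
    double *u, *w, redux[2];
    MPI_Request req;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    u = malloc( sizeof(double) * LOCAL_N );
    w = malloc( sizeof(double) * LOCAL_N );
    for ( i = 0; i < LOCAL_N; ++i )
    {
        u[i] = 1.0;
        w[i] = 2.0;
    }

    /* local partial dot products, reduced together in one message */
    redux[0] = 0.0;
    redux[1] = 0.0;
    for ( i = 0; i < LOCAL_N; ++i )
    {
        redux[0] += w[i] * u[i];
        redux[1] += u[i] * u[i];
    }

    MPI_Iallreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD, &req );

    /* overlapped with the reduction in flight */
    local_work( u, LOCAL_N );

    MPI_Wait( &req, MPI_STATUS_IGNORE );

    if ( rank == 0 )
    {
        printf( "global (w,u) = %g, (u,u) = %g\n", redux[0], redux[1] );
    }

    free( u );
    free( w );
    MPI_Finalize( );

    return 0;
}

In classical CG the equivalent MPI_Allreduce is blocking, so every iteration pays the full latency of the global reduction; here that latency costs only whatever exceeds the duration of the overlapped local work.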
diff --git a/PuReMD/src/linear_solvers.h b/PuReMD/src/linear_solvers.h
index 42892a8f1a199bc53a75a39e433ff3c0ff4138c6..701ee82d695150b7f4a8660ee843cf46cc6fa873 100644
--- a/PuReMD/src/linear_solvers.h
+++ b/PuReMD/src/linear_solvers.h
@@ -37,6 +37,9 @@ int dual_CG( reax_system*, control_params*, simulation_data*, storage*, sparse_m
 int CG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes* );
 
+int dual_PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
+        rvec2*, real, rvec2*, mpi_datatypes* );
+
 int PIPECG( reax_system*, control_params*, simulation_data*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes* );
 
diff --git a/PuReMD/src/qEq.c b/PuReMD/src/qEq.c
index 96336c6693c97b740eadf59072340ddc0c595aa3..3ba18eeb8bb78533f4b894d7b28ff8b29b86f0a1 100644
--- a/PuReMD/src/qEq.c
+++ b/PuReMD/src/qEq.c
@@ -445,7 +445,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     switch ( control->cm_solver_type )
     {
     case CG_S:
-#if defined(DUAL_CG)
+#if defined(DUAL_SOLVER)
         iters = dual_CG( system, control, data, workspace, workspace->H, workspace->b,
                 control->cm_solver_q_err, workspace->x, mpi_data );
 #else
@@ -478,6 +478,10 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
         break;
 
     case PIPECG_S:
+#if defined(DUAL_SOLVER)
+        iters = dual_PIPECG( system, control, data, workspace, workspace->H, workspace->b,
+                control->cm_solver_q_err, workspace->x, mpi_data );
+#else
         for ( j = 0; j < system->n; ++j )
         {
             workspace->s[j] = workspace->x[j][0];
@@ -503,6 +507,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
         {
             workspace->x[j][1] = workspace->t[j];
         }
+#endif
         break;
 
     case PIPECR_S:
diff --git a/PuReMD/src/reax_types.h b/PuReMD/src/reax_types.h
index 224b78c767fdfaa46df12259f15f0f7e2632148e..c55d413ff8d8693e0f0c31641932b9c6ae0641e8 100644
--- a/PuReMD/src/reax_types.h
+++ b/PuReMD/src/reax_types.h
@@ -40,8 +40,8 @@
 
 /************* SOME DEFS - crucial for reax_types.h *********/
 #define PURE_REAX
-//#define DUAL_CG
-//#define NEUTRAL_TERRITORY
+#define DUAL_SOLVER
+#define NEUTRAL_TERRITORY
 //#define LAMMPS_REAX
 //#define DEBUG
 //#define DEBUG_FOCUS
@@ -1059,7 +1059,9 @@ typedef struct
     /* PIPECG, PIPECR storage */
     real *m, *n, *u, *w;
     /* dual-CG storage */
-    rvec2 *r2, *d2, *q2, *p2;
+    rvec2 *d2, *p2, *q2, *r2;
+    /* dual-PIPECG storage */
+    rvec2 *m2, *n2, *u2, *w2, *z2;
     /* Taper */
     real Tap[8];
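The storage change above is what enables the dual solve: the two QEq right-hand sides (for s and t) are interleaved as rvec2 so that every sparse-matrix sweep and every MPI message serves both systems at once. A small self-contained sketch of the idea; the crs_matrix struct and dual_spmv below are simplified stand-ins for PuReMD's sparse_matrix and dual_Sparse_MatVec, assuming rvec2 is real[2] as defined in reax_types.h:

#include <stdio.h>

typedef double real;
typedef real rvec2[2];

typedef struct
{
    int n;          /* number of rows */
    int *start;     /* row pointers, length n + 1 */
    int *j_idx;     /* column indices */
    real *val;      /* nonzero values */
} crs_matrix;

/* one sweep over H updates both packed solutions */
static void dual_spmv( const crs_matrix *H, const rvec2 *x, rvec2 *b )
{
    int i, k;

    for ( i = 0; i < H->n; ++i )
    {
        b[i][0] = 0.0;
        b[i][1] = 0.0;
        for ( k = H->start[i]; k < H->start[i + 1]; ++k )
        {
            const real h = H->val[k];   /* loaded once, used twice */
            b[i][0] += h * x[H->j_idx[k]][0];
            b[i][1] += h * x[H->j_idx[k]][1];
        }
    }
}

int main( void )
{
    /* 2x2 example: H = [[2, 1], [1, 3]] in CRS form */
    int start[] = { 0, 2, 4 };
    int j_idx[] = { 0, 1, 0, 1 };
    real val[] = { 2.0, 1.0, 1.0, 3.0 };
    crs_matrix H = { 2, start, j_idx, val };

    rvec2 x[2] = { { 1.0, -1.0 }, { 0.0, 2.0 } };  /* two packed vectors */
    rvec2 b[2];

    dual_spmv( &H, x, b );
    printf( "b0 = (%g, %g), b1 = (%g, %g)\n",
            b[0][0], b[0][1], b[1][0], b[1][1] );

    return 0;
}

Beyond halving the number of matrix reads, packing also halves the message count: each Dist/Coll exchange and each allreduce in dual_PIPECG carries data for both systems in one transfer.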