diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c index 9a02f53853d6d3bc149f47df5625b7abb6c595ab..74e6b9394c949ee1e5d4cb54e74c480b5abe1993 100644 --- a/PG-PuReMD/src/allocate.c +++ b/PG-PuReMD/src/allocate.c @@ -206,10 +206,12 @@ static void Deallocate_Workspace_Part2( control_params * const control, sfree( workspace->d, "Deallocate_Workspace_Part2::workspace->d" ); sfree( workspace->q, "Deallocate_Workspace_Part2::workspace->q" ); sfree( workspace->p, "Deallocate_Workspace_Part2::workspace->p" ); +#if defined(DUAL_SOLVER) sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" ); sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" ); sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" ); sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" ); +#endif break; case SDM_S: @@ -217,10 +219,12 @@ static void Deallocate_Workspace_Part2( control_params * const control, sfree( workspace->d, "Deallocate_Workspace_Part2::workspace->d" ); sfree( workspace->q, "Deallocate_Workspace_Part2::workspace->q" ); sfree( workspace->p, "Deallocate_Workspace_Part2::workspace->p" ); +#if defined(DUAL_SOLVER) sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" ); sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" ); sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" ); sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" ); +#endif break; case BiCGStab_S: @@ -233,6 +237,17 @@ static void Deallocate_Workspace_Part2( control_params * const control, sfree( workspace->p, "Deallocate_Workspace_Part2::workspace->p" ); sfree( workspace->r_hat, "Deallocate_Workspace_Part2::workspace->r_hat" ); sfree( workspace->q_hat, "Deallocate_Workspace_Part2::workspace->q_hat" ); +#if defined(DUAL_SOLVER) + sfree( workspace->y2, "Deallocate_Workspace_Part2::workspace->y2" ); + sfree( workspace->g2, "Deallocate_Workspace_Part2::workspace->g2" ); + sfree( workspace->z2, "Deallocate_Workspace_Part2::workspace->z2" ); + sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" ); + sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" ); + sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" ); + sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" ); + sfree( workspace->r_hat2, "Deallocate_Workspace_Part2::workspace->r_hat2" ); + sfree( workspace->q_hat2, "Deallocate_Workspace_Part2::workspace->q_hat2" ); +#endif break; case PIPECG_S: @@ -245,6 +260,7 @@ static void Deallocate_Workspace_Part2( control_params * const control, sfree( workspace->n, "Deallocate_Workspace_Part2::workspace->n" ); sfree( workspace->u, "Deallocate_Workspace_Part2::workspace->u" ); sfree( workspace->w, "Deallocate_Workspace_Part2::workspace->w" ); +#if defined(DUAL_SOLVER) sfree( workspace->z2, "Deallocate_Workspace_Part2::workspace->z2" ); sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" ); sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" ); @@ -254,6 +270,7 @@ static void Deallocate_Workspace_Part2( control_params * const control, sfree( workspace->n2, "Deallocate_Workspace_Part2::workspace->n2" ); sfree( workspace->u2, "Deallocate_Workspace_Part2::workspace->u2" ); sfree( workspace->w2, "Deallocate_Workspace_Part2::workspace->w2" ); +#endif break; case PIPECR_S: @@ -266,6 +283,17 @@ static void Deallocate_Workspace_Part2( control_params * const control, sfree( workspace->n, "Deallocate_Workspace_Part2::workspace->n" ); sfree( workspace->u, 
"Deallocate_Workspace_Part2::workspace->u" ); sfree( workspace->w, "Deallocate_Workspace_Part2::workspace->w" ); +#if defined(DUAL_SOLVER) + sfree( workspace->z2, "Deallocate_Workspace_Part2::workspace->z2" ); + sfree( workspace->r2, "Deallocate_Workspace_Part2::workspace->r2" ); + sfree( workspace->d2, "Deallocate_Workspace_Part2::workspace->d2" ); + sfree( workspace->q2, "Deallocate_Workspace_Part2::workspace->q2" ); + sfree( workspace->p2, "Deallocate_Workspace_Part2::workspace->p2" ); + sfree( workspace->m2, "Deallocate_Workspace_Part2::workspace->m2" ); + sfree( workspace->n2, "Deallocate_Workspace_Part2::workspace->n2" ); + sfree( workspace->u2, "Deallocate_Workspace_Part2::workspace->u2" ); + sfree( workspace->w2, "Deallocate_Workspace_Part2::workspace->w2" ); +#endif break; default: @@ -433,10 +461,12 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons workspace->d = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::d" ); workspace->q = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::q" ); workspace->p = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::p" ); +#if defined(DUAL_SOLVER) workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" ); workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" ); workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" ); workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" ); +#endif break; case SDM_S: @@ -444,10 +474,12 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons workspace->d = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::d" ); workspace->q = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::q" ); workspace->p = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::p" ); +#if defined(DUAL_SOLVER) workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" ); workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" ); workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" ); workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" ); +#endif break; case BiCGStab_S: @@ -460,6 +492,17 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons workspace->p = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::p" ); workspace->r_hat = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::r_hat" ); workspace->q_hat = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::q_hat" ); +#if defined(DUAL_SOLVER) + workspace->y2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::y2" ); + workspace->g2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::g2" ); + workspace->z2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::z2" ); + workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" ); + workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" ); + workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" ); + workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" ); + workspace->r_hat2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r_hat2" ); + workspace->q_hat2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q_hat2" ); +#endif break; case PIPECG_S: @@ -472,6 +515,7 @@ void 
Allocate_Workspace_Part2( reax_system * const system, control_params * cons workspace->n = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::n" ); workspace->u = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::u" ); workspace->w = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::w" ); +#if defined(DUAL_SOLVER) workspace->z2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::z2" ); workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" ); workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" ); @@ -481,6 +525,7 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons workspace->n2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::n2" ); workspace->u2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::u2" ); workspace->w2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::w2" ); +#endif break; case PIPECR_S: @@ -493,6 +538,17 @@ void Allocate_Workspace_Part2( reax_system * const system, control_params * cons workspace->n = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::n" ); workspace->u = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::u" ); workspace->w = scalloc( total_cap, sizeof(real), "Allocate_Workspace_Part2::w" ); +#if defined(DUAL_SOLVER) + workspace->z2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::z2" ); + workspace->r2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::r2" ); + workspace->d2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::d2" ); + workspace->q2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::q2" ); + workspace->p2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::p2" ); + workspace->m2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::m2" ); + workspace->n2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::n2" ); + workspace->u2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::u2" ); + workspace->w2 = scalloc( total_cap, sizeof(rvec2), "Allocate_Workspace_Part2::w2" ); +#endif break; default: diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c index ed164cab6bc1c7ea42048b4b64200328af180429..a0b1c0252619195ec94c28fcd314671ca9b0c1c4 100644 --- a/PG-PuReMD/src/charges.c +++ b/PG-PuReMD/src/charges.c @@ -468,10 +468,8 @@ static void QEq( reax_system const * const system, control_params const * const case SDM_S: #if defined(DUAL_SOLVER) - fprintf( stderr, "[ERROR] Dual SDM solver for QEq not yet implemented. Terminating...\n" ); - exit( INVALID_INPUT ); -// iters = dual_SDM( system, control, data, workspace, &workspace->H, workspace->b, -// control->cm_solver_q_err, workspace->x, mpi_data, refactor ); + iters = dual_SDM( system, control, data, workspace, &workspace->H, workspace->b, + control->cm_solver_q_err, workspace->x, mpi_data, refactor ); #else iters = SDM( system, control, data, workspace, &workspace->H, workspace->b_s, control->cm_solver_q_err, workspace->s, mpi_data, refactor ); @@ -483,10 +481,8 @@ static void QEq( reax_system const * const system, control_params const * const case BiCGStab_S: #if defined(DUAL_SOLVER) - fprintf( stderr, "[ERROR] Dual BiCGStab solver for QEq not yet implemented. 
Terminating...\n" ); - exit( INVALID_INPUT ); -// iters = dual_BiCGStab( system, control, data, workspace, &workspace->H, workspace->b, -// control->cm_solver_q_err, workspace->x, mpi_data, refactor ); + iters = dual_BiCGStab( system, control, data, workspace, &workspace->H, workspace->b, + control->cm_solver_q_err, workspace->x, mpi_data, refactor ); #else iters = BiCGStab( system, control, data, workspace, &workspace->H, workspace->b_s, control->cm_solver_q_err, workspace->s, mpi_data, refactor ); @@ -511,10 +507,8 @@ static void QEq( reax_system const * const system, control_params const * const case PIPECR_S: #if defined(DUAL_SOLVER) - fprintf( stderr, "[ERROR] Dual PIPECR solver for QEq not yet implemented. Terminating...\n" ); - exit( INVALID_INPUT ); -// iters = dual_PIPECR( system, control, data, workspace, &workspace->H, workspace->b, -// control->cm_solver_q_err, workspace->x, mpi_data, refactor ); + iters = dual_PIPECR( system, control, data, workspace, &workspace->H, workspace->b, + control->cm_solver_q_err, workspace->x, mpi_data, refactor ); #else iters = PIPECR( system, control, data, workspace, &workspace->H, workspace->b_s, control->cm_solver_q_err, workspace->s, mpi_data, refactor ); diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c index 1fa0671107bbc426c692778b66c1f7e8aa3dbe70..fd7bd3d88e7fa0dc363e62bb0dc9cf830eb4f82f 100644 --- a/PG-PuReMD/src/init_md.c +++ b/PG-PuReMD/src/init_md.c @@ -881,10 +881,12 @@ static void Finalize_Workspace( reax_system * const system, control_params * con sfree( workspace->d, "Finalize_Workspace::workspace->d" ); sfree( workspace->q, "Finalize_Workspace::workspace->q" ); sfree( workspace->p, "Finalize_Workspace::workspace->p" ); +#if defined(DUAL_SOLVER) sfree( workspace->r2, "Finalize_Workspace::workspace->r2" ); sfree( workspace->d2, "Finalize_Workspace::workspace->d2" ); sfree( workspace->q2, "Finalize_Workspace::workspace->q2" ); sfree( workspace->p2, "Finalize_Workspace::workspace->p2" ); +#endif break; case SDM_S: @@ -892,10 +894,12 @@ static void Finalize_Workspace( reax_system * const system, control_params * con sfree( workspace->d, "Finalize_Workspace::workspace->d" ); sfree( workspace->q, "Finalize_Workspace::workspace->q" ); sfree( workspace->p, "Finalize_Workspace::workspace->p" ); +#if defined(DUAL_SOLVER) sfree( workspace->r2, "Finalize_Workspace::workspace->r2" ); sfree( workspace->d2, "Finalize_Workspace::workspace->d2" ); sfree( workspace->q2, "Finalize_Workspace::workspace->q2" ); sfree( workspace->p2, "Finalize_Workspace::workspace->p2" ); +#endif break; case BiCGStab_S: @@ -908,6 +912,17 @@ static void Finalize_Workspace( reax_system * const system, control_params * con sfree( workspace->p, "Finalize_Workspace::workspace->p" ); sfree( workspace->r_hat, "Finalize_Workspace::workspace->r_hat" ); sfree( workspace->q_hat, "Finalize_Workspace::workspace->q_hat" ); +#if defined(DUAL_SOLVER) + sfree( workspace->y2, "Finalize_Workspace::workspace->y2" ); + sfree( workspace->g2, "Finalize_Workspace::workspace->g2" ); + sfree( workspace->z2, "Finalize_Workspace::workspace->z2" ); + sfree( workspace->r2, "Finalize_Workspace::workspace->r2" ); + sfree( workspace->d2, "Finalize_Workspace::workspace->d2" ); + sfree( workspace->q2, "Finalize_Workspace::workspace->q2" ); + sfree( workspace->p2, "Finalize_Workspace::workspace->p2" ); + sfree( workspace->r_hat2, "Finalize_Workspace::workspace->r_hat2" ); + sfree( workspace->q_hat2, "Finalize_Workspace::workspace->q_hat2" ); +#endif break; case PIPECG_S: @@ -920,6 +935,7 @@ 
static void Finalize_Workspace( reax_system * const system, control_params * con sfree( workspace->n, "Finalize_Workspace::workspace->n" ); sfree( workspace->u, "Finalize_Workspace::workspace->u" ); sfree( workspace->w, "Finalize_Workspace::workspace->w" ); +#if defined(DUAL_SOLVER) sfree( workspace->z2, "Finalize_Workspace::workspace->z2" ); sfree( workspace->r2, "Finalize_Workspace::workspace->r2" ); sfree( workspace->d2, "Finalize_Workspace::workspace->d2" ); @@ -929,6 +945,7 @@ static void Finalize_Workspace( reax_system * const system, control_params * con sfree( workspace->n2, "Finalize_Workspace::workspace->n2" ); sfree( workspace->u2, "Finalize_Workspace::workspace->u2" ); sfree( workspace->w2, "Finalize_Workspace::workspace->w2" ); +#endif break; case PIPECR_S: @@ -941,6 +958,17 @@ static void Finalize_Workspace( reax_system * const system, control_params * con sfree( workspace->n, "Finalize_Workspace::workspace->n" ); sfree( workspace->u, "Finalize_Workspace::workspace->u" ); sfree( workspace->w, "Finalize_Workspace::workspace->w" ); +#if defined(DUAL_SOLVER) + sfree( workspace->z2, "Finalize_Workspace::workspace->z2" ); + sfree( workspace->r2, "Finalize_Workspace::workspace->r2" ); + sfree( workspace->d2, "Finalize_Workspace::workspace->d2" ); + sfree( workspace->q2, "Finalize_Workspace::workspace->q2" ); + sfree( workspace->p2, "Finalize_Workspace::workspace->p2" ); + sfree( workspace->m2, "Finalize_Workspace::workspace->m2" ); + sfree( workspace->n2, "Finalize_Workspace::workspace->n2" ); + sfree( workspace->u2, "Finalize_Workspace::workspace->u2" ); + sfree( workspace->w2, "Finalize_Workspace::workspace->w2" ); +#endif break; default: diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c index 2825724cd7f74ba57df5e874958ca0ab29df3548..35b3ef00f4c80c54df81397bf3f61989b1fd83f4 100644 --- a/PG-PuReMD/src/lin_alg.c +++ b/PG-PuReMD/src/lin_alg.c @@ -2491,6 +2491,154 @@ static void apply_preconditioner( reax_system const * const system, } +/* Steepest Descent + * This function performs dual iteration for QEq (2 simultaneous solves) + * */ +int dual_SDM( reax_system const * const system, control_params const * const control, + simulation_data * const data, storage * const workspace, + sparse_matrix * const H, rvec2 * const b, real tol, + rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) +{ + int i, j, ret; + rvec2 tmp, alpha, bnorm, sig; + real redux[4]; +#if defined(LOG_PERFORMANCE) + real time; +#endif + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, + H->NT, workspace->q2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, + system->N, workspace->q2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, + -1.0, -1.0, workspace->q2, system->n ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, + workspace->q2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, + workspace->d2, fresh_pre, RIGHT ); + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] ); + Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[2], &redux[3] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = 
MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + bnorm[0] = SQRT( redux[0] ); + bnorm[1] = SQRT( redux[1] ); + sig[0] = redux[2]; + sig[1] = redux[3]; + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + for ( i = 0; i < control->cm_solver_max_iters; ++i ) + { + if ( SQRT(sig[0]) / bnorm[0] <= tol || SQRT(sig[1]) / bnorm[1] <= tol ) + { + break; + } + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, + H->NT, workspace->q2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, + system->N, workspace->q2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[0], &redux[1] ); + Dot_local_rvec2( workspace->d2, workspace->q2, system->n, &redux[2], &redux[3] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + sig[0] = redux[0]; + sig[1] = redux[1]; + tmp[0] = redux[2]; + tmp[1] = redux[3]; + alpha[0] = sig[0] / tmp[0]; + alpha[1] = sig[1] / tmp[1]; + Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n ); + Vector_Add_rvec2( workspace->r2, + -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, + workspace->q2, FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, + workspace->d2, FALSE, RIGHT ); + } + + /* continue to solve the system that has not converged yet */ + if ( sig[0] / bnorm[0] > tol ) + { + Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); + + i += SDM( system, control, data, workspace, + H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); + + Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); + } + else if ( sig[1] / bnorm[1] > tol ) + { + Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); + + i += SDM( system, control, data, workspace, + H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); + + Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); + } + + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) + { + fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", i ); + fprintf( stderr, " [INFO] Rel. residual error (s solve): %f\n", SQRT(sig[0]) / bnorm[0] ); + fprintf( stderr, " [INFO] Rel. 
residual error (t solve): %f\n", SQRT(sig[1]) / bnorm[1] ); + return i; + } + + return i; +} + + /* Steepest Descent */ int SDM( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, @@ -2605,7 +2753,7 @@ int SDM( reax_system const * const system, control_params const * const control, /* Dual iteration of the Preconditioned Conjugate Gradient Method - * for QEq (2 simaltaneous solves) */ + * for QEq (2 simultaneous solves) */ int dual_CG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, rvec2 * const b, @@ -2854,45 +3002,312 @@ int CG( reax_system const * const system, control_params const * const control, Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif - for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) - { + for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) + { +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, + H->NT, workspace->q ); +#else + Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, + system->N, workspace->q ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->d, system->n ); + Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, + workspace->q, FALSE, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, + workspace->p, FALSE, RIGHT ); + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + redux[0] = Dot_local( workspace->r, workspace->p, system->n ); + redux[1] = Dot_local( workspace->p, workspace->p, system->n ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + sig_old = sig_new; + sig_new = redux[0]; + r_norm = SQRT( redux[1] ); + beta = sig_new / sig_old; + Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + } + + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) + { + fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i ); + fprintf( stderr, " [INFO] Rel. residual error: %e\n", r_norm / b_norm ); + return i; + } + + return i; +} + + +/* Bi-conjugate gradient stabilized method with left preconditioning for + * solving nonsymmetric linear systems.
+ * This function performs dual iteration for QEq (2 simultaneous solves) + * + * system: + * workspace: struct containing storage for workspace for the linear solver + * control: struct containing parameters governing the simulation and numeric methods + * data: struct containing simulation data (e.g., atom info) + * H: sparse, symmetric matrix, lower half stored in CSR format + * b: right-hand side of the linear system + * tol: tolerance compared against the relative residual for determining convergence + * x: initial guess + * mpi_data: + * + * Reference: Netlib (in MATLAB) + * http://www.netlib.org/templates/matlab/bicgstab.m + * */ +int dual_BiCGStab( reax_system const * const system, control_params const * const control, + simulation_data * const data, + storage * const workspace, sparse_matrix * const H, rvec2 * const b, + real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) +{ + int i, j, ret; + rvec2 tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm; + real time, redux[4]; + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, + H->NT, workspace->d2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, + system->N, workspace->d2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->d2, system->n ); + Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] ); + Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[2], &redux[3] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + b_norm[0] = SQRT( redux[0] ); + b_norm[1] = SQRT( redux[1] ); + r_norm[0] = SQRT( redux[2] ); + r_norm[1] = SQRT( redux[3] ); + if ( b_norm[0] == 0.0 ) + { + b_norm[0] = 1.0; + } + if ( b_norm[1] == 0.0 ) + { + b_norm[1] = 1.0; + } + Vector_Copy_rvec2( workspace->r_hat2, workspace->r2, system->n ); + omega[0] = 1.0; + omega[1] = 1.0; + rho[0] = 1.0; + rho[1] = 1.0; + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + for ( i = 0; i < control->cm_solver_max_iters; ++i ) + { + if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol ) + { + break; + } + + Dot_local_rvec2( workspace->r_hat2, workspace->r2, system->n, + &redux[0], &redux[1] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + rho[0] = redux[0]; + rho[1] = redux[1]; + if ( rho[0] == 0.0 || rho[1] == 0.0 ) + { + break; + } + if ( i > 0 ) + { + beta[0] = (rho[0] / rho_old[0]) * (alpha[0] / omega[0]); + beta[1] = (rho[1] / rho_old[1]) * (alpha[1] / omega[1]); + Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->p2, + -1.0 * omega[0], -1.0 * omega[1], workspace->z2, system->n ); + Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->r2, + beta[0], beta[1], workspace->q2, system->n ); + } + else + { + Vector_Copy_rvec2( workspace->p2, workspace->r2, system->n ); + } + +#if defined(LOG_PERFORMANCE)
+ Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->p2, + workspace->y2, i == 0 ? fresh_pre : FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2, + workspace->d2, i == 0 ? fresh_pre : FALSE, RIGHT ); + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, + H->NT, workspace->z2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, + system->N, workspace->z2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Dot_local_rvec2( workspace->r_hat2, workspace->z2, system->n, + &redux[0], &redux[1] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + tmp[0] = redux[0]; + tmp[1] = redux[1]; + alpha[0] = rho[0] / tmp[0]; + alpha[1] = rho[1] / tmp[1]; + Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->r2, + -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n ); + Dot_local_rvec2( workspace->q2, workspace->q2, system->n, + &redux[0], &redux[1] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + tmp[0] = redux[0]; + tmp[1] = redux[1]; + /* early convergence check */ + if ( tmp[0] < tol || tmp[1] < tol ) + { + Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n ); + break; + } + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, + workspace->y2, FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2, + workspace->q_hat2, FALSE, RIGHT ); + #if defined(NEUTRAL_TERRITORY) - Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, - H->NT, workspace->q ); + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2, + H->NT, workspace->y2 ); #else - Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, - system->N, workspace->q ); + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2, + system->N, workspace->y2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif - tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD ); - -#if defined(LOG_PERFORMANCE) - Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); -#endif - - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->d, system->n ); - Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n ); + Dot_local_rvec2( workspace->y2, workspace->q2, system->n, &redux[0], &redux[1] ); + Dot_local_rvec2( workspace->y2, workspace->y2, system->n, &redux[2], &redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, - workspace->q, FALSE, LEFT ); - 
apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, - workspace->p, FALSE, RIGHT ); + ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) - time = Get_Time( ); + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif - redux[0] = Dot_local( workspace->r, workspace->p, system->n ); - redux[1] = Dot_local( workspace->p, workspace->p, system->n ); + sigma[0] = redux[0]; + sigma[1] = redux[1]; + tmp[0] = redux[2]; + tmp[1] = redux[3]; + omega[0] = sigma[0] / tmp[0]; + omega[1] = sigma[1] / tmp[1]; + Vector_Sum_rvec2( workspace->g2, alpha[0], alpha[1], workspace->d2, + omega[0], omega[1], workspace->q_hat2, system->n ); + Vector_Add_rvec2( x, 1.0, 1.0, workspace->g2, system->n ); + Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->q2, + -1.0 * omega[0], -1.0 * omega[1], workspace->y2, system->n ); + Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[0], &redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); @@ -2906,22 +3321,58 @@ int CG( reax_system const * const system, control_params const * const control, Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif - sig_old = sig_new; - sig_new = redux[0]; - r_norm = SQRT( redux[1] ); - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n ); + r_norm[0] = SQRT( redux[0] ); + r_norm[1] = SQRT( redux[1] ); + if ( omega[0] == 0.0 || omega[1] == 0.0 ) + { + break; + } + rho_old[0] = rho[0]; + rho_old[1] = rho[1]; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } - if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) + if ( (omega[0] == 0.0 || omega[1] == 0.0) && system->my_rank == MASTER_NODE ) { - fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i ); - fprintf( stderr, " [INFO] Rel. residual error: %e\n", r_norm / b_norm ); - return i; + fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i ); + fprintf( stderr, " [INFO] omega = %e\n", omega ); + } + else if ( (rho[0] == 0.0 || rho[1] == 0.0) && system->my_rank == MASTER_NODE ) + { + fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i ); + fprintf( stderr, " [INFO] rho = %e\n", rho ); + } + + /* continue to solve the system that has not converged yet */ + if ( r_norm[0] / b_norm[0] > tol ) + { + Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); + + i += BiCGStab( system, control, data, workspace, + H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); + + Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); + } + else if ( r_norm[1] / b_norm[1] > tol ) + { + Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); + + i += BiCGStab( system, control, data, workspace, + H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); + + Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); + } + + + if ( i >= control->cm_solver_max_iters + && system->my_rank == MASTER_NODE ) + { + fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i ); + fprintf( stderr, " [INFO] Rel. residual error (s solve): %e\n", r_norm[0] / b_norm[0] ); + fprintf( stderr, " [INFO] Rel. 
residual error (t solve): %e\n", r_norm[1] / b_norm[1] ); } return i; @@ -3572,6 +4023,209 @@ int PIPECG( reax_system const * const system, control_params const * const contr } +/* Pipelined Preconditioned Conjugate Residual Method. + * This function performs dual iteration for QEq (2 simultaneous solves) + * + * References: + * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, + * P. Ghysels and W. Vanroose, Parallel Computing, 2014. + * */ +int dual_PIPECR( reax_system const * const system, control_params const * const control, + simulation_data * const data, + storage * const workspace, sparse_matrix * const H, rvec2 * const b, + real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) +{ + int i, j, ret; + rvec2 alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm; + real redux[6]; + MPI_Request req; +#if defined(LOG_PERFORMANCE) + real time; +#endif + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, + H->NT, workspace->u2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, + system->N, workspace->u2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->u2, system->n ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, + workspace->n2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2, + workspace->u2, fresh_pre, RIGHT ); + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] ); + Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[2], &redux[3] ); + + ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD, &req ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2, + H->NT, workspace->w2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2, + system->N, workspace->w2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + b_norm[0] = SQRT( redux[0] ); + b_norm[1] = SQRT( redux[1] ); + r_norm[0] = SQRT( redux[2] ); + r_norm[1] = SQRT( redux[3] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + for ( i = 0; i < control->cm_solver_max_iters; ++i ) + { + if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol ) + { + break; + } + + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2, + workspace->n2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2, + workspace->m2, fresh_pre, RIGHT ); + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] ); + Dot_local_rvec2( workspace->m2, workspace->w2, system->n, &redux[2], &redux[3] ); + Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, 
&data->timing.cm_solver_vector_ops ); +#endif + + ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD, &req ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, + H->NT, workspace->n2 ); +#else + Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, + system->N, workspace->n2 ); +#endif + +#if defined(LOG_PERFORMANCE) + time = Get_Time( ); +#endif + + ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + gamma_new[0] = redux[0]; + gamma_new[1] = redux[1]; + delta[0] = redux[2]; + delta[1] = redux[3]; + r_norm[0] = SQRT( redux[4] ); + r_norm[1] = SQRT( redux[5] ); + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); +#endif + + if ( i > 0 ) + { + beta[0] = gamma_new[0] / gamma_old[0]; + beta[1] = gamma_new[1] / gamma_old[1]; + alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]); + alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]); + } + else + { + beta[0] = 0.0; + beta[1] = 0.0; + alpha[0] = gamma_new[0] / delta[0]; + alpha[1] = gamma_new[1] / delta[1]; + } + + Vector_Sum_rvec2( workspace->z2, 1.0, 1.0, workspace->n2, + beta[0], beta[1], workspace->z2, system->n ); + Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->m2, + beta[0], beta[1], workspace->q2, system->n ); + Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->u2, + beta[0], beta[1], workspace->p2, system->n ); + Vector_Sum_rvec2( workspace->d2, 1.0, 1.0, workspace->w2, + beta[0], beta[1], workspace->d2, system->n ); + Vector_Sum_rvec2( x, 1.0, 1.0, x, alpha[0], alpha[1], workspace->p2, system->n ); + Vector_Sum_rvec2( workspace->u2, 1.0, 1.0, workspace->u2, + -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n ); + Vector_Sum_rvec2( workspace->w2, 1.0, 1.0, workspace->w2, + -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n ); + Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->r2, + -1.0 * alpha[0], -1.0 * alpha[1], workspace->d2, system->n ); + + gamma_old[0] = gamma_new[0]; + gamma_old[1] = gamma_new[1]; + +#if defined(LOG_PERFORMANCE) + Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); +#endif + } + + /* continue to solve the system that has not converged yet */ + if ( r_norm[0] / b_norm[0] > tol ) + { + Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); + + i += PIPECR( system, control, data, workspace, + H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); + + Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); + } + else if ( r_norm[1] / b_norm[1] > tol ) + { + Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); + + i += PIPECR( system, control, data, workspace, + H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); + + Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); + } + + if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) + { + fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" ); + return i; + } + + return i; +} + + /* Pipelined Preconditioned Conjugate Residual Method * * References: diff --git a/PG-PuReMD/src/lin_alg.h b/PG-PuReMD/src/lin_alg.h index e20a97fac2bf2f6e6b01d65961c05bd01bb3e79d..242a01ada847d0690bb47beba9e4563be9c72923 100644 --- a/PG-PuReMD/src/lin_alg.h +++ b/PG-PuReMD/src/lin_alg.h @@ -44,6 +44,11 @@ real sparse_approx_inverse( reax_system const * const, storage * const, mpi_datatypes * 
const, sparse_matrix * const, sparse_matrix * const, sparse_matrix * const, int ); +int dual_SDM( reax_system const * const, control_params const * const, + simulation_data * const, + storage * const, sparse_matrix * const, rvec2 * const, + real, rvec2 * const, mpi_datatypes * const, int ); + int SDM( reax_system const * const, control_params const * const, simulation_data * const, storage * const, sparse_matrix * const, real * const, @@ -59,6 +64,11 @@ int CG( reax_system const * const, control_params const * const, storage * const, sparse_matrix * const, real * const, real, real * const, mpi_datatypes * const, int ); +int dual_BiCGStab( reax_system const * const, control_params const * const, + simulation_data * const, + storage * const, sparse_matrix * const, rvec2 * const, + real, rvec2 * const, mpi_datatypes * const, int ); + int BiCGStab( reax_system const * const, control_params const * const, simulation_data * const, storage * const, sparse_matrix * const, real * const, @@ -74,6 +84,11 @@ int PIPECG( reax_system const * const, control_params const * const, storage * const, sparse_matrix * const, real * const, real, real * const, mpi_datatypes * const, int ); +int dual_PIPECR( reax_system const * const, control_params const * const, + simulation_data * const, + storage * const, sparse_matrix * const, rvec2 * const, + real, rvec2 * const, mpi_datatypes * const, int ); + int PIPECR( reax_system const * const, control_params const * const, simulation_data * const, storage * const, sparse_matrix * const, real * const,
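
Note (not part of the patch): the dual_* solvers added above pack the two QEq right-hand sides (the s and t systems) into rvec2 entries so that a single sweep of matrix-vector products and MPI_Allreduce calls advances both solves in lockstep, and they then fall back to the corresponding single-RHS solver for whichever system has not yet converged. The standalone C sketch below illustrates that lockstep iteration for an unpreconditioned steepest-descent step on a small diagonal SPD matrix. All names here (rvec2, dual_sdm_diag, N, TOL, etc.) are local to this example and are not the PG-PuReMD API.

#include <math.h>
#include <stdio.h>

typedef double rvec2[2];    /* component 0: "s" solve, component 1: "t" solve */

#define N 4
#define MAX_ITERS 200
#define TOL 1.0e-10

/* lockstep steepest descent for A x = b with two right-hand sides,
 * where A is diagonal and SPD (purely illustrative) */
static int dual_sdm_diag( const double A[N], rvec2 b[N], rvec2 x[N] )
{
    int i, k;
    rvec2 r[N], q[N], sig, tmp, alpha, bnorm;

    bnorm[0] = 0.0;
    bnorm[1] = 0.0;
    for ( i = 0; i < N; ++i )
    {
        r[i][0] = b[i][0] - A[i] * x[i][0];
        r[i][1] = b[i][1] - A[i] * x[i][1];
        bnorm[0] += b[i][0] * b[i][0];
        bnorm[1] += b[i][1] * b[i][1];
    }
    bnorm[0] = sqrt( bnorm[0] );
    bnorm[1] = sqrt( bnorm[1] );

    for ( k = 0; k < MAX_ITERS; ++k )
    {
        sig[0] = 0.0;
        sig[1] = 0.0;
        tmp[0] = 0.0;
        tmp[1] = 0.0;
        for ( i = 0; i < N; ++i )
        {
            /* one pass over the matrix serves both solves, analogous to Dual_Sparse_MatVec */
            q[i][0] = A[i] * r[i][0];
            q[i][1] = A[i] * r[i][1];
            sig[0] += r[i][0] * r[i][0];
            sig[1] += r[i][1] * r[i][1];
            tmp[0] += r[i][0] * q[i][0];
            tmp[1] += r[i][1] * q[i][1];
        }

        /* stop once either system meets the relative-residual tolerance;
         * a production dual solver then finishes the other system with a
         * single-RHS solve, as the patch does above */
        if ( sqrt( sig[0] ) / bnorm[0] <= TOL || sqrt( sig[1] ) / bnorm[1] <= TOL )
        {
            break;
        }

        alpha[0] = sig[0] / tmp[0];
        alpha[1] = sig[1] / tmp[1];
        for ( i = 0; i < N; ++i )
        {
            x[i][0] += alpha[0] * r[i][0];
            x[i][1] += alpha[1] * r[i][1];
            r[i][0] -= alpha[0] * q[i][0];
            r[i][1] -= alpha[1] * q[i][1];
        }
    }

    return k;
}

int main( void )
{
    const double A[N] = { 4.0, 3.0, 2.0, 1.0 };
    rvec2 b[N] = { { 1.0, 2.0 }, { 1.0, 0.0 }, { 0.0, 1.0 }, { 2.0, 2.0 } };
    rvec2 x[N] = { { 0.0, 0.0 }, { 0.0, 0.0 }, { 0.0, 0.0 }, { 0.0, 0.0 } };
    int iters = dual_sdm_diag( A, b, x );

    printf( "stopped after %d iters; x[0] = (%f, %f)\n", iters, x[0][0], x[0][1] );

    return 0;
}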