return i;
}
/* Dual iteration for the Pipelined Preconditioned Conjugate Gradient Method
* for QEq (2 simultaneous solves)
*
* References:
* 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
* P. Ghysels and W. Vanroose, Parallel Computing, 2014.
* 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
* Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
* for High Performance Computing, Networking, Storage and Analysis, 2016.
* */
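/* Sketch of the working vectors and recurrences as we read the code below
 * (one instance per right-hand side; this summary is our reading of the
 * implementation, not taken from the original authors' notes):
 *   r = b - A x,  u ~ M^{-1} r,  w = A u,  m = M^{-1} w,  n = A m,
 *   gamma = (r, u),  delta = (w, u),
 *   beta = gamma_new / gamma_old,
 *   alpha = gamma_new / (delta - (beta / alpha) * gamma_new),
 *   z <- n + beta z,  q <- m + beta q,  p <- u + beta p,  d <- w + beta d,
 *   x <- x + alpha p,  u <- u - alpha q,  w <- w - alpha z,  r <- r - alpha d.
 * A single non-blocking allreduce per iteration supplies gamma, delta, and
 * ||u|| while the preconditioner application and SpMV proceed, which is what
 * hides the global synchronization latency (reference 1 above). */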
int dual_PIPECG( reax_system const * const system, control_params const * const control,
simulation_data * const data,
storage * const workspace, sparse_matrix * const H, rvec2 * const b,
real tol, rvec2 * const x, mpi_datatypes * const mpi_data )
{
int i, j;
rvec2 alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
real timings[5], redux[8];
MPI_Request req;
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_comm = 0.0;
t_allreduce = 0.0;
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, x, workspace->u2, H->NT );
#else
dual_Sparse_MatVec_local( H, x, workspace->u2, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
//Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n );
for ( j = 0; j < system->n; ++j )
{
workspace->r2[j][0] = b[j][0] - workspace->u2[j][0];
workspace->r2[j][1] = b[j][1] - workspace->u2[j][1];
}
t_vops += MPI_Wtime( ) - t_start;
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
//Vector_Copy( workspace->u, workspace->r, system->n );
for ( j = 0; j < system->n ; ++j )
{
workspace->u2[j][0] = workspace->r2[j][0];
workspace->u2[j][1] = workspace->r2[j][1];
}
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->u2, H->NT );
#else
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->u2, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because u2 is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, workspace->u2, workspace->w2, H->NT );
#else
dual_Sparse_MatVec_local( H, workspace->u2, workspace->w2, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
//redux[0] = Dot_local( workspace->w, workspace->u, system->n );
//redux[1] = Dot_local( workspace->r, workspace->u, system->n );
//redux[2] = Dot_local( workspace->u, workspace->u, system->n );
//redux[3] = Dot_local( b, b, system->n );
for ( j = 0; j < 8; ++j )
{
redux[j] = 0.0;
}
for( j = 0; j < system->n; ++j )
{
redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
redux[6] += b[j][0] * b[j][0];
redux[7] += b[j][1] * b[j][1];
}
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
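/* the eight partial sums are, per solve: (w, u), (r, u), (u, u), and (b, b);
 * posting the reduction non-blocking lets it overlap with the preconditioner
 * application and SpMV below, and the later MPI_Wait collects the results */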
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
//Vector_Copy( workspace->m, workspace->w, system->n );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0];
workspace->m2[j][1] = workspace->w2[j][1];
}
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
#else
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m2 is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, H->NT );
#else
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
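/* note: the time attributed to the allreduce here is only the portion of the
 * MPI_Iallreduce not already hidden behind the preconditioner and SpMV work
 * above (our interpretation of the timing scheme, not a documented guarantee) */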
delta[0] = redux[0];
delta[1] = redux[1];
gamma_new[0] = redux[2];
gamma_new[1] = redux[3];
norm[0] = SQRT( redux[4] );
norm[1] = SQRT( redux[5] );
b_norm[0] = SQRT( redux[6] );
b_norm[1] = SQRT( redux[7] );
for ( i = 0; i < control->cm_solver_max_iters; ++i )
{
if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
{
break;
}
if ( i > 0 )
{
beta[0] = gamma_new[0] / gamma_old[0];
beta[1] = gamma_new[1] / gamma_old[1];
alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
}
else
{
beta[0] = 0.0;
beta[1] = 0.0;
alpha[0] = gamma_new[0] / delta[0];
alpha[1] = gamma_new[1] / delta[1];
}
t_start = MPI_Wtime( );
//Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
//Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
//Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
//Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
//Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
//Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
//Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
//Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
//redux[0] = Dot_local( workspace->w, workspace->u, system->n );
//redux[1] = Dot_local( workspace->r, workspace->u, system->n );
//redux[2] = Dot_local( workspace->u, workspace->u, system->n );
for ( j = 0; j < 6; ++j )
{
redux[j] = 0.0;
}
for ( j = 0; j < system->n; ++j )
{
workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0];
workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1];
workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0];
workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1];
workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0];
workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1];
workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0];
workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1];
x[j][0] += alpha[0] * workspace->p2[j][0];
x[j][1] += alpha[1] * workspace->p2[j][1];
workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0];
workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1];
workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0];
workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1];
workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0];
workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1];
redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
}
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
//Vector_Copy( workspace->m, workspace->w, system->n );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0];
workspace->m2[j][1] = workspace->w2[j][1];
}
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
#else
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m2 is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, H->NT );
#else
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
gamma_old[0] = gamma_new[0];
gamma_old[1] = gamma_new[1];
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta[0] = redux[0];
delta[1] = redux[1];
gamma_new[0] = redux[2];
gamma_new[1] = redux[3];
norm[0] = SQRT( redux[4] );
norm[1] = SQRT( redux[5] );
}
timings[0] = t_pa;
timings[1] = t_spmv;
timings[2] = t_vops;
timings[3] = t_comm;
timings[4] = t_allreduce;
if ( system->my_rank == MASTER_NODE )
{
MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
data->timing.cm_solver_spmv += timings[1] / control->nprocs;
data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
data->timing.cm_solver_comm += timings[3] / control->nprocs;
data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
}
else
{
MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
}
/* continue to solve the system that has not converged yet */
if ( norm[0] / b_norm[0] > tol )
{
for ( j = 0; j < system->n; ++j )
{
workspace->s[j] = workspace->x[j][0];
}
i += PIPECG( system, control, data, workspace,
H, workspace->b_s, tol, workspace->s, mpi_data );
for ( j = 0; j < system->n; ++j )
{
workspace->x[j][0] = workspace->s[j];
}
}
else if ( norm[1] / b_norm[1] > tol )
{
for ( j = 0; j < system->n; ++j )
{
workspace->t[j] = workspace->x[j][1];
}
i += PIPECG( system, control, data, workspace,
H, workspace->b_t, tol, workspace->t, mpi_data );
for ( j = 0; j < system->n; ++j )
{
workspace->x[j][1] = workspace->t[j];
}
}
if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
{
fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
return i;
}
return i;
}
/* Pipelined Preconditioned Conjugate Gradient Method
*
* References:
* 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
* P. Ghysels and W. Vanroose, Parallel Computing, 2014.
* 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
* Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
* for High Performance Computing, Networking, Storage and Analysis, 2016.
* */
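/* This is the single right-hand-side counterpart of dual_PIPECG above: the
 * same recurrences with scalar alpha and beta, and convergence is declared
 * when SQRT((u, u)) / SQRT((b, b)) <= tol, i.e. the norm of what the code
 * tracks as u (the preconditioned residual, in exact arithmetic) relative
 * to the norm of b. */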
int PIPECG( reax_system const * const system, control_params const * const control,
simulation_data * const data,
storage * const workspace, sparse_matrix * const H, real * const b,
real tol, real * const x, mpi_datatypes * const mpi_data )
{
int i, j;
real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
real timings[5], redux[4];
MPI_Request req;
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_comm = 0.0;
t_allreduce = 0.0;
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
x, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, x, workspace->u, H->NT );
#else
Sparse_MatVec_local( H, x, workspace->u, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->u, workspace->r, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because u is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->u, workspace->w, H->NT );
#else
Sparse_MatVec_local( H, workspace->u, workspace->w, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
redux[0] = Dot_local( workspace->w, workspace->u, system->n );
redux[1] = Dot_local( workspace->r, workspace->u, system->n );
redux[2] = Dot_local( workspace->u, workspace->u, system->n );
redux[3] = Dot_local( b, b, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->m, workspace->w, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta = redux[0];
gamma_new = redux[1];
norm = SQRT( redux[2] );
b_norm = SQRT( redux[3] );
for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
{
if ( i > 0 )
{
beta = gamma_new / gamma_old;
alpha = gamma_new / (delta - beta / alpha * gamma_new);
}
else
{
beta = 0.0;
alpha = gamma_new / delta;
}
t_start = MPI_Wtime( );
Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
redux[0] = Dot_local( workspace->w, workspace->u, system->n );
redux[1] = Dot_local( workspace->r, workspace->u, system->n );
redux[2] = Dot_local( workspace->u, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->m, workspace->w, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
gamma_old = gamma_new;
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta = redux[0];
gamma_new = redux[1];
norm = SQRT( redux[2] );
}
timings[0] = t_pa;
timings[1] = t_spmv;
timings[2] = t_vops;
timings[3] = t_comm;
timings[4] = t_allreduce;
if ( system->my_rank == MASTER_NODE )
{
MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
data->timing.cm_solver_spmv += timings[1] / control->nprocs;
data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
data->timing.cm_solver_comm += timings[3] / control->nprocs;
data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
}
else
{
MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
}
if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
{
fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
return i;
}
return i;
}
/* Pipelined Preconditioned Conjugate Residual Method
*
* References:
* 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
* P. Ghysels and W. Vanroose, Parallel Computing, 2014.
* */
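/* Reading of the variant below (our summary, not taken from the reference
 * itself): the (b, b) reduction is posted non-blocking ahead of the first
 * SpMV, the initial residual norm comes from a separate Parallel_Norm on u,
 * and each iteration applies the preconditioner first, forms gamma = (w, u)
 * and delta = (m, w), then overlaps the reduction of those dot products with
 * the SpMV on m. */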
int PIPECR( reax_system const * const system, control_params const * const control,
simulation_data * const data,
storage * const workspace, sparse_matrix * const H, real * const b,
real tol, real * const x, mpi_datatypes * const mpi_data )
{
int i, j;
real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
real timings[5], redux[3];
MPI_Request req;
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_comm = 0.0;
t_allreduce = 0.0;
t_start = MPI_Wtime( );
redux[0] = Dot_local( b, b, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
x, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, x, workspace->u, H->NT );
#else
Sparse_MatVec_local( H, x, workspace->u, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->u, workspace->r, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because u is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->u, workspace->w, H->NT );
#else
Sparse_MatVec_local( H, workspace->u, workspace->w, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
b_norm = SQRT( redux[0] );
t_start = MPI_Wtime( );
norm = Parallel_Norm( workspace->u, system->n, mpi_data->world );
t_allreduce += MPI_Wtime( ) - t_start;
for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
{
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->m, workspace->w, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m is only local portion */
}
t_start = MPI_Wtime( );
redux[0] = Dot_local( workspace->w, workspace->u, system->n );
redux[1] = Dot_local( workspace->m, workspace->w, system->n );
redux[2] = Dot_local( workspace->u, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
gamma_new = redux[0];
delta = redux[1];
norm = SQRT( redux[2] );
if ( i > 0 )
{
beta = gamma_new / gamma_old;
alpha = gamma_new / (delta - beta / alpha * gamma_new);
}
else
{
beta = 0.0;
alpha = gamma_new / delta;
}
t_start = MPI_Wtime( );
Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
t_vops += MPI_Wtime( ) - t_start;
gamma_old = gamma_new;
}
timings[0] = t_pa;
timings[1] = t_spmv;
timings[2] = t_vops;
timings[3] = t_comm;
timings[4] = t_allreduce;
if ( system->my_rank == MASTER_NODE )
{
MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
data->timing.cm_solver_spmv += timings[1] / control->nprocs;
data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
data->timing.cm_solver_comm += timings[3] / control->nprocs;
data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
}
else
{
MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
}
if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
{
fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" );