Newer
Older
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
//Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n );
for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
workspace->r2[j][0] = b[j][0] - workspace->u2[j][0];
workspace->r2[j][1] = b[j][1] - workspace->u2[j][1];
Kurt A. O'Hearn
committed
t_vops += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
Kurt A. O'Hearn
committed
//Vector_Copy( workspace->u, workspace->r, system->n );
for ( j = 0; j < system->n ; ++j )
{
workspace->u2[j][0] = workspace->r2[j][0];
workspace->u2[j][1] = workspace->r2[j][1];
}
Kurt A. O'Hearn
committed
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
else if ( control->cm_solver_pre_comp_type == SAI_PC )
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->u2, H->NT );
#else
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->u2, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because u2 is only local portion */
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, workspace->u2, workspace->w2, H->NT );
#else
dual_Sparse_MatVec_local( H, workspace->u2, workspace->w2, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
//redux[0] = Dot_local( workspace->w, workspace->u, system->n );
//redux[1] = Dot_local( workspace->r, workspace->u, system->n );
//redux[2] = Dot_local( workspace->u, workspace->u, system->n );
//redux[3] = Dot_local( b, b, system->n );
for ( j = 0; j < 8; ++j )
Kurt A. O'Hearn
committed
redux[j] = 0.0;
Kurt A. O'Hearn
committed
for( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
redux[6] += b[j][0] * b[j][0];
redux[7] += b[j][1] * b[j][1];
Kurt A. O'Hearn
committed
t_vops += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
Kurt A. O'Hearn
committed
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
Kurt A. O'Hearn
committed
//Vector_Copy( workspace->m, workspace->w, system->n );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0];
workspace->m2[j][1] = workspace->w2[j][1];
}
Kurt A. O'Hearn
committed
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
#else
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
/* no comm part2 because m2 is only local portion */
}
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, H->NT );
#else
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, system->N );
Kurt A. O'Hearn
committed
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta[0] = redux[0];
delta[1] = redux[1];
gamma_new[0] = redux[2];
gamma_new[1] = redux[3];
norm[0] = sqrt( redux[4] );
norm[1] = sqrt( redux[5] );
b_norm[0] = sqrt( redux[6] );
b_norm[1] = sqrt( redux[7] );
for ( i = 0; i < control->cm_solver_max_iters; ++i )
{
if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
{
break;
}
if ( i > 0 )
Kurt A. O'Hearn
committed
beta[0] = gamma_new[0] / gamma_old[0];
beta[1] = gamma_new[1] / gamma_old[1];
alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
}
else
{
beta[0] = 0.0;
beta[1] = 0.0;
alpha[0] = gamma_new[0] / delta[0];
alpha[1] = gamma_new[1] / delta[1];
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
//Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
//Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
//Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
//Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
//Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
//Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
//Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
//Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
//redux[0] = Dot_local( workspace->w, workspace->u, system->n );
//redux[1] = Dot_local( workspace->r, workspace->u, system->n );
//redux[2] = Dot_local( workspace->u, workspace->u, system->n );
for ( j = 0; j < 6; ++j )
Kurt A. O'Hearn
committed
redux[j] = 0.0;
}
for ( j = 0; j < system->n; ++j )
{
workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0];
workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1];
Kurt A. O'Hearn
committed
workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0];
workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1];
Kurt A. O'Hearn
committed
workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0];
workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1];
Kurt A. O'Hearn
committed
workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0];
workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1];
Kurt A. O'Hearn
committed
x[j][0] += alpha[0] * workspace->p2[j][0];
x[j][1] += alpha[1] * workspace->p2[j][1];
Kurt A. O'Hearn
committed
workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0];
workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1];
Kurt A. O'Hearn
committed
workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0];
workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1];
Kurt A. O'Hearn
committed
workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0];
workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1];
redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
redux[5] += workspace->u2[j][1] * workspace->u2[j][1];
Kurt A. O'Hearn
committed
t_vops += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
Kurt A. O'Hearn
committed
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
Kurt A. O'Hearn
committed
//Vector_Copy( workspace->m, workspace->w, system->n );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0];
workspace->m2[j][1] = workspace->w2[j][1];
}
Kurt A. O'Hearn
committed
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
#else
dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m2 is only local portion */
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, H->NT );
#else
dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, system->N );
Kurt A. O'Hearn
committed
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
gamma_old[0] = gamma_new[0];
gamma_old[1] = gamma_new[1];
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta[0] = redux[0];
delta[1] = redux[1];
gamma_new[0] = redux[2];
gamma_new[1] = redux[3];
norm[0] = sqrt( redux[4] );
norm[1] = sqrt( redux[5] );
}
Kurt A. O'Hearn
committed
timings[0] = t_pa;
timings[1] = t_spmv;
timings[2] = t_vops;
timings[3] = t_comm;
timings[4] = t_allreduce;
Kurt A. O'Hearn
committed
if ( system->my_rank == MASTER_NODE )
{
MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
data->timing.cm_solver_spmv += timings[1] / control->nprocs;
data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
data->timing.cm_solver_comm += timings[3] / control->nprocs;
data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
}
else
{
MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
Kurt A. O'Hearn
committed
/* continue to solve the system that has not converged yet */
if ( norm[0] / b_norm[0] > tol )
Kurt A. O'Hearn
committed
for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
workspace->s[j] = workspace->x[j][0];
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
i += PIPECG( system, control, data, workspace,
H, workspace->b_s, tol, workspace->s, mpi_data );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
workspace->x[j][0] = workspace->s[j];
Kurt A. O'Hearn
committed
else if ( norm[1] / b_norm[1] > tol )
Kurt A. O'Hearn
committed
for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
workspace->t[j] = workspace->x[j][1];
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
i += PIPECG( system, control, data, workspace,
H, workspace->b_t, tol, workspace->t, mpi_data );
Kurt A. O'Hearn
committed
Kurt A. O'Hearn
committed
for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
workspace->x[j][1] = workspace->t[j];
Kurt A. O'Hearn
committed
if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
Kurt A. O'Hearn
committed
fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
return i;
Kurt A. O'Hearn
committed
return i;
Kurt A. O'Hearn
committed
/* Pipelined Preconditioned Conjugate Gradient Method
*
* References:
* 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
* P. Ghysels and W. Vanroose, Parallel Computing, 2014.
* 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
* Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
* for High Performance Computing, Networking, Storage and Analysis, 2016.
* */
int PIPECG( reax_system const * const system, control_params const * const control,
simulation_data * const data,
storage * const workspace, sparse_matrix * const H, real * const b,
real tol, real * const x, mpi_datatypes * const mpi_data )
Kurt A. O'Hearn
committed
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
int i, j;
real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
real timings[5], redux[4];
MPI_Request req;
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_comm = 0.0;
t_allreduce = 0.0;
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
x, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, x, workspace->u, H->NT );
#else
Sparse_MatVec_local( H, x, workspace->u, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->u, workspace->r, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
/* no comm part2 because u is only local portion */
}
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->u, workspace->w, H->NT );
#else
Sparse_MatVec_local( H, workspace->u, workspace->w, system->N );
#endif
Kurt A. O'Hearn
committed
t_spmv += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
redux[0] = Dot_local( workspace->w, workspace->u, system->n );
redux[1] = Dot_local( workspace->r, workspace->u, system->n );
redux[2] = Dot_local( workspace->u, workspace->u, system->n );
redux[3] = Dot_local( b, b, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->m, workspace->w, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
Kurt A. O'Hearn
committed
t_pa += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
/* no comm part2 because m is only local portion */
}
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
Kurt A. O'Hearn
committed
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta = redux[0];
gamma_new = redux[1];
norm = sqrt( redux[2] );
b_norm = sqrt( redux[3] );
for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
{
if ( i > 0 )
{
beta = gamma_new / gamma_old;
alpha = gamma_new / (delta - beta / alpha * gamma_new);
}
else
{
beta = 0.0;
alpha = gamma_new / delta;
}
t_start = MPI_Wtime( );
Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
redux[0] = Dot_local( workspace->w, workspace->u, system->n );
redux[1] = Dot_local( workspace->r, workspace->u, system->n );
redux[2] = Dot_local( workspace->u, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->m, workspace->w, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m is only local portion */
}
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
gamma_old = gamma_new;
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
delta = redux[0];
gamma_new = redux[1];
norm = sqrt( redux[2] );
}
timings[0] = t_pa;
timings[1] = t_spmv;
timings[2] = t_vops;
timings[3] = t_comm;
timings[4] = t_allreduce;
Kurt A. O'Hearn
committed
MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
data->timing.cm_solver_spmv += timings[1] / control->nprocs;
data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
data->timing.cm_solver_comm += timings[3] / control->nprocs;
data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
}
else
{
MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
Kurt A. O'Hearn
committed
if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
Kurt A. O'Hearn
committed
fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
return i;
}
return i;
}
Kurt A. O'Hearn
committed
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
/* Pipelined Preconditioned Conjugate Residual Method
*
* References:
* 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
* P. Ghysels and W. Vanroose, Parallel Computing, 2014.
* */
int PIPECR( reax_system const * const system, control_params const * const control,
simulation_data * const data,
storage * const workspace, sparse_matrix * const H, real * const b,
real tol, real * const x, mpi_datatypes * const mpi_data )
{
int i, j;
real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
real t_start, t_pa, t_spmv, t_vops, t_comm, t_allreduce;
real timings[5], redux[3];
MPI_Request req;
t_pa = 0.0;
t_spmv = 0.0;
t_vops = 0.0;
t_comm = 0.0;
t_allreduce = 0.0;
t_start = MPI_Wtime( );
redux[0] = Dot_local( b, b, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
x, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, x, workspace->u, H->NT );
#else
Sparse_MatVec_local( H, x, workspace->u, system->N );
#endif
Kurt A. O'Hearn
committed
t_spmv += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
{
Vector_Copy( workspace->u, workspace->r, system->n );
}
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn
committed
workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
Kurt A. O'Hearn
committed
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, system->n );
Kurt A. O'Hearn
committed
t_pa += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
/* no comm part2 because u is only local portion */
}
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
Kurt A. O'Hearn
committed
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->u, workspace->w, H->NT );
#else
Sparse_MatVec_local( H, workspace->u, workspace->w, system->N );
#endif
t_spmv += MPI_Wtime( ) - t_start;
Kurt A. O'Hearn
committed
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
b_norm = sqrt( redux[0] );
t_start = MPI_Wtime( );
norm = Parallel_Norm( workspace->u, system->n, mpi_data->world );
t_allreduce += MPI_Wtime( ) - t_start;
for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
{
/* pre-conditioning */
if ( control->cm_solver_pre_comp_type == NONE_PC )
Kurt A. O'Hearn
committed
Vector_Copy( workspace->m, workspace->w, system->n );
Kurt A. O'Hearn
committed
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
{
t_start = MPI_Wtime( );
for ( j = 0; j < system->n; ++j )
{
workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
}
t_pa += MPI_Wtime( ) - t_start;
}
else if ( control->cm_solver_pre_comp_type == SAI_PC )
{
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
#endif
t_pa += MPI_Wtime( ) - t_start;
/* no comm part2 because m is only local portion */
}
t_start = MPI_Wtime( );
redux[0] = Dot_local( workspace->w, workspace->u, system->n );
redux[1] = Dot_local( workspace->m, workspace->w, system->n );
redux[2] = Dot_local( workspace->u, workspace->u, system->n );
t_vops += MPI_Wtime( ) - t_start;
MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, mpi_data->world, &req );
t_comm += Sparse_MatVec_Comm_Part1( system, control, mpi_data,
workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
#if defined(NEUTRAL_TERRITORY)
Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
Kurt A. O'Hearn
committed
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
t_spmv += MPI_Wtime( ) - t_start;
t_comm += Sparse_MatVec_Comm_Part2( system, control, mpi_data,
H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );
t_start = MPI_Wtime( );
MPI_Wait( &req, MPI_STATUS_IGNORE );
t_allreduce += MPI_Wtime( ) - t_start;
gamma_new = redux[0];
delta = redux[1];
norm = sqrt( redux[2] );
if ( i > 0 )
{
beta = gamma_new / gamma_old;
alpha = gamma_new / (delta - beta / alpha * gamma_new);
}
else
{
beta = 0.0;
alpha = gamma_new / delta;
}
t_start = MPI_Wtime( );
Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
t_vops += MPI_Wtime( ) - t_start;
gamma_old = gamma_new;
}
timings[0] = t_pa;
timings[1] = t_spmv;
timings[2] = t_vops;
timings[3] = t_comm;
timings[4] = t_allreduce;
if ( system->my_rank == MASTER_NODE )
{
MPI_Reduce( MPI_IN_PLACE, timings, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
data->timing.cm_solver_pre_app += timings[0] / control->nprocs;
data->timing.cm_solver_spmv += timings[1] / control->nprocs;
data->timing.cm_solver_vector_ops += timings[2] / control->nprocs;
data->timing.cm_solver_comm += timings[3] / control->nprocs;
data->timing.cm_solver_allreduce += timings[4] / control->nprocs;
}
else
{
MPI_Reduce( timings, NULL, 5, MPI_DOUBLE, MPI_SUM, MASTER_NODE, mpi_data->world );
Kurt A. O'Hearn
committed
if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
Kurt A. O'Hearn
committed
fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" );