Skip to content
Snippets Groups Projects
lin_alg.c 135 KiB
Newer Older
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( H, workspace->d, workspace->z, H->NT );
#else
        Sparse_MatVec_local( H, workspace->d, workspace->z, system->N );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->z, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        redux[0] = Dot_local( workspace->r_hat, workspace->z, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        tmp = redux[0];
        alpha = rho / tmp;
        Vector_Sum( workspace->q, 1.0, workspace->r, -1.0 * alpha, workspace->z, system->n );
        redux[0] = Dot_local( workspace->q, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        tmp = redux[0];
        /* early convergence check */
        if ( tmp < tol )
        {
            Vector_Add( x, alpha, workspace->d, system->n );
            break;
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        if ( control->cm_solver_pre_comp_type == NONE_PC )
        {
            Vector_Copy( workspace->q_hat, workspace->q, system->n );
        }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->q_hat[j] = workspace->q[j] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->q, workspace->q_hat, H->NT );
#else
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->q, workspace->q_hat, system->n );
#endif

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

            /* no comm part2 because q_hat is only local portion */
        }

        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->q_hat, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( H, workspace->q_hat, workspace->y, H->NT );
#else
        Sparse_MatVec_local( H, workspace->q_hat, workspace->y, system->N );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->y, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        redux[0] = Dot_local( workspace->y, workspace->q, system->n );
        redux[1] = Dot_local( workspace->y, workspace->y, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        sigma = redux[0];
        tmp = redux[1];
        omega = sigma / tmp;
        Vector_Sum( workspace->g, alpha, workspace->d, omega, workspace->q_hat, system->n );
        Vector_Add( x, 1.0, workspace->g, system->n );
        Vector_Sum( workspace->r, 1.0, workspace->q, -1.0 * omega, workspace->y, system->n );
        redux[0] = Dot_local( workspace->r, workspace->r, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        r_norm = SQRT( redux[0] );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    if ( omega == 0.0 && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
        fprintf( stderr, "  [INFO] omega = %e\n", omega );
    }
    else if ( rho == 0.0 && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
        fprintf( stderr, "  [INFO] rho = %e\n", rho );
    }
    else if ( i >= control->cm_solver_max_iters
            && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i );
        fprintf( stderr, "  [INFO] Rel. residual error: %e\n", r_norm / b_norm );
/* Dual iteration for the Pipelined Preconditioned Conjugate Gradient Method
 * for QEq (2 simaltaneous solves)
 *
 * References:
 * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
 *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
 * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
 *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
 *  for High Performance Computing, Networking, Storage and Analysis, 2016.
 *  */
int dual_PIPECG( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, rvec2 * const b,
        real tol, rvec2 * const x, mpi_datatypes * const  mpi_data )
{
    rvec2 alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    dual_Sparse_MatVec_local( H, x, workspace->u2, H->NT );
#else
    dual_Sparse_MatVec_local( H, x, workspace->u2, system->N );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    //Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );
    for ( j = 0; j < system->n; ++j )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
        workspace->r2[j][0] = b[j][0] - workspace->u2[j][0];
        workspace->r2[j][1] = b[j][1] - workspace->u2[j][1];

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    /* pre-conditioning */
    if ( control->cm_solver_pre_comp_type == NONE_PC )
        //Vector_Copy( workspace->u, workspace->r, system->n );
        for ( j = 0; j < system->n ; ++j )
        {
            workspace->u2[j][0] = workspace->r2[j][0];
            workspace->u2[j][1] = workspace->r2[j][1];
        }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        for ( j = 0; j < system->n; ++j )
        {
            workspace->u2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
            workspace->u2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
        
#if defined(NEUTRAL_TERRITORY)
        dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->u2, H->NT );
#else
        dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->u2, system->n );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

        /* no comm part2 because u2 is only local portion */
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            workspace->u2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    dual_Sparse_MatVec_local( H, workspace->u2, workspace->w2, H->NT );
#else
    dual_Sparse_MatVec_local( H, workspace->u2, workspace->w2, system->N );
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
    //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
    //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
    //redux[3] = Dot_local( b, b, system->n );
    for ( j = 0; j < 8; ++j )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
        redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
        redux[1] += workspace->w2[j][1] * workspace->u2[j][1];

        redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
        redux[3] += workspace->r2[j][1] * workspace->u2[j][1];

        redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
        redux[5] += workspace->u2[j][1] * workspace->u2[j][1];

        redux[6] += b[j][0] * b[j][0];
        redux[7] += b[j][1] * b[j][1];
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD, &req );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    /* pre-conditioning */
    if ( control->cm_solver_pre_comp_type == NONE_PC )
        //Vector_Copy( workspace->m, workspace->w, system->n );
        for ( j = 0; j < system->n; ++j )
        {
            workspace->m2[j][0] = workspace->w2[j][0];
            workspace->m2[j][1] = workspace->w2[j][1];
        }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
            workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    }
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
        
#if defined(NEUTRAL_TERRITORY)
        dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
#else
        dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        /* no comm part2 because m2 is only local portion */
    }
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, H->NT );
#else
    dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, system->N );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#endif
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    delta[0] = redux[0];
    delta[1] = redux[1];
    gamma_new[0] = redux[2];
    gamma_new[1] = redux[3];
    norm[0] = SQRT( redux[4] );
    norm[1] = SQRT( redux[5] );
    b_norm[0] = SQRT( redux[6] );
    b_norm[1] = SQRT( redux[7] );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    for ( i = 0; i < control->cm_solver_max_iters; ++i )
    {
        if ( norm[0] / b_norm[0] <= tol || norm[1] / b_norm[1] <= tol )
        {
            break;
        }
        if ( i > 0 )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
            beta[0] = gamma_new[0] / gamma_old[0];
            beta[1] = gamma_new[1] / gamma_old[1];
            alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
            alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
        }
        else
        {
            beta[0] = 0.0;
            beta[1] = 0.0;
            alpha[0] = gamma_new[0] / delta[0];
            alpha[1] = gamma_new[1] / delta[1];
        //Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
        //Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
        //Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
        //Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
        //Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
        //Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
        //Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
        //Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
        //redux[0] = Dot_local( workspace->w, workspace->u, system->n );
        //redux[1] = Dot_local( workspace->r, workspace->u, system->n );
        //redux[2] = Dot_local( workspace->u, workspace->u, system->n );
        for ( j = 0; j < 6; ++j )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        {
            redux[j] = 0.0;
        }
        for ( j = 0; j < system->n; ++j )
        {
            workspace->z2[j][0] = workspace->n2[j][0] + beta[0] * workspace->z2[j][0];
            workspace->z2[j][1] = workspace->n2[j][1] + beta[1] * workspace->z2[j][1];
            workspace->q2[j][0] = workspace->m2[j][0] + beta[0] * workspace->q2[j][0];
            workspace->q2[j][1] = workspace->m2[j][1] + beta[1] * workspace->q2[j][1];
            workspace->p2[j][0] = workspace->u2[j][0] + beta[0] * workspace->p2[j][0];
            workspace->p2[j][1] = workspace->u2[j][1] + beta[1] * workspace->p2[j][1];
            workspace->d2[j][0] = workspace->w2[j][0] + beta[0] * workspace->d2[j][0];
            workspace->d2[j][1] = workspace->w2[j][1] + beta[1] * workspace->d2[j][1];
            x[j][0] += alpha[0] * workspace->p2[j][0];
            x[j][1] += alpha[1] * workspace->p2[j][1];
            workspace->u2[j][0] -= alpha[0] * workspace->q2[j][0];
            workspace->u2[j][1] -= alpha[1] * workspace->q2[j][1];
            workspace->w2[j][0] -= alpha[0] * workspace->z2[j][0];
            workspace->w2[j][1] -= alpha[1] * workspace->z2[j][1];
            workspace->r2[j][0] -= alpha[0] * workspace->d2[j][0];
            workspace->r2[j][1] -= alpha[1] * workspace->d2[j][1];

            redux[0] += workspace->w2[j][0] * workspace->u2[j][0];
            redux[1] += workspace->w2[j][1] * workspace->u2[j][1];
            
            redux[2] += workspace->r2[j][0] * workspace->u2[j][0];
            redux[3] += workspace->r2[j][1] * workspace->u2[j][1];
            
            redux[4] += workspace->u2[j][0] * workspace->u2[j][0];
            redux[5] += workspace->u2[j][1] * workspace->u2[j][1];

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD, &req );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
        /* pre-conditioning */
        if ( control->cm_solver_pre_comp_type == NONE_PC )
            //Vector_Copy( workspace->m, workspace->w, system->n );
            for ( j = 0; j < system->n; ++j )
            {
                workspace->m2[j][0] = workspace->w2[j][0];
                workspace->m2[j][1] = workspace->w2[j][1];
            }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->m2[j][0] = workspace->w2[j][0] * workspace->Hdia_inv[j];
                workspace->m2[j][1] = workspace->w2[j][1] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                    workspace->w2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
            
#if defined(NEUTRAL_TERRITORY)
            dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, H->NT );
#else
            dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->w2, workspace->m2, system->n );
#endif

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

            /* no comm part2 because m2 is only local portion */
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
        }
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->m2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, H->NT );
#else
        dual_Sparse_MatVec_local( H, workspace->m2, workspace->n2, system->N );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#endif
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->n2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        gamma_old[0] = gamma_new[0];
        gamma_old[1] = gamma_new[1];

        ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
        delta[0] = redux[0];
        delta[1] = redux[1];
        gamma_new[0] = redux[2];
        gamma_new[1] = redux[3];
        norm[0] = SQRT( redux[4] );
        norm[1] = SQRT( redux[5] );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif
    /* continue to solve the system that has not converged yet */
    if ( norm[0] / b_norm[0] > tol )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
        i += PIPECG( system, control, data, workspace,
                H, workspace->b_s, tol, workspace->s, mpi_data );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
        i += PIPECG( system, control, data, workspace,
                H, workspace->b_t, tol, workspace->t, mpi_data );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    }
    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
        fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
        return i;
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

/* Pipelined Preconditioned Conjugate Gradient Method
 *
 * References:
 * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
 *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
 * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
 *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
 *  for High Performance Computing, Networking, Storage and Analysis, 2016.
 *  */
int PIPECG( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, real * const b,
        real tol, real * const x, mpi_datatypes * const  mpi_data )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
{
    real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec_local( H, x, workspace->u, H->NT );
#else
    Sparse_MatVec_local( H, x, workspace->u, system->N );
#endif

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    Vector_Sum( workspace->r , 1.0,  b, -1.0, workspace->u, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    /* pre-conditioning */
    if ( control->cm_solver_pre_comp_type == NONE_PC )
    {
        Vector_Copy( workspace->u, workspace->r, system->n );
    }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->u[j] = workspace->r[j] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    }
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
        
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, H->NT );
#else
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->u, system->n );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        /* no comm part2 because u is only local portion */
    }
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            workspace->u, REAL_PTR_TYPE, MPI_DOUBLE );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec_local( H, workspace->u, workspace->w, H->NT );
#else
    Sparse_MatVec_local( H, workspace->u, workspace->w, system->N );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    redux[0] = Dot_local( workspace->w, workspace->u, system->n );
    redux[1] = Dot_local( workspace->r, workspace->u, system->n );
    redux[2] = Dot_local( workspace->u, workspace->u, system->n );
    redux[3] = Dot_local( b, b, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD, &req );
    Check_MPI_Error( ret, __FILE__, __LINE__ );

    /* pre-conditioning */
    if ( control->cm_solver_pre_comp_type == NONE_PC )
    {
        Vector_Copy( workspace->m, workspace->w, system->n );
    }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        for ( j = 0; j < system->n; ++j )
        {
            workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
        
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

        /* no comm part2 because m is only local portion */
    }
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            workspace->m, REAL_PTR_TYPE, MPI_DOUBLE );
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
    Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    norm = SQRT( redux[2] );
    b_norm = SQRT( redux[3] );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    for ( i = 0; i < control->cm_solver_max_iters && norm / b_norm > tol; ++i )
    {
        if ( i > 0 )
        {
            beta = gamma_new / gamma_old;
            alpha = gamma_new / (delta - beta / alpha * gamma_new);
        }
        else
        {
            beta = 0.0;
            alpha = gamma_new / delta;
        }

        Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
        Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
        Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
        Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
        Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
        Vector_Sum( workspace->u, 1.0, workspace->u, -alpha, workspace->q, system->n );
        Vector_Sum( workspace->w, 1.0, workspace->w, -alpha, workspace->z, system->n );
        Vector_Sum( workspace->r, 1.0, workspace->r, -alpha, workspace->d, system->n );
        redux[0] = Dot_local( workspace->w, workspace->u, system->n );
        redux[1] = Dot_local( workspace->r, workspace->u, system->n );
        redux[2] = Dot_local( workspace->u, workspace->u, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD, &req );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

        /* pre-conditioning */
        if ( control->cm_solver_pre_comp_type == NONE_PC )
        {
            Vector_Copy( workspace->m, workspace->w, system->n );
        }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->m[j] = workspace->w[j] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
        {
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                    workspace->w, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
            
#if defined(NEUTRAL_TERRITORY)
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, H->NT );
#else
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->w, workspace->m, system->n );
#endif

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( H, workspace->m, workspace->n, H->NT );
#else
        Sparse_MatVec_local( H, workspace->m, workspace->n, system->N );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->n, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed

    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
Kurt A. O'Hearn's avatar
Kurt A. O'Hearn committed
    {
        fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
        return i;
    }

    return i;
}

/* Pipelined Preconditioned Conjugate Residual Method
 *
 * References:
 * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
 *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
 *  */
int PIPECR( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, real * const b,
        real tol, real * const x, mpi_datatypes * const  mpi_data )
{
    real alpha, beta, delta, gamma_old, gamma_new, norm, b_norm;

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD, &req );