#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif
    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
    {
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, 
                H->NT, workspace->q );
#else
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, 
                system->N, workspace->q );
#endif

#if defined(LOG_PERFORMANCE)
        time = Get_Time( );
#endif

        tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

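        /* preconditioned CG step length: alpha = (r, M^{-1} r) / (d, H d),
         * with q = H d computed above */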
        alpha = sig_new / tmp;
        Vector_Add( x, alpha, workspace->d, system->n );
        Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r,
                workspace->q, FALSE, LEFT );
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q,
                workspace->p, FALSE, RIGHT );

#if defined(LOG_PERFORMANCE)
        time = Get_Time( );
#endif

        redux[0] = Dot_local( workspace->r, workspace->p, system->n );
        redux[1] = Dot_local( workspace->p, workspace->p, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        sig_old = sig_new;
        sig_new = redux[0];
        r_norm = SQRT( redux[1] );
        beta = sig_new / sig_old;
        Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    }

    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i );
        fprintf( stderr, "  [INFO] Rel. residual error: %e\n", r_norm / b_norm );
    }

    return i;
}


/* Bi-conjugate gradient stabilized method with left preconditioning for
 * solving nonsymmetric linear systems.
 * This function performs dual iteration for QEq (2 simultaneous solves)
 *
 * system: 
 * workspace: struct containing storage for workspace for the linear solver
 * control: struct containing parameters governing the simulation and numeric methods
 * data: struct containing simulation data (e.g., atom info)
 * H: sparse, symmetric matrix, lower half stored in CSR format
 * b: right-hand side of the linear system
 * tol: tolerance compared against the relative residual for determining convergence
 * x: initial guess
 * mpi_data: 
 *
 * Reference: Netlib (in MATLAB)
 *  http://www.netlib.org/templates/matlab/bicgstab.m
 * */
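/* Sketch of the preconditioned BiCGStab recurrences implemented below (a
 * summary for reference, following the Netlib template cited above; M
 * denotes the preconditioner):
 *
 *   rho_i = (r_hat, r)
 *   beta = (rho_i / rho_{i-1}) * (alpha / omega)
 *   p = r + beta * (p - omega * z)
 *   d = M^{-1} p,  z = H d
 *   alpha = rho_i / (r_hat, z)
 *   q = r - alpha * z
 *   q_hat = M^{-1} q,  y = H q_hat
 *   omega = (y, q) / (y, y)
 *   x += alpha * d + omega * q_hat
 *   r = q - omega * y
 *
 * The dual variant below applies these recurrences to two right-hand sides
 * simultaneously via rvec2 (2-wide) vector operations. */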
int dual_BiCGStab( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, rvec2 * const b,
        real tol, rvec2 * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
{
    int i, j, ret;
    rvec2 tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm;
    real time, redux[4];

#if defined(NEUTRAL_TERRITORY)
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
            H->NT, workspace->d2 );
#else
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, 
            system->N, workspace->d2 );
#endif

#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif

    Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->d2, system->n );
    Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] );
    Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[2], &redux[3] );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    b_norm[0] = SQRT( redux[0] );
    b_norm[1] = SQRT( redux[1] );
    r_norm[0] = SQRT( redux[2] );
    r_norm[1] = SQRT( redux[3] );
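    /* guard against a zero right-hand side: normalize by 1 so the
     * convergence test reduces to an absolute residual check */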
    if ( b_norm[0] == 0.0 )
    {
        b_norm[0] = 1.0;
    }
    if ( b_norm[1] == 0.0 )
    {
        b_norm[1] = 1.0;
    }
    Vector_Copy_rvec2( workspace->r_hat2, workspace->r2, system->n );
    omega[0] = 1.0;
    omega[1] = 1.0;
    rho[0] = 1.0;
    rho[1] = 1.0;

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    for ( i = 0; i < control->cm_solver_max_iters; ++i )
    {
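        /* exit as soon as either channel converges; any channel still
         * above tolerance is finished by the single-channel BiCGStab
         * after this loop */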
        if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol )
        {
            break;
        }

        Dot_local_rvec2( workspace->r_hat2, workspace->r2, system->n,
                &redux[0], &redux[1] );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        rho[0] = redux[0];
        rho[1] = redux[1];
        if ( rho[0] == 0.0 || rho[1] == 0.0 )
        {
            break;
        }
        if ( i > 0 )
        {
            beta[0] = (rho[0] / rho_old[0]) * (alpha[0] / omega[0]);
            beta[1] = (rho[1] / rho_old[1]) * (alpha[1] / omega[1]);
            Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->p2,
                    -1.0 * omega[0], -1.0 * omega[1], workspace->z2, system->n );
            Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->r2,
                    beta[0], beta[1], workspace->q2, system->n );
        }
        else
        {
            Vector_Copy_rvec2( workspace->p2, workspace->r2, system->n );
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->p2,
                workspace->y2, i == 0 ? fresh_pre : FALSE, LEFT );
        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2,
                workspace->d2, i == 0 ? fresh_pre : FALSE, RIGHT );

#if defined(NEUTRAL_TERRITORY)
        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2,
                H->NT, workspace->z2 );
#else
        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2,
                system->N, workspace->z2 );
#endif

#if defined(LOG_PERFORMANCE)
        time = Get_Time( );
#endif

        Dot_local_rvec2( workspace->r_hat2, workspace->z2, system->n,
                &redux[0], &redux[1] );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        tmp[0] = redux[0];
        tmp[1] = redux[1];
        alpha[0] = rho[0] / tmp[0];
        alpha[1] = rho[1] / tmp[1];
        Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->r2,
                -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n );
        Dot_local_rvec2( workspace->q2, workspace->q2, system->n,
                &redux[0], &redux[1] );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        tmp[0] = redux[0];
        tmp[1] = redux[1];
        /* early convergence check: the intermediate residual q2
         * (s in the Netlib template) is already small enough */
        if ( tmp[0] < tol || tmp[1] < tol )
        {
            Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n );
            break;
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2,
                workspace->y2, FALSE, LEFT );
        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2,
                workspace->q_hat2, FALSE, RIGHT );

#if defined(NEUTRAL_TERRITORY)
        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2,
                H->NT, workspace->y2 );
#else
        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2,
                system->N, workspace->y2 );
#endif

#if defined(LOG_PERFORMANCE)
        time = Get_Time( );
#endif

        Dot_local_rvec2( workspace->y2, workspace->q2, system->n, &redux[0], &redux[1] );
        Dot_local_rvec2( workspace->y2, workspace->y2, system->n, &redux[2], &redux[3] );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        sigma[0] = redux[0];
        sigma[1] = redux[1];
        tmp[0] = redux[2];
        tmp[1] = redux[3];
        omega[0] = sigma[0] / tmp[0];
        omega[1] = sigma[1] / tmp[1];
        Vector_Sum_rvec2( workspace->g2, alpha[0], alpha[1], workspace->d2,
                omega[0], omega[1], workspace->q_hat2, system->n );
        Vector_Add_rvec2( x, 1.0, 1.0, workspace->g2, system->n );
        Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->q2,
                -1.0 * omega[0], -1.0 * omega[1], workspace->y2, system->n );
        Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[0], &redux[1] );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        r_norm[0] = SQRT( redux[0] );
        r_norm[1] = SQRT( redux[1] );
        if ( omega[0] == 0.0 || omega[1] == 0.0 )
        {
            break;
        }
        rho_old[0] = rho[0];
        rho_old[1] = rho[1];
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    }

    if ( (omega[0] == 0.0 || omega[1] == 0.0) && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
        fprintf( stderr, "  [INFO] omega = %e, %e\n", omega[0], omega[1] );
    }
    else if ( (rho[0] == 0.0 || rho[1] == 0.0) && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
        fprintf( stderr, "  [INFO] rho = %e, %e\n", rho[0], rho[1] );
    }

    /* continue to solve the system that has not converged yet */
    if ( r_norm[0] / b_norm[0] > tol )
    {
        Vector_Copy_From_rvec2( workspace->s, x, 0, system->n );

        i += BiCGStab( system, control, data, workspace,
                H, workspace->b_s, tol, workspace->s, mpi_data, FALSE );

        Vector_Copy_To_rvec2( x, workspace->s, 0, system->n );
    }
    else if ( r_norm[1] / b_norm[1] > tol )
    {
        Vector_Copy_From_rvec2( workspace->t, x, 1, system->n );

        i += BiCGStab( system, control, data, workspace,
                H, workspace->b_t, tol, workspace->t, mpi_data, FALSE );

        Vector_Copy_To_rvec2( x, workspace->t, 1, system->n );
    }


    if ( i >= control->cm_solver_max_iters
            && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i );
        fprintf( stderr, "  [INFO] Rel. residual error (s solve): %e\n", r_norm[0] / b_norm[0] );
        fprintf( stderr, "  [INFO] Rel. residual error (t solve): %e\n", r_norm[1] / b_norm[1] );
    }

    return i;
}


/* Bi-conjugate gradient stabilized method with left preconditioning for
 * solving nonsymmetric linear systems
 *
 * system: 
 * workspace: struct containing storage for workspace for the linear solver
 * control: struct containing parameters governing the simulation and numeric methods
 * data: struct containing simulation data (e.g., atom info)
 * H: sparse, symmetric matrix, lower half stored in CSR format
 * b: right-hand side of the linear system
 * tol: tolerance compared against the relative residual for determining convergence
 * x: initial guess
 * mpi_data: 
 *
 * Reference: Netlib (in MATLAB)
 *  http://www.netlib.org/templates/matlab/bicgstab.m
 * */
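/* Note on preconditioning below: apply_preconditioner( ..., LEFT ) followed
 * by apply_preconditioner( ..., RIGHT ) applies the two factors of a split
 * preconditioner in sequence, so the pair computes M^{-1} v in two stages;
 * for a one-sided preconditioner, one of the two applications reduces to a
 * copy. */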
int BiCGStab( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, real * const b,
        real tol, real * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
{
    int i, ret;
    real tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm;
    real time, redux[2];

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec( system, control, data, mpi_data, H, x,
            H->NT, workspace->d );
#else
    Sparse_MatVec( system, control, data, mpi_data, H, x,
            system->N, workspace->d );
#endif

#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif

    Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->d, system->n );
    redux[0] = Dot_local( b, b, system->n );
    redux[1] = Dot_local( workspace->r, workspace->r, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    b_norm = SQRT( redux[0] );
    r_norm = SQRT( redux[1] );
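    /* as in the dual solver: a zero right-hand side falls back to an
     * absolute residual test */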
    if ( b_norm == 0.0 )
    {
        b_norm = 1.0;
    }
    Vector_Copy( workspace->r_hat, workspace->r, system->n );
    omega = 1.0;
    rho = 1.0;

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
    {
        redux[0] = Dot_local( workspace->r_hat, workspace->r, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        rho = redux[0];
        if ( rho == 0.0 )
        {
            break;
        }
        if ( i > 0 )
        {
            beta = (rho / rho_old) * (alpha / omega);
            Vector_Sum( workspace->q, 1.0, workspace->p, -1.0 * omega, workspace->z, system->n );
            Vector_Sum( workspace->p, 1.0, workspace->r, beta, workspace->q, system->n );
        }
        else
        {
            Vector_Copy( workspace->p, workspace->r, system->n );
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->p,
                workspace->y, i == 0 ? fresh_pre : FALSE, LEFT );
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y,
                workspace->d, i == 0 ? fresh_pre : FALSE, RIGHT );
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d,
                H->NT, workspace->z );
#else
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->d,
                system->N, workspace->z );
#endif

#if defined(LOG_PERFORMANCE)
        time = Get_Time( );
#endif

        redux[0] = Dot_local( workspace->r_hat, workspace->z, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        tmp = redux[0];
        alpha = rho / tmp;
        Vector_Sum( workspace->q, 1.0, workspace->r, -1.0 * alpha, workspace->z, system->n );
        redux[0] = Dot_local( workspace->q, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        tmp = redux[0];
        /* early convergence check: the intermediate residual q
         * (s in the Netlib template) is already small enough */
        if ( tmp < tol )
        {
            Vector_Add( x, alpha, workspace->d, system->n );
            break;
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q,
                workspace->y, FALSE, LEFT );
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y,
                workspace->q_hat, FALSE, RIGHT );
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat,
                H->NT, workspace->y );
#else
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat,
                system->N, workspace->y );
#endif

#if defined(LOG_PERFORMANCE)
        time = Get_Time( );
#endif

        redux[0] = Dot_local( workspace->y, workspace->q, system->n );
        redux[1] = Dot_local( workspace->y, workspace->y, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        sigma = redux[0];
        tmp = redux[1];
        omega = sigma / tmp;
        Vector_Sum( workspace->g, alpha, workspace->d, omega, workspace->q_hat, system->n );
        Vector_Add( x, 1.0, workspace->g, system->n );
        Vector_Sum( workspace->r, 1.0, workspace->q, -1.0 * omega, workspace->y, system->n );
        redux[0] = Dot_local( workspace->r, workspace->r, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        r_norm = SQRT( redux[0] );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    }

    if ( omega == 0.0 && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
        fprintf( stderr, "  [INFO] omega = %e\n", omega );
    }
    else if ( rho == 0.0 && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i );
        fprintf( stderr, "  [INFO] rho = %e\n", rho );
    }
    else if ( i >= control->cm_solver_max_iters
            && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i );
        fprintf( stderr, "  [INFO] Rel. residual error: %e\n", r_norm / b_norm );
    }

    return i;
}


/* Dual iteration for the Pipelined Preconditioned Conjugate Gradient Method
 * for QEq (2 simultaneous solves)
 *
 * References:
 * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
 *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
 * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
 *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
 *  for High Performance Computing, Networking, Storage and Analysis, 2016.
 *  */
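/* A minimal sketch of the pipelining pattern used below (assuming only the
 * MPI-3 nonblocking collective API): start the global reduction, perform
 * work that does not depend on its result, then wait before consuming it.
 *
 *   MPI_Request req;
 *   MPI_Iallreduce( MPI_IN_PLACE, redux, n, MPI_DOUBLE, MPI_SUM, comm, &req );
 *   // ... preconditioner application and sparse matrix-vector product,
 *   //     both independent of redux ...
 *   MPI_Wait( &req, MPI_STATUS_IGNORE );
 *   // ... use the reduced values in redux ...
 *
 * This hides the allreduce latency behind the preconditioner and MatVec. */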
int dual_PIPECG( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, rvec2 * const b,
        real tol, rvec2 * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
{
    int i, ret;
    rvec2 alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm;
    real time, redux[8];
    MPI_Request req;

#if defined(NEUTRAL_TERRITORY)
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x,
            H->NT, workspace->u2 );
#else
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, x,
            system->N, workspace->u2 );
#endif

#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif

    Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->u2, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2,
            workspace->m2, fresh_pre, LEFT );
    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->m2,
            workspace->u2, fresh_pre, RIGHT );
#if defined(NEUTRAL_TERRITORY)
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2,
            H->NT, workspace->w2 );
#else
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2,
            system->N, workspace->w2 );
#endif

#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif

    Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] );
    Dot_local_rvec2( workspace->r2, workspace->u2, system->n, &redux[2], &redux[3] );
    Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] );
    Dot_local_rvec2( b, b, system->n, &redux[6], &redux[7] );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD, &req );
    Check_MPI_Error( ret, __FILE__, __LINE__ );

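    /* overlapped with the nonblocking allreduce above: the preconditioner
     * application and the following MatVec do not depend on redux */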
    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
            workspace->n2, FALSE, LEFT );
    dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
            workspace->m2, FALSE, RIGHT );

#if defined(NEUTRAL_TERRITORY)
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
            H->NT, workspace->n2 );
#else
    Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
            system->N, workspace->n2 );
#endif

    ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    delta[0] = redux[0];
    delta[1] = redux[1];
    gamma_new[0] = redux[2];
    gamma_new[1] = redux[3];
    r_norm[0] = SQRT( redux[4] );
    r_norm[1] = SQRT( redux[5] );
    b_norm[0] = SQRT( redux[6] );
    b_norm[1] = SQRT( redux[7] );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    for ( i = 0; i < control->cm_solver_max_iters; ++i )
    {
        if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol )
        {
            break;
        }

        if ( i > 0 )
        {
            beta[0] = gamma_new[0] / gamma_old[0];
            beta[1] = gamma_new[1] / gamma_old[1];
            alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]);
            alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]);
        }
        else
        {
            beta[0] = 0.0;
            beta[1] = 0.0;
            alpha[0] = gamma_new[0] / delta[0];
            alpha[1] = gamma_new[1] / delta[1];
        }

        Vector_Sum_rvec2( workspace->z2, 1.0, 1.0, workspace->n2,
                beta[0], beta[1], workspace->z2, system->n );
        Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->m2,
                beta[0], beta[1], workspace->q2, system->n );
        Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->u2,
                beta[0], beta[1], workspace->p2, system->n );
        Vector_Sum_rvec2( workspace->d2, 1.0, 1.0, workspace->w2,
                beta[0], beta[1], workspace->d2, system->n );
        Vector_Sum_rvec2( x, 1.0, 1.0, x,
                alpha[0], alpha[1], workspace->p2, system->n );
        Vector_Sum_rvec2( workspace->u2, 1.0, 1.0, workspace->u2,
                -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n );
        Vector_Sum_rvec2( workspace->w2, 1.0, 1.0, workspace->w2,
                -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n );
        Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->r2,
                -1.0 * alpha[0], -1.0 * alpha[1], workspace->d2, system->n );

        Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] );
        Dot_local_rvec2( workspace->r2, workspace->u2, system->n, &redux[2], &redux[3] );
        Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD, &req );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
                workspace->n2, FALSE, LEFT );
        dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
                workspace->m2, FALSE, RIGHT );
#if defined(NEUTRAL_TERRITORY)
        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
                H->NT, workspace->n2 );
#else
        Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
                system->N, workspace->n2 );
#endif

        gamma_old[0] = gamma_new[0];
        gamma_old[1] = gamma_new[1];

        ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
        delta[0] = redux[0];
        delta[1] = redux[1];
        gamma_new[0] = redux[2];
        gamma_new[1] = redux[3];
        r_norm[0] = SQRT( redux[4] );
        r_norm[1] = SQRT( redux[5] );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif
    }

    /* continue to solve the system that has not converged yet */
    if ( r_norm[0] / b_norm[0] > tol )
    {
        Vector_Copy_From_rvec2( workspace->s, x, 0, system->n );
        i += PIPECG( system, control, data, workspace,
                H, workspace->b_s, tol, workspace->s, mpi_data, FALSE );
        Vector_Copy_To_rvec2( x, workspace->s, 0, system->n );
    }
    else if ( r_norm[1] / b_norm[1] > tol )
    {
        Vector_Copy_From_rvec2( workspace->t, x, 1, system->n );

        i += PIPECG( system, control, data, workspace,
                H, workspace->b_t, tol, workspace->t, mpi_data, FALSE );

        Vector_Copy_To_rvec2( x, workspace->t, 1, system->n );
    }
    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" );
    }

    return i;
}

/* Pipelined Preconditioned Conjugate Gradient Method
 *
 * References:
 * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm,
 *  P. Ghysels and W. Vanroose, Parallel Computing, 2014.
 * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods,
 *  Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference
 *  for High Performance Computing, Networking, Storage and Analysis, 2016.
 *  */
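/* Sketch of the pipelined CG recurrences implemented below (a summary for
 * reference, following the Ghysels & Vanroose paper cited above): with
 * u = M^{-1} r and w = H u maintained alongside r,
 *
 *   gamma = (r, u),  delta = (w, u)
 *   beta = gamma / gamma_old
 *   alpha = gamma / (delta - (beta / alpha_old) * gamma)
 *   z = n + beta * z,  q = m + beta * q,  p = u + beta * p,  d = w + beta * d
 *   x += alpha * p,  u -= alpha * q,  w -= alpha * z,  r -= alpha * d
 *
 * where m = M^{-1} w and n = H m are computed while the reduction producing
 * gamma, delta, and the norm of u is still in flight. */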
int PIPECG( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, real * const b,
        real tol, real * const x, mpi_datatypes * const  mpi_data, int fresh_pre )
{
    int i, ret;
    real alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm;
    real time, redux[4];
    MPI_Request req;

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec( system, control, data, mpi_data, H, x,
            H->NT, workspace->u );
#else
    Sparse_MatVec( system, control, data, mpi_data, H, x,
            system->N, workspace->u );
#endif

#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif

    Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->u, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r,
            workspace->m, fresh_pre, LEFT );
    apply_preconditioner( system, workspace, control, data, mpi_data, workspace->m,
            workspace->u, fresh_pre, RIGHT );
#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec( system, control, data, mpi_data, H, workspace->u,
            H->NT, workspace->w );
#else
    Sparse_MatVec( system, control, data, mpi_data, H, workspace->u,
            system->N, workspace->w );
#endif

#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif

    redux[0] = Dot_local( workspace->w, workspace->u, system->n );
    redux[1] = Dot_local( workspace->r, workspace->u, system->n );
    redux[2] = Dot_local( workspace->u, workspace->u, system->n );
    redux[3] = Dot_local( b, b, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM,
            MPI_COMM_WORLD, &req );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
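    /* as in dual_PIPECG above, this preconditioner application and MatVec
     * are overlapped with the in-flight allreduce */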
    apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w,
            workspace->n, FALSE, LEFT );
    apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n,
            workspace->m, FALSE, RIGHT );
#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec( system, control, data, mpi_data, H, workspace->m,
            H->NT, workspace->n );
#else
    Sparse_MatVec( system, control, data, mpi_data, H, workspace->m,
            system->N, workspace->n );
#endif
    ret = MPI_Wait( &req, MPI_STATUS_IGNORE );
    Check_MPI_Error( ret, __FILE__, __LINE__ );

    delta = redux[0];
    gamma_new = redux[1];
    r_norm = SQRT( redux[2] );
    b_norm = SQRT( redux[3] );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
    {
        if ( i > 0 )
        {
            beta = gamma_new / gamma_old;
            alpha = gamma_new / (delta - beta / alpha * gamma_new);
        }
        else
        {
            beta = 0.0;
            alpha = gamma_new / delta;
        }

        Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n );
        Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n );
        Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n );
        Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n );
        Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n );
        Vector_Sum( workspace->u, 1.0, workspace->u, -1.0 * alpha, workspace->q, system->n );
        Vector_Sum( workspace->w, 1.0, workspace->w, -1.0 * alpha, workspace->z, system->n );
        Vector_Sum( workspace->r, 1.0, workspace->r, -1.0 * alpha, workspace->d, system->n );
        redux[0] = Dot_local( workspace->w, workspace->u, system->n );
        redux[1] = Dot_local( workspace->r, workspace->u, system->n );
        redux[2] = Dot_local( workspace->u, workspace->u, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD, &req );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w,
                workspace->n, FALSE, LEFT );
        apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n,
                workspace->m, FALSE, RIGHT );
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->m,
                H->NT, workspace->n );
#else
        Sparse_MatVec( system, control, data, mpi_data, H, workspace->m,
                system->N, workspace->n );
#endif