                                    Vector_Copy( x, y, system->n );
                                }
                                break;
//                            case ICHOLT_PC:
//                            case ILUT_PC:
//                            case ILUTP_PC:
//                                  tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
//                                  break;
                            default:
                                fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
                                exit( INVALID_INPUT );
                                break;
                        }
                        break;
                    case TRI_SOLVE_LEVEL_SCHED_PA:
                        switch ( control->cm_solver_pre_comp_type )
                        {
                            case JACOBI_PC:
                            case SAI_PC:
                                if ( x != y )
                                {
                                    Vector_Copy( x, y, system->n );
                                }
                                break;
//                            case ICHOLT_PC:
//                            case ILUT_PC:
//                            case ILUTP_PC:
//                                  tri_solve_level_sched( (static_storage *) workspace,
//                                          workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
//                                  break;
                            default:
                                fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
                                exit( INVALID_INPUT );
                                break;
                        }
                        break;
                    case TRI_SOLVE_GC_PA:
                        switch ( control->cm_solver_pre_comp_type )
                        {
                            case JACOBI_PC:
                            case SAI_PC:
                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                                exit( INVALID_INPUT );
                                break;
//                            case ICHOLT_PC:
//                            case ILUT_PC:
//                            case ILUTP_PC:
//                                  tri_solve_level_sched( (static_storage *) workspace,
//                                  workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
//                                  permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
//                                  break;
                            default:
                                fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
                                exit( INVALID_INPUT );
                                break;
                        }
                        break;
                    case JACOBI_ITER_PA:
                        switch ( control->cm_solver_pre_comp_type )
                        {
                            case JACOBI_PC:
                            case SAI_PC:
                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
                                exit( INVALID_INPUT );
                                break;
//                            case ICHOLT_PC:
//                            case ILUT_PC:
//                            case ILUTP_PC:
//                                  if ( fresh_pre == TRUE )
//                                  {
//                                      for ( i = 0; i < workspace->U->n; ++i )
//                                      {
//                                          si = workspace->U->start[i];
//                                          workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
//                                      }
//                                  }
//
//                                  jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
//                                          y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
//                                  break;
                            default:
                                fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
                                exit( INVALID_INPUT );
                                break;
                        }
                        break;
                    default:
                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                        exit( INVALID_INPUT );
                        break;

                }
                break;
        }
    }
}


/* Steepest Descent */
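/* Parameter descriptions (inferred from usage in this routine; the conventions
 * mirror the BiCGStab comment further below):
 * system: struct containing info. about the simulated system
 * control: struct containing parameters governing the simulation and numeric methods
 * data: struct containing simulation data (e.g., timing info)
 * workspace: struct containing storage for workspace for the linear solver
 * H: sparse, symmetric matrix, lower half stored in CSR format
 * b: right-hand side of the linear system
 * tol: tolerance compared against the relative residual for determining convergence
 * x: initial guess, overwritten with the computed solution
 * mpi_data: struct containing MPI datatypes and communication buffers
 *
 * Returns: number of iterations performed
 * */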
int SDM( reax_system const * const system, control_params const * const control,
        simulation_data * const data, storage * const workspace,
        sparse_matrix * const H, real * const b, real tol,
        real * const x, mpi_datatypes * const  mpi_data )
{
    unsigned int i, j;
    int ret;
    real tmp, alpha, sig, bnorm;
    real redux[2];
#if defined(LOG_PERFORMANCE)
    real time;

    time = Get_Time( );
#endif

    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            x, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec_local( H, x, workspace->q, H->NT );
#else
    Sparse_MatVec_local( H, x, workspace->q, system->N );
#endif

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

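    /* residual: r = b - H * x */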
    Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
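    /* apply the preconditioner to the residual, d = M^{-1} * r,
     * where M is determined by cm_solver_pre_comp_type */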
    if ( control->cm_solver_pre_comp_type == NONE_PC )
    {
        Vector_Copy( workspace->d, workspace->r, system->n );
    }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    }
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
        
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->d, H->NT );
#else
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->d, system->n );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

        /* no comm part2 because d is only local portion */
    }

    redux[0] = Dot_local( b, b, system->n );
    redux[1] = Dot_local( workspace->r, workspace->d, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    bnorm = SQRT( redux[0] );
    sig = redux[1];

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

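    /* SD iteration: with sig = r.d and tmp = d.(H * d), take the step
     * alpha = sig / tmp, then update x += alpha * d and r -= alpha * (H * d) */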
    for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / bnorm > tol; ++i )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->d, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( H, workspace->d, workspace->q, H->NT );
#else
        Sparse_MatVec_local( H, workspace->d, workspace->q, system->N );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        redux[0] = Dot_local( workspace->r, workspace->d, system->n );
        redux[1] = Dot_local( workspace->d, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        sig = redux[0];
        tmp = redux[1];
        alpha = sig / tmp;
        Vector_Add( x, alpha, workspace->d, system->n );
        Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        /* pre-conditioning */
        if ( control->cm_solver_pre_comp_type == NONE_PC )
        {
            Vector_Copy( workspace->d, workspace->r, system->n );
        }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
        {
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                    workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );
#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
            
#if defined(NEUTRAL_TERRITORY)
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->d, H->NT );
#else
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->d, system->n );
#endif
#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
            /* no comm part2 because d is only local portion */
        }
    }

    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", i );
        fprintf( stderr, "  [INFO] Rel. residual error: %f\n", SQRT(sig) / bnorm );
        return i;
    }

    return i;
}


/* Dual iteration of the Preconditioned Conjugate Gradient Method
 * for QEq (2 simultaneous solves) */
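/* Parameters follow the conventions documented above for SDM, except that
 * b and x are rvec2 arrays packing the right-hand sides and solutions of the
 * two QEq systems (component 0: s system, component 1: t system).
 *
 * Returns: cumulative number of iterations performed, including any
 * follow-up single-system CG iterations */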
int dual_CG( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, rvec2 * const b,
        real tol, rvec2 * const x, mpi_datatypes * const  mpi_data )
{
    unsigned int i, j;
    int ret;
    rvec2 tmp, alpha, beta, r_norm, b_norm, sig_old, sig_new;
    real redux[6];
#if defined(LOG_PERFORMANCE)
    real time;

    time = Get_Time( );
#endif

    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    dual_Sparse_MatVec_local( H, x, workspace->q2, H->NT );
#else
    dual_Sparse_MatVec_local( H, x, workspace->q2, system->N );
#endif

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    /* residual */
    for ( j = 0; j < system->n; ++j )
    {
        workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
        workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
    }

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    if ( control->cm_solver_pre_comp_type == NONE_PC )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->d2[j][0] = workspace->r2[j][0];
            workspace->d2[j][1] = workspace->r2[j][1];
        }
    }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
            workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    }
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->d2, H->NT );
#else
        dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->d2, system->n );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

        /* no comm part2 because d2 is only local portion */
    }

    redux[0] = 0.0;
    redux[1] = 0.0;
    redux[2] = 0.0;
    redux[3] = 0.0;
    redux[4] = 0.0;
    redux[5] = 0.0;
    /* dot products: r.d, d.d, and b.b */
    for ( j = 0; j < system->n; ++j )
    {
        redux[0] += workspace->r2[j][0] * workspace->d2[j][0];
        redux[1] += workspace->r2[j][1] * workspace->d2[j][1];
        
        redux[2] += workspace->d2[j][0] * workspace->d2[j][0];
        redux[3] += workspace->d2[j][1] * workspace->d2[j][1];

        redux[4] += b[j][0] * b[j][0];
        redux[5] += b[j][1] * b[j][1];
    }

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE,
            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    sig_new[0] = redux[0];
    sig_new[1] = redux[1];
    r_norm[0] = SQRT( redux[2] );
    r_norm[1] = SQRT( redux[3] );
    b_norm[0] = SQRT( redux[4] );
    b_norm[1] = SQRT( redux[5] );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

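    /* dual PCG loop: both systems are advanced together; iteration stops
     * as soon as either system reaches the requested tolerance */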
    for ( i = 0; i < control->cm_solver_max_iters; ++i )
    {
        if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol )
        {
            break;
        }

        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->d2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        dual_Sparse_MatVec_local( H, workspace->d2, workspace->q2, H->NT );
#else
        dual_Sparse_MatVec_local( H, workspace->d2, workspace->q2, system->N );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->q2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        redux[0] = 0.0;
        redux[1] = 0.0;
        /* dot products: d.q for both systems */
        for ( j = 0; j < system->n; ++j )
        {
            redux[0] += workspace->d2[j][0] * workspace->q2[j][0];
            redux[1] += workspace->d2[j][1] * workspace->q2[j][1];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( redux, tmp, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

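        /* step lengths for the two systems: alpha_k = sig_new_k / (d_k . q_k) */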
        alpha[0] = sig_new[0] / tmp[0];
        alpha[1] = sig_new[1] / tmp[1];
        /* update x */
        for ( j = 0; j < system->n; ++j )
        {
            x[j][0] += alpha[0] * workspace->d2[j][0];
            x[j][1] += alpha[1] * workspace->d2[j][1];
        }
        /* update residual */
        for ( j = 0; j < system->n; ++j )
        {
            workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
            workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
        if ( control->cm_solver_pre_comp_type == NONE_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->p2[j][0] = workspace->r2[j][0];
                workspace->p2[j][1] = workspace->r2[j][1];
            }
        }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
                workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
        {
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                    workspace->r2, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 );

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
            dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->p2, H->NT );
#else
            dual_Sparse_MatVec_local( &workspace->H_app_inv, workspace->r2, workspace->p2, system->n );
#endif

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

            /* no comm part2 because p2 is only local portion */
        }

        redux[0] = 0.0;
        redux[1] = 0.0;
        redux[2] = 0.0;
        redux[3] = 0.0;
        /* dot products: r.p and p.p */
        for ( j = 0; j < system->n; ++j )
        {
            redux[0] += workspace->r2[j][0] * workspace->p2[j][0];
            redux[1] += workspace->r2[j][1] * workspace->p2[j][1];
            redux[2] += workspace->p2[j][0] * workspace->p2[j][0];
            redux[3] += workspace->p2[j][1] * workspace->p2[j][1];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif
        
        sig_old[0] = sig_new[0];
        sig_old[1] = sig_new[1];
        sig_new[0] = redux[0];
        sig_new[1] = redux[1];
        r_norm[0] = SQRT( redux[2] );
        r_norm[1] = SQRT( redux[3] );
        beta[0] = sig_new[0] / sig_old[0];
        beta[1] = sig_new[1] / sig_old[1];
        /* d = p + beta * d */
        for ( j = 0; j < system->n; ++j )
        {
            workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
            workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    }

    /* continue to solve the system that has not converged yet */
    if ( r_norm[0] / b_norm[0] > tol )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->s[j] = x[j][0];
        }

        i += CG( system, control, data, workspace,
                H, workspace->b_s, tol, workspace->s, mpi_data );

        for ( j = 0; j < system->n; ++j )
        {
            x[j][0] = workspace->s[j];
        }
    }
    else if ( r_norm[1] / b_norm[1] > tol )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->t[j] = x[j][1];
        }

        i += CG( system, control, data, workspace,
                H, workspace->b_t, tol, workspace->t, mpi_data );

        for ( j = 0; j < system->n; ++j )
        {
            x[j][1] = workspace->t[j];
        }
    }

    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] Dual CG convergence failed! (%d iters)\n", i );
        return i;
    }

    return i;

}


/* Preconditioned Conjugate Gradient Method */
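/* Parameters follow the conventions documented above for SDM.
 *
 * Returns: number of iterations performed */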
int CG( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, real * const b,
        real tol, real * const x, mpi_datatypes * const  mpi_data )
{
    unsigned int i, j;
    int ret;
    real tmp, alpha, beta, r_norm, b_norm, sig_old, sig_new;
    real redux[3];
#if defined(LOG_PERFORMANCE)
    real time;

    time = Get_Time( );
#endif

    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            x, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec_local( H, x, workspace->q, H->NT );
#else
    Sparse_MatVec_local( H, x, workspace->q, system->N );
#endif

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

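    /* residual: r = b - H * x */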
    Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
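    /* apply the preconditioner to the residual: d = M^{-1} * r */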
    if ( control->cm_solver_pre_comp_type == NONE_PC )
    {
        Vector_Copy( workspace->d, workspace->r, system->n );
    }
    else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
    {
        for ( j = 0; j < system->n; ++j )
        {
            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
    }
    else if ( control->cm_solver_pre_comp_type == SAI_PC )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif
        
#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->d, H->NT );
#else
        Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->d, system->n );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

        /* no comm part2 because d is only local portion */
    }

    redux[0] = Dot_local( workspace->r, workspace->d, system->n );
    redux[1] = Dot_local( workspace->d, workspace->d, system->n );
    redux[2] = Dot_local( b, b, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE,
            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
    sig_new = redux[0];
    r_norm = SQRT( redux[1] );
    b_norm = SQRT( redux[2] );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
    {
        Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                workspace->d, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
        Sparse_MatVec_local( H, workspace->d, workspace->q, H->NT );
#else
        Sparse_MatVec_local( H, workspace->d, workspace->q, system->N );
#endif

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

        Sparse_MatVec_Comm_Part2( system, control, mpi_data,
                H->format, workspace->q, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

        tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        alpha = sig_new / tmp;
        Vector_Add( x, alpha, workspace->d, system->n );
        Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
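        /* apply the preconditioner to the updated residual: p = M^{-1} * r */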
        if ( control->cm_solver_pre_comp_type == NONE_PC )
        {
            Vector_Copy( workspace->p, workspace->r, system->n );
        }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
        {
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                    workspace->r, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->p, H->NT );
#else
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->r, workspace->p, system->n );
#endif

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif

            /* no comm part2 because p is only local portion */
        }

        redux[0] = Dot_local( workspace->r, workspace->p, system->n );
        redux[1] = Dot_local( workspace->p, workspace->p, system->n );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
        sig_old = sig_new;
        sig_new = redux[0];
        r_norm = SQRT( redux[1] );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

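        /* update the search direction: d = p + beta * d, with beta = sig_new / sig_old */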
        beta = sig_new / sig_old;
        Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
    }

    if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE )
    {
        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i );
        fprintf( stderr, "  [INFO] Rel. residual error: %e\n", r_norm / b_norm );
        return i;
    }

    return i;
}


/* Bi-conjugate gradient stabilized method with left preconditioning for
 * solving nonsymmetric linear systems
 *
 * system: struct containing info. about the simulated system
 * workspace: struct containing storage for workspace for the linear solver
 * control: struct containing parameters governing the simulation and numeric methods
 * data: struct containing simulation data (e.g., atom info)
 * H: sparse, symmetric matrix, lower half stored in CSR format
 * b: right-hand side of the linear system
 * tol: tolerance compared against the relative residual for determining convergence
 * x: initial guess
 * mpi_data: struct containing MPI datatypes and communication buffers
 *
 * Reference: Netlib (in MATLAB)
 *  http://www.netlib.org/templates/matlab/bicgstab.m
 * */
int BiCGStab( reax_system const * const system, control_params const * const control,
        simulation_data * const data,
        storage * const workspace, sparse_matrix * const H, real * const b,
        real tol, real * const x, mpi_datatypes * const  mpi_data )
{
    unsigned int i, j;
    int ret;
    real tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm;
    real time, redux[2];
#if defined(LOG_PERFORMANCE)
    time = Get_Time( );
#endif
    Sparse_MatVec_Comm_Part1( system, control, mpi_data,
            x, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
    Sparse_MatVec_local( H, x, workspace->d, H->NT );
#else
    Sparse_MatVec_local( H, x, workspace->d, system->N );
#endif

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_spmv );
#endif

    Sparse_MatVec_Comm_Part2( system, control, mpi_data,
            H->format, workspace->d, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

    Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->d, system->n );
    redux[0] = Dot_local( b, b, system->n );
    redux[1] = Dot_local( workspace->r, workspace->r, system->n );

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE,
            MPI_SUM, MPI_COMM_WORLD );
    Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

    b_norm = SQRT( redux[0] );
    r_norm = SQRT( redux[1] );
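    /* guard against division by zero in the relative residual when b is the zero vector */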
    if ( b_norm == 0.0 )
    {
        b_norm = 1.0;
    }
    Vector_Copy( workspace->r_hat, workspace->r, system->n );
    omega = 1.0;
    rho = 1.0;

#if defined(LOG_PERFORMANCE)
    Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

    for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
    {
        redux[0] = Dot_local( workspace->r_hat, workspace->r, system->n );

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif

        ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE,
                MPI_SUM, MPI_COMM_WORLD );
        Check_MPI_Error( ret, __FILE__, __LINE__ );
#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_allreduce );
#endif

        rho = redux[0];
        if ( rho == 0.0 )
        {
            break;
        }
        if ( i > 0 )
        {
            beta = (rho / rho_old) * (alpha / omega);
            Vector_Sum( workspace->q, 1.0, workspace->p, -1.0 * omega, workspace->z, system->n );
            Vector_Sum( workspace->p, 1.0, workspace->r, beta, workspace->q, system->n );
        }
        else
        {
            Vector_Copy( workspace->p, workspace->r, system->n );
        }

#if defined(LOG_PERFORMANCE)
        Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
#endif
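        /* apply the preconditioner to the search direction: d = M^{-1} * p */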
        if ( control->cm_solver_pre_comp_type == NONE_PC )
        {
            Vector_Copy( workspace->d, workspace->p, system->n );
        }
        else if ( control->cm_solver_pre_comp_type == JACOBI_PC )
        {
            for ( j = 0; j < system->n; ++j )
            {
                workspace->d[j] = workspace->p[j] * workspace->Hdia_inv[j];
            }

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif
        }
        else if ( control->cm_solver_pre_comp_type == SAI_PC )
        {
            Sparse_MatVec_Comm_Part1( system, control, mpi_data,
                    workspace->p, REAL_PTR_TYPE, MPI_DOUBLE );

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_comm );
#endif

#if defined(NEUTRAL_TERRITORY)
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->p, workspace->d, H->NT );
#else
            Sparse_MatVec_local( &workspace->H_app_inv, workspace->p, workspace->d, system->n );
#endif

#if defined(LOG_PERFORMANCE)
            Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
#endif