Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
cuda_forces.cu 65.41 KiB

#include "cuda_forces.h"

#include "cuda_bonds.h"
#include "cuda_bond_orders.h"
#include "cuda_charges.h"
#include "cuda_helpers.h"
#include "cuda_hydrogen_bonds.h"
#include "cuda_lin_alg.h"
#include "cuda_list.h"
#include "cuda_multi_body.h"
#include "cuda_neighbors.h"
#include "cuda_nonbonded.h"
#include "cuda_reduction.h"
#include "cuda_torsion_angles.h"
#include "cuda_utils.h"
#include "cuda_valence_angles.h"
#if defined(DEBUG_FOCUS)
  #include "cuda_validation.h"
#endif

#include "../basic_comm.h"
#include "../forces.h"
#include "../index_utils.h"
#include "../tool_box.h"
#include "../vector.h"


typedef enum
{
    DIAGONAL = 0,
    OFF_DIAGONAL = 1,
} MATRIX_ENTRY_POSITION;


CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
{
    control->hbond_cut = 0.0;
}


CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indices, int N )
{
    int i;

    i = blockIdx.x * blockDim.x  + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

    end_indices[i] = indices[i] + intr_cnt[i];
}


CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N )
{
    int i;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

    my_atoms[i].Hindex = i;
}


CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp,
        int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N )
{
    int i, hindex, my_hbonds;

    i = blockIdx.x * blockDim.x  + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

    hindex = atoms[i].Hindex;

    if ( sbp[ atoms[i].type ].p_hbond == H_ATOM || 
            sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM )
    {
        my_hbonds = hbonds[i];
        indices[hindex] = max_hbonds[i];
        end_indices[hindex] = indices[hindex] + hbonds[i];
    }
    else
    {
        my_hbonds = 0;
        indices[hindex] = 0;
        end_indices[hindex] = 0;
    }
    atoms[i].num_hbonds = my_hbonds;
}


/* Initialize indices for far neighbors list post reallocation
 *
 * system: atomic system info. */
void Cuda_Init_Neighbor_Indices( reax_system *system )
{
    int blocks;
    reax_list *far_nbrs = lists[FAR_NBRS];

    /* init indices */
    Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbrs->index, system->total_cap );

    /* init end_indices */
    blocks = system->total_cap / DEF_BLOCK_SIZE + 
        ((system->total_cap % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_far_nbrs, far_nbrs->index, far_nbrs->end_index, system->total_cap );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


/* Initialize indices for far hydrogen bonds list post reallocation
 *
 * system: atomic system info. */
void Cuda_Init_HBond_Indices( reax_system *system )
{
    int blocks;
    int *temp;
    reax_list *hbonds = lists[HBONDS];

    temp = (int *) scratch;

    /* init Hindices */
    blocks = system->N / DEF_BLOCK_SIZE + 
        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    k_setup_hindex <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, system->N );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    /* init indices and end_indices */
    Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap );

    k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp, 
          hbonds->index, hbonds->end_index, system->N );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


/* Initialize indices for far bonds list post reallocation
 *
 * system: atomic system info. */
void Cuda_Init_Bond_Indices( reax_system *system )
{
    int blocks;
    reax_list *bonds = lists[BONDS];

    /* init indices */
    Cuda_Scan_Excl_Sum( system->d_max_bonds, bonds->index, system->total_cap );

    /* init end_indices */
    blocks = system->N / DEF_BLOCK_SIZE + 
        ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_bonds, bonds->index, bonds->end_index, system->N );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


/* Initialize indices for charge matrix post reallocation
 *
 * system: atomic system info.
 * H: charge matrix */
void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
{
    int blocks;

    /* init indices */
    Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->total_cap );

    /* init end_indices */
    blocks = system->total_cap / DEF_BLOCK_SIZE
        + ((system->total_cap % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_cm_entries, H->start, H->end, system->total_cap );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


/* Initialize indices for three body list post reallocation
 *
 * indices: list indices
 * entries: num. of entries in list */
void Cuda_Init_Three_Body_Indices( int *indices, int entries )
{
    reax_list *thbody = lists[THREE_BODIES];

    Cuda_Scan_Excl_Sum( indices, thbody->index, entries );
}


CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, 
        single_body_parameters *sbp, two_body_parameters *tbp,
        control_params *control, reax_list far_nbrs, 
        int num_atom_types, int n, int N, int total_cap,
        int *cm_entries, int *max_cm_entries,
        int *bonds, int *max_bonds,
        int *hbonds, int *max_hbonds )
{
    int i, j, pj; 
    int start_i, end_i;
    int type_i, type_j;
    int ihb, jhb;
    int local;
    int num_bonds, num_hbonds, num_cm_entries;
    real cutoff;
    real r_ij; 
    real C12, C34, C56;
    real BO, BO_s, BO_pi, BO_pi2;
    single_body_parameters *sbp_i, *sbp_j;
    two_body_parameters *twbp;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= total_cap )
    {
        return;
    }

    num_bonds = 0;
    num_hbonds = 0;
    num_cm_entries = 0;

    if ( i < N )
    {
        atom_i = &my_atoms[i];
        type_i = atom_i->type;
        start_i = Cuda_Start_Index( i, &far_nbrs );
        end_i = Cuda_End_Index( i, &far_nbrs );
        sbp_i = &sbp[type_i];

        if ( i < n )
        { 
            local = TRUE;
            cutoff = control->nonb_cut;
            ++num_cm_entries;
//            ihb = sbp_i->p_hbond;
        }   
        else
        {
            local = FALSE;
            cutoff = control->bond_cut;
//            ihb = NON_H_BONDING_ATOM; 
        } 

        ihb = NON_H_BONDING_ATOM; 

        for ( pj = start_i; pj < end_i; ++pj )
        { 
            nbr_pj = &far_nbrs.far_nbr_list[pj];
            j = nbr_pj->nbr;
            atom_j = &my_atoms[j];

            if ( nbr_pj->d <= control->nonb_cut )
            {
                type_j = my_atoms[j].type;
                sbp_j = &sbp[type_j];
                ihb = sbp_i->p_hbond;
                jhb = sbp_j->p_hbond;

                if ( local == TRUE )
                {
                    if ( i < j && (j < n || atom_i->orig_id < atom_j->orig_id) )
                    {
                        ++num_cm_entries;
                    }
                    else if ( i > j && (j < n || atom_j->orig_id > atom_i->orig_id) )
                    {
                        ++num_cm_entries;
                    }
                }
                else
                {
                    if ( i > j && j < n && atom_j->orig_id < atom_i->orig_id )
                    {
                        ++num_cm_entries;
                    }
                }

                /* atom i: H bonding, ghost
                 * atom j: H atom, native */
                if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut 
                        && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n )
                {
                    ++num_hbonds;
                }

//                if ( i >= n )
//                {
//                    ihb = NON_H_BONDING_ATOM;
//                }
            }

            if ( nbr_pj->d <= cutoff )
            {
                type_j = my_atoms[j].type;
                r_ij = nbr_pj->d;
                sbp_j = &sbp[type_j];
                twbp = &tbp[ index_tbp(type_i ,type_j, num_atom_types) ];

                if ( local == TRUE )
                {
                    /* atom i: H atom OR H bonding atom, native */
                    if ( control->hbond_cut > 0.0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM)
                            && nbr_pj->d <= control->hbond_cut )
                    {
                        jhb = sbp_j->p_hbond;

                        /* atom i: H atom, native
                         * atom j: H bonding atom */
                        if( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                        {
                            ++num_hbonds;
                        }
                        /* atom i: H bonding atom, native
                         * atom j: H atom, native */
                        else if( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
                        {
                            ++num_hbonds;
                        }
                    }
                }

                /* uncorrected bond orders */
                if ( nbr_pj->d <= control->bond_cut )
                {
                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
                    {
                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                    }
                    else
                    {
                        C12 = 0.0;
                        BO_s = 0.0;
                    }

                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
                    {
                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                        BO_pi = EXP( C34 );
                    }
                    else
                    {
                        C34 = 0.0;
                        BO_pi = 0.0;
                    }

                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
                    {
                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
                        BO_pi2= EXP( C56 );
                    }
                    else
                    {
                        C56 = 0.0;
                        BO_pi2 = 0.0;
                    }

                    /* initially BO values are the uncorrected ones, page 1 */
                    BO = BO_s + BO_pi + BO_pi2;

                    if ( BO >= control->bo_cut )
                    {
                        ++num_bonds;
                    }
                }
            }
        }
    }

    bonds[i] = num_bonds;
    max_bonds[i] = MAX( (int)(num_bonds * 2), MIN_BONDS );

    hbonds[i] = num_hbonds;
    max_hbonds[i] = MAX( (int)(num_hbonds * SAFE_ZONE), MIN_HBONDS );

    cm_entries[i] = num_cm_entries;
    max_cm_entries[i] = MAX( (int)(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
}


void Cuda_Estimate_Storages( reax_system *system, control_params *control, 
        reax_list **lists, int realloc_bonds, int realloc_hbonds, int realloc_cm,
        int step )
{
    int blocks;

    blocks = system->total_cap / ST_BLOCK_SIZE + 
        (((system->total_cap % ST_BLOCK_SIZE == 0)) ? 0 : 1);

    k_estimate_storages <<< blocks, ST_BLOCK_SIZE >>>
        ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
          (control_params *)control->d_control_params,
          *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
          system->n, system->N, system->total_cap,
          system->d_cm_entries, system->d_max_cm_entries,
          system->d_bonds, system->d_max_bonds,
          system->d_hbonds, system->d_max_hbonds );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    if ( realloc_bonds == TRUE )
    {
        Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds,
                system->total_cap );
        copy_host_device( &system->total_bonds, system->d_total_bonds, sizeof(int), 
                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" );
    }

    if ( system->numH > 0 && control->hbond_cut > 0.0 )
    {
        if ( realloc_hbonds == TRUE )
        {
            Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds,
                    system->total_cap );
            copy_host_device( &system->total_hbonds, system->d_total_hbonds, sizeof(int), 
                    cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
        }
    }
    else
    {
        if ( step == 0 )
        {
#if defined(DEBUG_FOCUS)
            if ( system->numH == 0 )
            {
                fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: NO HYDROGEN ATOMS FOUND\n" );
            }
#endif

#if defined(DEBUG_FOCUS)
            if ( control->hbond_cut <= 0.0 )
            {
                fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: BOND CUTOFF LENGTH IS ZERO\n" );
            }
#endif

            control->hbond_cut = 0.0;
            k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params );
        }
    }

    if ( realloc_cm == TRUE )
    {
        Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, system->total_cap );
        copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int),
                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
    }

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p:%d -->\n", system->my_rank );
    fprintf( stderr, " TOTAL DEVICE BOND COUNT: %d \n", system->total_bonds );
    fprintf( stderr, " TOTAL DEVICE HBOND COUNT: %d \n", system->total_hbonds );
    fprintf( stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", system->total_cm_entries );
#endif
}


int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *control, 
        int step, reax_list **lists, int *thbody )
{
    int ret;

    ret = SUCCESS;

    cuda_memset( thbody, 0, system->total_bonds * sizeof(int), "scratch::thbody" );

    Estimate_Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
        ( system->d_my_atoms, (control_params *)control->d_control_params, 
          *(lists[BONDS]), system->n, system->N, thbody );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );

    copy_host_device( &system->total_thbodies, system->d_total_thbodies, sizeof(int),
            cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" );

    if ( step == 0 )
    {
        system->total_thbodies = MAX( (int)(system->total_thbodies * SAFE_ZONE), MIN_3BODIES );
        system->total_thbodies_indices = system->total_bonds;

        /* create Three-body list */
        Cuda_Make_List( system->total_thbodies_indices, system->total_thbodies,
                TYP_THREE_BODY, lists[THREE_BODIES] );
    }

    if ( system->total_thbodies > lists[THREE_BODIES]->max_intrs ||
            system->total_bonds > lists[THREE_BODIES]->n )
    {
        if ( system->total_thbodies > lists[THREE_BODIES]->max_intrs )
        {
            system->total_thbodies = MAX( (int)(lists[THREE_BODIES]->max_intrs * SAFE_ZONE),
                    system->total_thbodies );
        }
        if ( system->total_bonds > lists[THREE_BODIES]->n )
        {
            system->total_thbodies_indices = MAX( (int)(lists[THREE_BODIES]->n * SAFE_ZONE),
                    system->total_bonds );
        }

        dev_workspace->realloc.thbody = TRUE;
        ret = FAILURE;
    }

    return ret;
}


CUDA_DEVICE real Init_Charge_Matrix_Entry( single_body_parameters *sbp_i, real *ctap,
        control_params *control, int i, int j, real r_ij, real gamma, MATRIX_ENTRY_POSITION pos )
{
    real taper, dr3gamij_1, dr3gamij_3, ret;

    ret = 0.0;

    switch ( control->charge_method )
    {
    case QEQ_CM:
    case EE_CM:
    case ACKS2_CM:
        switch ( pos )
        {
            case OFF_DIAGONAL:
                taper = ctap[7] * r_ij + ctap[6];
                taper = taper * r_ij + ctap[5];
                taper = taper * r_ij + ctap[4];
                taper = taper * r_ij + ctap[3];
                taper = taper * r_ij + ctap[2];
                taper = taper * r_ij + ctap[1];
                taper = taper * r_ij + ctap[0];    

                /* shielding */
                dr3gamij_1 = r_ij * r_ij * r_ij + gamma;
                dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );

                //TODO: investigate why conditional is excluded (OpenMP code below)
//                ret = ((i == j) ? 0.5 : 1.0) * Tap * EV_to_KCALpMOL / dr3gamij_3;
                ret = taper * EV_to_KCALpMOL / dr3gamij_3;
                break;

            case DIAGONAL:
                ret = sbp_i->eta;
                break;

            default:
//                fprintf( stderr, "[ERROR] Invalid matrix position. Terminating...\n" );
//                exit( INVALID_INPUT );
                break;
        }
        break;


    default:
//        fprintf( stderr, "[ERROR] Invalid charge method. Terminating...\n" );
//        exit( INVALID_INPUT );
        break;
    }

    return ret;
}


CUDA_DEVICE real Init_Charge_Matrix_Entry_Tab( LR_lookup_table *t_LR, real r_ij,
        int ti, int tj, int num_atom_types )
{
    int r, tmin, tmax;
    real val, dif, base;
    LR_lookup_table *t; 

    tmin = MIN( ti, tj );
    tmax = MAX( ti, tj );
    t = &t_LR[ index_lr(tmin,tmax, num_atom_types) ];

    /* cubic spline interpolation */
    r = (int)(r_ij * t->inv_dx);
    if ( r == 0 )
    {
        ++r;
    }
    base = (real)(r + 1) * t->dx;
    dif = r_ij - base;
    val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif
        + t->ele[r].a;
    val *= EV_to_KCALpMOL / C_ele;

    return val;
}


//static void Init_Charge_Matrix_Remaining_Entries( reax_system *system,
//        control_params *control, reax_list *far_nbrs,
//        sparse_matrix * H, sparse_matrix * H_sp,
//        int * Htop, int * H_sp_top )
//{
//    int i, j, pj;
//    real d, xcut, bond_softness, * X_diag;
//
//    switch ( control->charge_method )
//    {
//        case QEQ_CM:
//            break;
//
//        case EE_CM:
//            H->start[system->N_cm - 1] = *Htop;
//            H_sp->start[system->N_cm - 1] = *H_sp_top;
//
//            for ( i = 0; i < system->N_cm - 1; ++i )
//            {
//                H->j[*Htop] = i;
//                H->val[*Htop] = 1.0;
//                *Htop = *Htop + 1;
//
//                H_sp->j[*H_sp_top] = i;
//                H_sp->val[*H_sp_top] = 1.0;
//                *H_sp_top = *H_sp_top + 1;
//            }
//
//            H->j[*Htop] = system->N_cm - 1;
//            H->val[*Htop] = 0.0;
//            *Htop = *Htop + 1;
//
//            H_sp->j[*H_sp_top] = system->N_cm - 1;
//            H_sp->val[*H_sp_top] = 0.0;
//            *H_sp_top = *H_sp_top + 1;
//            break;
//
//        case ACKS2_CM:
//            X_diag = (real*) scalloc( system->N, sizeof(real),
//                    "Init_Charge_Matrix_Remaining_Entries::X_diag" );
//
//            H->start[system->N] = *Htop;
//            H_sp->start[system->N] = *H_sp_top;
//
//            for ( i = 0; i < system->N; ++i )
//            {
//                H->j[*Htop] = i;
//                H->val[*Htop] = 1.0;
//                *Htop = *Htop + 1;
//
//                H_sp->j[*H_sp_top] = i;
//                H_sp->val[*H_sp_top] = 1.0;
//                *H_sp_top = *H_sp_top + 1;
//            }
//
//            H->j[*Htop] = system->N;
//            H->val[*Htop] = 0.0;
//            *Htop = *Htop + 1;
//
//            H_sp->j[*H_sp_top] = system->N;
//            H_sp->val[*H_sp_top] = 0.0;
//            *H_sp_top = *H_sp_top + 1;
//
//            for ( i = 0; i < system->N; ++i )
//            {
//                H->start[system->N + i + 1] = *Htop;
//                H_sp->start[system->N + i + 1] = *H_sp_top;
//
//                H->j[*Htop] = i;
//                H->val[*Htop] = 1.0;
//                *Htop = *Htop + 1;
//
//                H_sp->j[*H_sp_top] = i;
//                H_sp->val[*H_sp_top] = 1.0;
//                *H_sp_top = *H_sp_top + 1;
//
//                for ( pj = Start_Index(i, far_nbrs); pj < End_Index(i, far_nbrs); ++pj )
//                {
//                    if ( far_nbrs->far_nbr_list[pj].d <= control->r_cut )
//                    {
//                        j = far_nbrs->far_nbr_list[pj].nbr;
//
//                        xcut = ( system->reaxprm.sbp[ system->atoms[i].type ].b_s_acks2
//                                + system->reaxprm.sbp[ system->atoms[j].type ].b_s_acks2 )
//                            / 2.0;
//
//                        if ( far_nbrs->far_nbr_list[pj].d < xcut &&
//                                far_nbrs->far_nbr_list[pj].d > 0.001 )
//                        {
//                            d = far_nbrs->far_nbr_list[pj].d / xcut;
//                            bond_softness = system->reaxprm.gp.l[34] * POW( d, 3.0 ) * POW( 1.0 - d, 6.0 );
//
//                            H->j[*Htop] = system->N + j + 1;
//                            H->val[*Htop] = MAX( 0.0, bond_softness );
//                            *Htop = *Htop + 1;
//
//                            H_sp->j[*H_sp_top] = system->N + j + 1;
//                            H_sp->val[*H_sp_top] = MAX( 0.0, bond_softness );
//                            *H_sp_top = *H_sp_top + 1;
//
//                            X_diag[i] -= bond_softness;
//                            X_diag[j] -= bond_softness;
//                        }
//                    }
//                }
//
//                H->j[*Htop] = system->N + i + 1;
//                H->val[*Htop] = 0.0;
//                *Htop = *Htop + 1;
//
//                H_sp->j[*H_sp_top] = system->N + i + 1;
//                H_sp->val[*H_sp_top] = 0.0;
//                *H_sp_top = *H_sp_top + 1;
//            }
//
//            H->start[system->N_cm - 1] = *Htop;
//            H_sp->start[system->N_cm - 1] = *H_sp_top;
//
//            for ( i = system->N + 1; i < system->N_cm - 1; ++i )
//            {
//                for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
//                {
//                    if ( H->j[pj] == i )
//                    {
//                        H->val[pj] = X_diag[i - system->N - 1];
//                        break;
//                    }
//                }
//
//                for ( pj = H_sp->start[i]; pj < H_sp->start[i + 1]; ++pj )
//                {
//                    if ( H_sp->j[pj] == i )
//                    {
//                        H_sp->val[pj] = X_diag[i - system->N - 1];
//                        break;
//                    }
//                }
//            }
//
//            for ( i = system->N + 1; i < system->N_cm - 1; ++i )
//            {
//                H->j[*Htop] = i;
//                H->val[*Htop] = 1.0;
//                *Htop = *Htop + 1;
//
//                H_sp->j[*H_sp_top] = i;
//                H_sp->val[*H_sp_top] = 1.0;
//                *H_sp_top = *H_sp_top + 1;
//            }
//
//            H->j[*Htop] = system->N_cm - 1;
//            H->val[*Htop] = 0.0;
//            *Htop = *Htop + 1;
//
//            H_sp->j[*H_sp_top] = system->N_cm - 1;
//            H_sp->val[*H_sp_top] = 0.0;
//            *H_sp_top = *H_sp_top + 1;
//
//            sfree( X_diag, "Init_Charge_Matrix_Remaining_Entries::X_diag" );
//            break;
//
//        default:
//            break;
//    }
//}


CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters *sbp, 
        control_params *control, reax_list hbonds, int N )
{
    int i;
    int type_i;
    int ihb, ihb_top;
    single_body_parameters *sbp_i;
    reax_atom *atom_i;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

    atom_i = &my_atoms[i];
    type_i = atom_i->type;
    sbp_i = &sbp[type_i];

    if ( control->hbond_cut > 0.0 )
    {
        ihb = sbp_i->p_hbond;

        if ( ihb == H_ATOM  || ihb == H_BONDING_ATOM )
        {
            ihb_top = Cuda_Start_Index( atom_i->Hindex, &hbonds );
        }
        else
        {
            ihb_top = -1;
        }
    }

    printf( "atom %6d: ihb = %2d, ihb_top = %2d\n", i, ihb, ihb_top );
}


CUDA_GLOBAL void k_init_forces( reax_atom *my_atoms, single_body_parameters *sbp, 
        two_body_parameters *tbp, storage workspace, control_params *control, 
        reax_list far_nbrs_list, reax_list bonds_list, reax_list hbonds_list, 
        LR_lookup_table *t_LR, int n, int N, int num_atom_types, int renbr,
        int *max_cm_entries, int *realloc_cm_entries,
        int *max_bonds, int *realloc_bonds,
        int *max_hbonds, int *realloc_hbonds )
{
    int i, j, pj;
    int start_i, end_i;
    int type_i, type_j;
    int Htop, btop_i, ihb, jhb, ihb_top;
    int num_bonds, num_hbonds, num_cm_entries;
    int local, flag, flag2, flag3;
    real r_ij, cutoff;
    single_body_parameters *sbp_i, *sbp_j;
    two_body_parameters *twbp;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;
    sparse_matrix *H;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

    H = &workspace.H;
    Htop = H->start[i];

    atom_i = &my_atoms[i];
    type_i = atom_i->type;
    start_i = Cuda_Start_Index( i, &far_nbrs_list );
    end_i = Cuda_End_Index( i, &far_nbrs_list );
    btop_i = Cuda_Start_Index( i, &bonds_list );
    sbp_i = &sbp[type_i];

    if ( i < n )
    {
        local = TRUE;
        cutoff = control->nonb_cut;

        //update bond mark here
        workspace.bond_mark[i] = 0;
    }
    else
    {
        local = FALSE;
        cutoff = control->bond_cut;

        //update bond mark here
        workspace.bond_mark[i] = 1000;
    }

    ihb = NON_H_BONDING_ATOM;
    ihb_top = -1;

    if ( local == TRUE )
    {
        H->entries[Htop].j = i;
        H->entries[Htop].val = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
                i, H->entries[Htop].j, 0.0, 0.0, DIAGONAL );
        ++Htop;
    }

    if ( control->hbond_cut > 0.0 )
    {
        ihb = sbp_i->p_hbond;

        if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
        {
            ihb_top = Cuda_Start_Index( atom_i->Hindex, &hbonds_list );
        }
        else
        {
            ihb_top = -1;
        }
    }

    /* update i-j distance - check if j is within cutoff */
    for ( pj = start_i; pj < end_i; ++pj )
    {
        nbr_pj = &far_nbrs_list.far_nbr_list[pj];
        j = nbr_pj->nbr;
        atom_j = &my_atoms[j];

        if ( renbr )
        {
            if ( nbr_pj->d <= cutoff )
            {
                flag = TRUE;
            }
            else
            {
                flag = FALSE;
            }

            if ( nbr_pj->d <= control->nonb_cut )
            {
                flag2 = TRUE;
            }
            else
            {
                flag2 = FALSE;
            }

        }
        else
        {
            if ( i < j )
            {
                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
            }
            else
            {
                nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
                nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
                nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
            }

            if ( nbr_pj->d <= SQR( control->nonb_cut ) )
            {
                flag2 = TRUE;
            }
            else
            {
                flag2 = FALSE;
            }

            if ( nbr_pj->d <= SQR( control->nonb_cut ) )
            {
                nbr_pj->d = SQRT( nbr_pj->d );
                flag = TRUE;
            }
            else
            {
                flag = FALSE;
            }
        }
        if ( flag2 == TRUE )
        {
            type_j = atom_j->type;
            sbp_j = &sbp[type_j];
            ihb = sbp_i->p_hbond;
            jhb = sbp_j->p_hbond;

            /* atom i: H bonding, ghost
             * atom j: H atom, native */
            if ( control->hbond_cut > 0.0 && nbr_pj->d <= control->hbond_cut
                    && ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n ) 
            {
                hbonds_list.hbond_list[ihb_top].nbr = j;
                hbonds_list.hbond_list[ihb_top].scl = -1;
                hbonds_list.hbond_list[ihb_top].ptr = nbr_pj;

                //CUDA SPECIFIC
                hbonds_list.hbond_list[ihb_top].sym_index = -1;
                rvec_MakeZero( hbonds_list.hbond_list[ihb_top].hb_f );

                ++ihb_top;
            }

            //if ((i < n) || (j < n))
            //if (local == TRUE || ((i >= n) &&(j < n)))

            flag3 = FALSE;
            if ( i < j && i < n && (j < n || atom_i->orig_id < atom_j->orig_id) )
            {
                flag3 = TRUE;
            }
            else if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id )
            {
                flag3 = TRUE;
            }
            else if ( i > j && i < n && (j < n || atom_j->orig_id < atom_i->orig_id ) )
            {
                flag3 = TRUE;
            }

            if ( flag3 == TRUE )
            {
                twbp = &tbp[ index_tbp(type_i,type_j,num_atom_types) ];
                r_ij = nbr_pj->d;

                //if (renbr) {
                H->entries[Htop].j = j;
                if ( control->tabulate == 0 )
                {
                    H->entries[Htop].val = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
                            control, i, H->entries[Htop].j, r_ij, twbp->gamma, OFF_DIAGONAL );
                }
                else
                {
                    H->entries[Htop].val = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij, type_i, type_j,num_atom_types );
                }
                //}
                ++Htop;
            }
        }

        if ( flag == TRUE )
        {
            type_j = atom_j->type;
            r_ij = nbr_pj->d;
            sbp_j = &sbp[type_j];
            twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];

            if ( local == TRUE )
            {
                /* H matrix entry */
//                if( j < n || atom_i->orig_id < atom_j->orig_id ) {//tryQEq||1
//                    H->entries[Htop].j = j;
//                    if( control->tabulate == 0 )
//                            H->entries[Htop].val = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
//                                    control, i, H->entries[Htop].j, r_ij, twbp->gamma, OFF_DIAGONAL );
//                    else
//                        H->entries[Htop].val = Init_Charge_Matrix_Entry_Tab(t_LR, r_ij, type_i, type_j,num_atom_types);
//                    ++Htop;
//                } 
//                else if( j < n || atom_i->orig_id > atom_j->orig_id ) {//tryQEq||1
//                    H->entries[Htop].j = j;
//                    if( control->tabulate == 0 )
//                            H->entries[Htop].val = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
//                                    control, i, H->entries[Htop].j, r_ij, twbp->gamma, OFF_DIAGONAL );
//                    else
//                        H->entries[Htop].val = Init_Charge_Matrix_Entry_Tab(t_LR, r_ij, type_i, type_j,num_atom_types);
//                    ++Htop;
//                } 
                //bool condition = !((i >= n) && (j >= n));

                /* hydrogen bond lists */
                if ( control->hbond_cut > 0.0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) &&
                        nbr_pj->d <= control->hbond_cut )
                {
                    jhb = sbp_j->p_hbond;

                    /* atom i: H atom, native
                     * atom j: H bonding atom */
                    if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
                    {
                        hbonds_list.hbond_list[ihb_top].nbr = j;

                        if ( i < j )
                        {
                            hbonds_list.hbond_list[ihb_top].scl = 1;
                        }
                        else
                        {
                            hbonds_list.hbond_list[ihb_top].scl = -1;
                        }
                        hbonds_list.hbond_list[ihb_top].ptr = nbr_pj;

                        //CUDA SPECIFIC
                        hbonds_list.hbond_list[ihb_top].sym_index = -1;
                        rvec_MakeZero( hbonds_list.hbond_list[ihb_top].hb_f );

                        ++ihb_top;
                    }
                    /* atom i: H bonding atom, native
                     * atom j: H atom, native */
                    else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
                    {
                        //jhb_top = End_Index( atom_j->Hindex, hbonds_list );
                        hbonds_list.hbond_list[ihb_top].nbr = j;
                        hbonds_list.hbond_list[ihb_top].scl = -1;
                        hbonds_list.hbond_list[ihb_top].ptr = nbr_pj;

                        //CUDA SPECIFIC
                        hbonds_list.hbond_list[ihb_top].sym_index = -1;
                        rvec_MakeZero( hbonds_list.hbond_list[ihb_top].hb_f );

                        ++ihb_top;
                    }
                }
            }

            /* uncorrected bond orders */
            if ( nbr_pj->d <= control->bond_cut &&
                    Cuda_BOp( bonds_list, control->bo_cut, i, btop_i, nbr_pj,
                        sbp_i, sbp_j, twbp, workspace.dDeltap_self,
                        workspace.total_bond_order ) == TRUE )
            {
                ++btop_i;

                /* TODO: Need to do later... since i and j are parallel */
//                if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
//                {
//                    workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
//                }
//                else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
//                {
//                    workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
//                }
            }
        }
    }

    Cuda_Set_End_Index( i, btop_i, &bonds_list );
    H->end[i] = Htop;
//    if( local == TRUE )
//    {
        if ( control->hbond_cut > 0.0 && ihb_top > 0 && (ihb == H_ATOM || ihb == H_BONDING_ATOM) )
        {
            Cuda_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds_list );
        }
//    }

    num_bonds = btop_i - Cuda_Start_Index( i, &bonds_list );
    num_hbonds = ihb_top - Cuda_Start_Index( atom_i->Hindex, &hbonds_list );
    num_cm_entries = Htop - H->start[i];

    /* copy (h)bond info to atom structure
     * (needed for atom ownership transfer via MPI) */
    my_atoms[i].num_bonds = num_bonds;
    my_atoms[i].num_hbonds = num_hbonds;

    /* reallocation checks */
    if ( num_bonds > max_bonds[i] )
    {
        *realloc_bonds = TRUE;
    }

    if ( num_hbonds > max_hbonds[i] )
    {
        *realloc_hbonds = TRUE;
    }

    if ( num_cm_entries > max_cm_entries[i] )
    {
        *realloc_cm_entries = TRUE;
    }
}


CUDA_GLOBAL void k_init_bond_mark( int offset, int n, int *bond_mark )
{
    int i;

    i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
    {
        return;
    }

    bond_mark[offset + threadIdx.x] = 1000;
}


CUDA_GLOBAL void New_fix_sym_dbond_indices( reax_list pbonds, int N )
{
    int i, j, k, nbr;
    bond_data *ibond, *jbond;
    int atom_j;
    reax_list *bonds;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

    bonds = &pbonds;

    for ( j = Cuda_Start_Index(i, bonds); j < Cuda_End_Index(i, bonds); j++ )
    {
        ibond = &bonds->bond_list[j];
        nbr = ibond->nbr;

        for ( k = Cuda_Start_Index(nbr, bonds); k < Cuda_End_Index(nbr, bonds); k++ )
        {
            jbond = &bonds->bond_list[k];
            atom_j = jbond->nbr;

            if ( atom_j == i )
            {
                if ( i > nbr )
                {
                    ibond->dbond_index = j;
                    jbond->dbond_index = j;

                    ibond->sym_index = k;
                    jbond->sym_index = j;
                }
            }
        }
    }
}


CUDA_GLOBAL void New_fix_sym_hbond_indices( reax_atom *my_atoms, reax_list hbonds, int N )
{
    int i, j, k;
    int nbr, nbrstart, nbrend;
    int start, end;
    hbond_data *ihbond, *jhbond;
    int __THREADS_PER_ATOM__;
    int thread_id;
    int warp_id;
    int lane_id;

    __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
    thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    warp_id = thread_id / __THREADS_PER_ATOM__;

    if ( warp_id > N )
    {
        return;
    }

    lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
    i = warp_id;
    start = Cuda_Start_Index( my_atoms[i].Hindex, &hbonds );
    end = Cuda_End_Index( my_atoms[i].Hindex, &hbonds );
    j = start + lane_id;

    while ( j < end )
    {
        ihbond = &hbonds.hbond_list[j];
        nbr = ihbond->nbr;

        nbrstart = Cuda_Start_Index( my_atoms[nbr].Hindex, &hbonds );
        nbrend = Cuda_End_Index( my_atoms[nbr].Hindex, &hbonds );

        for ( k = nbrstart; k < nbrend; k++ )
        {
            jhbond = &hbonds.hbond_list[k];

            if ( jhbond->nbr == i )
            {
                ihbond->sym_index = k;
                jhbond->sym_index = j;
                break;
            }
        }

        j += __THREADS_PER_ATOM__;
    }
}


CUDA_GLOBAL void k_update_bonds( reax_atom *my_atoms, reax_list bonds, int n )
{
    int i;
    
    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= n )
    {
        return;
    }

    my_atoms[i].num_bonds = Cuda_Num_Entries( i, &bonds );
}


CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n )
{
    int Hindex;
    int i;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= n )
    {
        return;
    }

    Hindex = my_atoms[i].Hindex;
    my_atoms[i].num_hbonds = Cuda_Num_Entries( Hindex, &hbonds );
}


#if defined(DEBUG_FOCUS)
CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
{
    int i; 

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= n )
    {
        return;
    }

    printf( "%8d: %24.15f, %24.15f, %24.15f\n",
            my_atoms[i].orig_id,
            f[i][0],
            f[i][1],
            f[i][2] );
}


static void Print_Forces( reax_system *system )
{
    int blocks;
    
    blocks = (system->n) / DEF_BLOCK_SIZE + 
        (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1);

    k_print_forces <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, dev_workspace->f, system->n );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}


CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list p_hbonds, int n, int rank, int step )
{
    int i, k, pj, start, end; 
    reax_list *hbonds;
    hbond_data *hbond_list, *hbond_jk;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= n )
    {
        return;
    }

    hbonds = &p_hbonds;
    start = Cuda_Start_Index( my_atoms[i].Hindex, hbonds );
    end = Cuda_End_Index( my_atoms[i].Hindex, hbonds );
    hbond_list = hbonds->hbond_list;

    for ( pj = start; pj < end; ++pj )
    {
        k = hbond_list[pj].nbr;
        hbond_jk = &hbond_list[pj];

        printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n",
                rank, step, my_atoms[i].Hindex, k,
                hbond_jk->hb_f[0],
                hbond_jk->hb_f[1],
                hbond_jk->hb_f[2] );
    }
}


static void Print_HBonds( reax_system *system, int step )
{
    int blocks;
    
    blocks = (system->n) / DEF_BLOCK_SIZE + 
        (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1);

    k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, *(lists[HBONDS]), system->n, system->my_rank, step );
    cudaDeviceSynchronize( );
    cudaCheckError( );
}
#endif


CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, 
        reax_list bonds, real *total_bond_order, int N )
{
    int i, pj; 
//    int j; 
    int start_i, end_i;
//    far_neighbor_data *nbr_pj;
//    reax_atom *atom_i, *atom_j;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    if ( i >= N )
    {
        return;
    }

//    atom_i = &my_atoms[i];
    start_i = Cuda_Start_Index(i, &far_nbrs);
    end_i = Cuda_End_Index(i, &far_nbrs);

    for( pj = start_i; pj < end_i; ++pj )
    { 
//        nbr_pj = &far_nbrs.far_nbr_list[pj];
//        j = nbr_pj->nbr;
//        atom_j = &my_atoms[j];
//
//        total_bond_order[i]++;
//        atom_i->Hindex++;
    }
}


CUDA_GLOBAL void k_bond_mark( reax_list p_bonds, storage p_workspace, int N )
{
    int i, j, k;
    reax_list *bonds = &p_bonds;
    storage *workspace = &p_workspace;

//    int i = blockIdx.x * blockDim.x + threadIdx.x;
//    if ( i >= N )
//    {
//        return;
//    }

    for ( i = 0; i < N; i++ )
    {
        for (k = Cuda_Start_Index(i, bonds); k < Cuda_End_Index(i, bonds); k++)
        {
            bond_data *bdata = &bonds->bond_list[k];
            j = bdata->nbr;

            if (i < j )
            {
                if ( workspace->bond_mark[j] > (workspace->bond_mark[i] + 1) )
                {
                    workspace->bond_mark[j] = workspace->bond_mark[i] + 1;    
                }
                else if ( workspace->bond_mark[i] > (workspace->bond_mark[j] + 1) )
                {
                    workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
                }
            }
        }
    }
}


int Cuda_Init_Forces( reax_system *system, control_params *control,
        simulation_data *data, storage *workspace,
        reax_list **lists, output_controls *out_control ) 
{
    int ret, ret_bonds, ret_hbonds, ret_cm;
    int blocks, hblocks;

    /* init the workspace (bond_mark) */
//    cuda_memset( dev_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" );
//
//    blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
//       (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
//    k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
//       ( system->n, (system->N - system->n), dev_workspace->bond_mark );
//    cudaDeviceSynchronize( );
//    cudaCheckError( );

    blocks = (system->N) / DEF_BLOCK_SIZE + 
        (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);

//    k_init_bond_orders <<< blocks, DEF_BLOCK_SIZE >>>
//        ( system->d_my_atoms, *(lists[FAR_NBRS]), *(lists[BONDS]),
//          dev_workspace->total_bond_order, system->N );
//    cudaDeviceSynchronize( );
//    cudaCheckError( );
//
//    k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>>
//        ( system->d_my_atoms, system->reax_param.d_sbp,
//          (control_params *)control->d_control_params,
//          *(lists[HBONDS]), system->N );
//    cudaDeviceSynchronize( );
//    cudaCheckError( );

    /* reset reallocation flags on device */
    cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int), 
            "Cuda_Init_Forces::d_realloc_bonds" );
    cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int), 
            "Cuda_Init_Forces::d_realloc_hbonds" );
    cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int), 
            "Cuda_Init_Forces::d_realloc_cm_entries" );

    k_init_forces <<< blocks, DEF_BLOCK_SIZE >>>
        ( system->d_my_atoms, system->reax_param.d_sbp,
          system->reax_param.d_tbp, *dev_workspace,
          (control_params *)control->d_control_params,
          *(lists[FAR_NBRS]), *(lists[BONDS]),
          *(lists[HBONDS]), workspace->d_LR, system->n,
          system->N, system->reax_param.num_atom_types,
          (((data->step-data->prev_steps) % control->reneighbor) == 0),
          system->d_max_cm_entries, system->d_realloc_cm_entries,
          system->d_max_bonds, system->d_realloc_bonds,
          system->d_max_hbonds, system->d_realloc_hbonds );
    cudaDeviceSynchronize( );
    cudaCheckError( );

    /* check reallocation flags on device */
    copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int), 
            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
    copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int), 
            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
    copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int), 
            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );

    ret = (ret_bonds == FALSE && ret_hbonds == FALSE && ret_cm == FALSE) ? SUCCESS : FAILURE;

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "[INFO] p%d, step %d: ret = %d, ret_bonds = %d, ret_hbonds = %d, ret_cm = %d\n",
            system->my_rank, data->step, ret, ret_bonds, ret_hbonds, ret_cm );
#endif

    if ( ret == SUCCESS )
    {
        /* fix sym_index and dbond_index */
        New_fix_sym_dbond_indices <<< blocks, BLOCK_SIZE >>> 
            ( *(lists[BONDS]), system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        if ( control->hbond_cut > 0.0 && system->numH > 0 )
        {
            /* make hbond_list symmetric */
            hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM / HB_SYM_BLOCK_SIZE) + 
                ((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1);

            New_fix_sym_hbond_indices <<< hblocks, HB_BLOCK_SIZE >>>
                ( system->d_my_atoms, *(lists[HBONDS]), system->N );
            cudaDeviceSynchronize( );
            cudaCheckError( );
        }

        /* update bond_mark */
//        k_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
//        k_bond_mark <<< 1, 1 >>>
//            ( *(lists[BONDS]), *dev_workspace, system->N );
//        cudaDeviceSynchronize( );
//        cudaCheckError( );
    }
    else
    {
        Cuda_Estimate_Storages( system, control, lists,
               ret_bonds, ret_hbonds, ret_cm, data->step );

        dev_workspace->realloc.bonds = ret_bonds;
        dev_workspace->realloc.hbonds = ret_hbonds;
        dev_workspace->realloc.cm = ret_cm;
    }

    return ret;
}


int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
        simulation_data *data, storage *workspace,
        reax_list **lists, output_controls *out_control ) 
{
    //TODO: implement later when figure out bond_mark usage
    return FAILURE;
}


int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, 
        simulation_data *data, storage *workspace, 
        reax_list **lists, output_controls *out_control )
{
    int update_energy, ret;
//    int hbs, hnbrs_blocks;
    int *thbody;
    static int compute_bonded_part1 = FALSE;
    real *spad = (real *) scratch;
    rvec *rvec_spad;
#if defined(DEBUG_FOCUS)
    real t_start, t_elapsed;
#endif

    update_energy = (out_control->energy_update_freq > 0
            && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
    ret = SUCCESS;

    if ( compute_bonded_part1 == FALSE )
    {
        /* 1. Bond Order Interactions */
#if defined(DEBUG_FOCUS)
        t_start = Get_Time( );

        fprintf( stderr, " Begin Bonded Forces ... %d x %d\n",
                BLOCKS_N, BLOCK_SIZE );
#endif

        Cuda_Calculate_BO_init <<< BLOCKS_N, BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_sbp, 
              *dev_workspace, system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
              system->reax_param.d_tbp, *dev_workspace, 
              *(lists[BONDS]),
              system->reax_param.num_atom_types, system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        Cuda_Update_Uncorrected_BO <<< BLOCKS_N, BLOCK_SIZE >>>
            ( *dev_workspace, *(lists[BONDS]), system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        Cuda_Update_Workspace_After_BO <<< BLOCKS_N, BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
             *dev_workspace, system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

#if defined(DEBUG_FOCUS)
        t_elapsed = Get_Timing_Info( t_start );

        fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n",
                cudaGetLastError( ), t_elapsed );
        fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
#endif

        /* 2. Bond Energy Interactions */
#if defined(DEBUG_FOCUS)
        t_start = Get_Time( );
#endif

        cuda_memset( spad, 0, system->N * (2 * sizeof(real)) , "scratch" );

        Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof(real)* BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
              *dev_workspace, *(lists[BONDS]), 
              system->n, system->reax_param.num_atom_types, spad );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        /* reduction for E_BE */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
                    system->n );
        }

#if defined(DEBUG_FOCUS)
        t_elapsed = Get_Timing_Info( t_start );

        fprintf( stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n",
                cudaGetLastError( ), t_elapsed );
        fprintf( stderr, "Cuda_Bond_Energy Done... \n" );
#endif

        /* 3. Atom Energy Interactions */
#if defined(DEBUG_FOCUS)
        t_start = Get_Time( );
#endif

        cuda_memset( spad, 0, ( 6 * sizeof(real) * system->n ), "scratch" );

        Cuda_Atom_Energy <<< BLOCKS, BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_gp,
              system->reax_param.d_sbp, system->reax_param.d_tbp, *dev_workspace,
              *(lists[BONDS]), system->n, system->reax_param.num_atom_types,
              spad, spad + 2 * system->n, spad + 4 * system->n);
        cudaDeviceSynchronize( );
        cudaCheckError( );

//        Cuda_Atom_Energy_PostProcess <<< BLOCKS, BLOCK_SIZE >>>
//            ( *(lists[BONDS]), *dev_workspace, system->n );
        Cuda_Atom_Energy_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
            ( *(lists[BONDS]), *dev_workspace, system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        /* reduction for E_Lp */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
                    system->n );
        }

        /* reduction for E_Ov */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad + 2 * system->n,
                    &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
                    system->n );
        }

        /* reduction for E_Un */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad + 4 * system->n,
                    &((simulation_data *)data->d_simulation_data)->my_en.e_ov,
                    system->n );
        }

#if defined(DEBUG_FOCUS)
        t_elapsed = Get_Timing_Info( t_start );

        fprintf( stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n",
                cudaGetLastError( ), t_elapsed );
        fprintf( stderr, "test_LonePair_postprocess Done... \n");
#endif

        compute_bonded_part1 = TRUE;
    }

    /* 4. Valence Angles Interactions */
#if defined(DEBUG_FOCUS)
    t_start = Get_Time( );
#endif

    thbody = (int *) scratch;
    ret = Cuda_Estimate_Storage_Three_Body( system, control, data->step,
            lists, thbody );

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "system->total_thbodies = %d, lists:THREE_BODIES->max_intrs = %d,\n",
            system->total_thbodies, lists[THREE_BODIES]->max_intrs );
    fprintf( stderr, "lists:THREE_BODIES->n = %d, lists:BONDS->max_intrs = %d,\n",
            lists[THREE_BODIES]->n, lists[BONDS]->max_intrs );
    fprintf( stderr, "system->total_thbodies = %d\n", system->total_thbodies );
#endif

    if ( ret == SUCCESS )
    {
        Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices );

        cuda_memset( spad, 0, 6 * sizeof(real) * system->N + sizeof(rvec) * system->N * 2, "scratch" );

        Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_gp, 
              system->reax_param.d_sbp, system->reax_param.d_thbp, 
              (control_params *)control->d_control_params,
              *dev_workspace, *(lists[BONDS]), *(lists[THREE_BODIES]),
              system->n, system->N, system->reax_param.num_atom_types, 
              spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N) );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        /* reduction for E_Ang */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
                    system->N );
        }

        /* reduction for E_Pen */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad + 2 * system->N,
                    &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
                    system->N );
        }

        /* reduction for E_Coa */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad + 4 * system->N,
                    &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
                    system->N );
        }

        /* reduction for ext_pres */
        rvec_spad = (rvec *) (spad + 6 * system->N);
        k_reduction_rvec <<< BLOCKS_N, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
            ( rvec_spad, rvec_spad + system->N,  system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof(rvec) * BLOCKS_POW_2_N >>>
            ( rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N );
        cudaDeviceSynchronize ();
        cudaCheckError( );
//        Cuda_Reduction_Sum( rvec_spad,
//                &((simulation_data *)data->d_simulation_data)->my_ext_press,
//                system->N );

        Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
            ( system->d_my_atoms, (control_params *)control->d_control_params,
              *dev_workspace, *(lists[BONDS]), system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

#if defined(DEBUG_FOCUS)
        t_elapsed = Get_Timing_Info( t_start );

        fprintf( stderr, "Three_Body_Interactions ...  Timing %lf \n",
                t_elapsed );
        fprintf( stderr, "Three_Body_Interactions Done... \n" );
#endif

        /* 5. Torsion Angles Interactions */
#if defined(DEBUG_FOCUS)
        t_start = Get_Time( );
#endif

        cuda_memset( spad, 0, 4 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2,
                "scratch" );

        Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>>
            ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
              (control_params *) control->d_control_params, *(lists[BONDS]),
              *(lists[THREE_BODIES]), *dev_workspace, system->n,
              system->reax_param.num_atom_types, 
              spad, spad + 2 * system->n, (rvec *) (spad + 4 * system->n) );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        /* reduction for E_Tor */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
                    system->n );
        }

        /* reduction for E_Con */
        if ( update_energy == TRUE )
        {
            Cuda_Reduction_Sum( spad + 2 * system->n,
                    &((simulation_data *)data->d_simulation_data)->my_en.e_con,
                    system->n );
        }

        /* reduction for ext_pres */
        rvec_spad = (rvec *) (spad + 4 * system->n);
        k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
            ( rvec_spad, rvec_spad + system->n,  system->n );
        cudaDeviceSynchronize( );
        cudaCheckError( );

        k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
                ( rvec_spad + system->n,
                  &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS );
        cudaDeviceSynchronize( );
        cudaCheckError( );
//        Cuda_Reduction_Sum( rvec_spad,
//                &((simulation_data *)data->d_simulation_data)->my_ext_press,
//                system->n );

        Cuda_Torsion_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
                ( system->d_my_atoms, *dev_workspace, *(lists[BONDS]),
                  system->N );
        cudaDeviceSynchronize( );
        cudaCheckError( );

#if defined(DEBUG_FOCUS)
        t_elapsed = Get_Timing_Info( t_start );

        fprintf( stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n",
                cudaGetLastError( ), t_elapsed );
        fprintf( stderr, " Four_Body_ Done... \n");
#endif

        /* 6. Hydrogen Bonds Interactions */
        if ( control->hbond_cut > 0.0 && system->numH > 0 )
        {
#if defined(DEBUG_FOCUS)
            t_start = Get_Time( );
#endif

            cuda_memset( spad, 0,
                    2 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" );

//            hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
//                (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);

            Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>>
//            Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, 
//                    HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>>
                    ( system->d_my_atoms, system->reax_param.d_sbp,
                      system->reax_param.d_hbp, system->reax_param.d_gp,
                      (control_params *) control->d_control_params,
                      *dev_workspace, *(lists[BONDS]), *(lists[HBONDS]),
                      system->n, system->reax_param.num_atom_types,
                      spad, (rvec *) (spad + 2 * system->n), system->my_rank, data->step );
            cudaDeviceSynchronize( );
            cudaCheckError( );

//            if ( data->step == 10 )
//            {
//                Print_HBonds( system, data->step );
//            }

            /* reduction for E_HB */
            if ( update_energy == TRUE )
            {
                Cuda_Reduction_Sum( spad,
                        &((simulation_data *)data->d_simulation_data)->my_en.e_hb,
                        system->n );
            }

            /* reduction for ext_pres */
            rvec_spad = (rvec *) (spad + 2 * system->n);
            k_reduction_rvec <<< BLOCKS, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
                (rvec_spad, rvec_spad + system->n,  system->n);
            cudaDeviceSynchronize( );
            cudaCheckError( );

            k_reduction_rvec <<< 1, BLOCKS_POW_2, sizeof(rvec) * BLOCKS_POW_2 >>>
                (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
            cudaDeviceSynchronize( );
            cudaCheckError( );
//            Cuda_Reduction_Sum( rvec_spad,
//                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
//                    system->n );

            /* post process step1 */
            Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof(rvec) >>>
                ( system->d_my_atoms, *dev_workspace,
                  *(lists[BONDS]), system->N );
            cudaDeviceSynchronize( );
            cudaCheckError( );

            /* post process step2 */
//            hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
//                (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);

            Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>>
                ( system->d_my_atoms, *dev_workspace, *(lists[HBONDS]) );
//            Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, 
//                    HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
//                ( system->d_my_atoms, *dev_workspace, *(lists[HBONDS]), system->N );
            cudaDeviceSynchronize( );
            cudaCheckError( );

#if defined(DEBUG_FOCUS)
            t_elapsed = Get_Timing_Info( t_start );

            fprintf( stderr,
                    "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n",
                    cudaGetLastError( ), t_elapsed );
            fprintf( stderr, "Hydrogen_Bond Done... \n" );
#endif
        }

        compute_bonded_part1 = FALSE;
    }

    return ret;
}


void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
        simulation_data *data, storage *workspace, 
        reax_list **lists, output_controls *out_control,
        mpi_datatypes *mpi_data )
{
    /* van der Waals and Coulomb interactions */
    Cuda_NonBonded_Energy( system, control, workspace, data,
            lists, out_control, (control->tabulate == 0) ? false: true );
}


void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
        simulation_data *data, storage *workspace,
        reax_list **lists, mpi_datatypes *mpi_data )
{
    rvec *f;

    f = (rvec *) host_scratch;
    memset( f, 0, sizeof(rvec) * system->N );

    Cuda_Total_Forces( system, control, data, workspace );

#if defined(PURE_REAX)
    /* now all forces are computed to their partially-final values
     * based on the neighbors information each processor has had.
     * final values of force on each atom needs to be computed by adding up
     * all partially-final pieces */

    //MVAPICH2
    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N ,
            cudaMemcpyDeviceToHost, "total_force:f:get" );

    Coll( system, mpi_data, f, RVEC_PTR_TYPE, mpi_data->mpi_rvec );

    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N,
            cudaMemcpyHostToDevice, "total_force:f:put" );

    Cuda_Total_Forces_PURE( system, dev_workspace );
#endif

}


int Cuda_Compute_Forces( reax_system *system, control_params *control,
        simulation_data *data, storage *workspace, reax_list **lists,
        output_controls *out_control, mpi_datatypes *mpi_data )
{
    int charge_flag, retVal;
    static int init_forces_done = FALSE;

#if defined(LOG_PERFORMANCE)
    real t_start = 0;

    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
    {
        t_start = Get_Time( );
    }
#endif

    retVal = SUCCESS;

    /********* init forces ************/
    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
    {
        charge_flag = TRUE;
    }
    else
    {
        charge_flag = FALSE;
    }

    if ( init_forces_done == FALSE )
    {
        if ( charge_flag == TRUE )
        {
            retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
        }
        else
        {
            retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
        }

        if ( retVal == SUCCESS )
        {
            init_forces_done = TRUE;
        }
    }

    if ( retVal == SUCCESS )
    {
        //validate_sparse_matrix( system, workspace );

#if defined(LOG_PERFORMANCE)
        //MPI_Barrier( MPI_COMM_WORLD );

        if ( system->my_rank == MASTER_NODE )
        {
            Update_Timing_Info( &t_start, &data->timing.init_forces );
        }
#endif

        /********* bonded interactions ************/
        retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );

#if defined(LOG_PERFORMANCE)
        //MPI_Barrier( MPI_COMM_WORLD );

        if ( system->my_rank == MASTER_NODE )
        {
            Update_Timing_Info( &t_start, &data->timing.bonded );
        }
#endif

#if defined(DEBUG_FOCUS)
        fprintf( stderr, "p%d @ step%d: completed bonded\n",
                 system->my_rank, data->step );
        MPI_Barrier( MPI_COMM_WORLD );
#endif
    }

    if ( retVal == SUCCESS )
    {
    /**************** charges ************************/
#if defined(PURE_REAX)
        if ( charge_flag == TRUE )
        {
            Cuda_Compute_Charges( system, control, data, workspace, out_control, mpi_data );
        }

#if defined(LOG_PERFORMANCE)
        //MPI_Barrier( MPI_COMM_WORLD );
        if ( system->my_rank == MASTER_NODE )
        {
            Update_Timing_Info( &t_start, &data->timing.cm );
        }
#endif

#if defined(DEBUG_FOCUS)
        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
        MPI_Barrier( MPI_COMM_WORLD );
#endif

#endif //PURE_REAX

        /********* nonbonded interactions ************/
        Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
                lists, out_control, mpi_data );

#if defined(LOG_PERFORMANCE)
        //MPI_Barrier( MPI_COMM_WORLD );

        if ( system->my_rank == MASTER_NODE )
        {
            Update_Timing_Info( &t_start, &data->timing.nonb );
        }
#endif

#if defined(DEBUG_FOCUS)
        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
                system->my_rank, data->step );
        MPI_Barrier( MPI_COMM_WORLD );
#endif

        /*********** total force ***************/
        Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );

#if defined(LOG_PERFORMANCE)
        //MPI_Barrier( MPI_COMM_WORLD );

        if ( system->my_rank == MASTER_NODE )
        {
            Update_Timing_Info( &t_start, &data->timing.bonded );
        }
#endif

#if defined(DEBUG_FOCUS)
        fprintf( stderr, "p%d @ step%d: total forces computed\n",
                system->my_rank, data->step );
//        Print_Total_Force( system, data, workspace );
        MPI_Barrier( MPI_COMM_WORLD );
#endif

        init_forces_done = FALSE;
    }

    return retVal;
}