/*----------------------------------------------------------------------
  PuReMD - Purdue ReaxFF Molecular Dynamics Program

  Copyright (2010) Purdue University
  Hasan Metin Aktulga, haktulga@cs.purdue.edu
  Joseph Fogarty, jcfogart@mail.usf.edu
  Sagar Pandit, pandit@usf.edu
  Ananth Y Grama, ayg@cs.purdue.edu

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 2 of
  the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  See the GNU General Public License for more details:
  <http://www.gnu.org/licenses/>.
  ----------------------------------------------------------------------*/

#include "reax_types.h"

#include "index_utils.h"
#ifdef HAVE_CUDA
#include "cuda_forces.h"

#include "cuda_linear_solvers.h"
#include "cuda_neighbors.h"
//#include "cuda_bond_orders.h"
#include "validation.h"
#endif

#if defined(PURE_REAX)
#include "forces.h"
#include "bond_orders.h"
#include "bonds.h"
#include "basic_comm.h"
#include "hydrogen_bonds.h"
#include "io_tools.h"
#include "list.h"
#include "lookup.h"
#include "multi_body.h"
#include "nonbonded.h"
#include "qEq.h"
#include "tool_box.h"
#include "torsion_angles.h"
#include "valence_angles.h"
#include "vector.h"
#elif defined(LAMMPS_REAX)
#include "reax_forces.h"
#include "reax_bond_orders.h"
#include "reax_bonds.h"
#include "reax_basic_comm.h"
#include "reax_hydrogen_bonds.h"
#include "reax_io_tools.h"
#include "reax_list.h"
#include "reax_lookup.h"
#include "reax_multi_body.h"
#include "reax_nonbonded.h"
#include "reax_tool_box.h"
#include "reax_torsion_angles.h"
#include "reax_valence_angles.h"
#include "reax_vector.h"
#endif


#ifdef HAVE_CUDA
void Cuda_Total_Forces (reax_system *, control_params *, simulation_data *, storage *);
void Cuda_Total_Forces_PURE (reax_system *, storage *);
#endif


interaction_function Interaction_Functions[NUM_INTRS];


void Dummy_Interaction( reax_system *system, control_params *control,
        simulation_data *data, storage *workspace, reax_list **lists,
        output_controls *out_control )
{
}


void Init_Force_Functions( control_params *control )
{
    Interaction_Functions[0] = BO;
    Interaction_Functions[1] = Bonds; //Dummy_Interaction;
    Interaction_Functions[2] = Atom_Energy; //Dummy_Interaction;
    Interaction_Functions[3] = Valence_Angles; //Dummy_Interaction;
    Interaction_Functions[4] = Torsion_Angles; //Dummy_Interaction;
    if ( control->hbond_cut > 0 )
        Interaction_Functions[5] = Hydrogen_Bonds;
    else Interaction_Functions[5] = Dummy_Interaction;
    Interaction_Functions[6] = Dummy_Interaction; //empty
    Interaction_Functions[7] = Dummy_Interaction; //empty
    Interaction_Functions[8] = Dummy_Interaction; //empty
    Interaction_Functions[9] = Dummy_Interaction; //empty
}


void Compute_Bonded_Forces( reax_system *system, control_params *control,
                            simulation_data *data, storage *workspace,
                            reax_list **lists, output_controls *out_control )
{
    int i;

    /* Mark beginning of a new timestep in bonded energy files */
#if defined(TEST_ENERGY)
    Debug_Marker_Bonded( out_control, data->step );
#endif

    /* Implement all force calls as function pointers */
//  for( i = 0; i < NUM_INTRS; i++ ) {
//#if defined(DEBUG)
//    fprintf( stderr, "p%d: starting f%d\n", system->my_rank, i );
//    MPI_Barrier( MPI_COMM_WORLD );
//#endif
//    (Interaction_Functions[i])( system, control, data, workspace,
//              lists, out_control );
//#if defined(DEBUG)
//    fprintf( stderr, "p%d: f%d done\n", system->my_rank, i );
//    MPI_Barrier( MPI_COMM_WORLD );
//#endif
//  }

    (Interaction_Functions[0])( system, control, data, workspace, lists, out_control );
    (Interaction_Functions[1])( system, control, data, workspace, lists, out_control );
    (Interaction_Functions[2])( system, control, data, workspace, lists, out_control );
    (Interaction_Functions[3])( system, control, data, workspace, lists, out_control );
    (Interaction_Functions[4])( system, control, data, workspace, lists, out_control );
    (Interaction_Functions[5])( system, control, data, workspace, lists, out_control );
}


void Compute_NonBonded_Forces( reax_system *system, control_params *control,
                               simulation_data *data, storage *workspace,
                               reax_list **lists, output_controls *out_control,
                               mpi_datatypes *mpi_data )
{
    /* Mark beginning of a new timestep in nonbonded energy files */
#if defined(TEST_ENERGY)
    Debug_Marker_Nonbonded( out_control, data->step );
#endif

    /* van der Waals and Coulomb interactions */
    if ( control->tabulate == 0 )
        vdW_Coulomb_Energy( system, control, data, workspace,
                            lists, out_control );
    else
        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace,
                                      lists, out_control );

#if defined(DEBUG)
    fprintf( stderr, "p%d: nonbonded forces done\n", system->my_rank );
    MPI_Barrier( MPI_COMM_WORLD );
#endif
}



/* this version of Compute_Total_Force computes forces from
   coefficients accumulated by all interaction functions.
   Saves enormous time & space! */
void Compute_Total_Force( reax_system *system, control_params *control,
                          simulation_data *data, storage *workspace,
                          reax_list **lists, mpi_datatypes *mpi_data )
{
    int i, pj;
    reax_list *bonds = (*lists) + BONDS;

    for ( i = 0; i < system->N; ++i )
        for ( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
            if ( i < bonds->select.bond_list[pj].nbr )
            {
                if ( control->virial == 0 )
                    Add_dBond_to_Forces( i, pj, workspace, lists );
                else
                    Add_dBond_to_Forces_NPT( i, pj, data, workspace, lists );
            }

    //Print_Total_Force( system, data, workspace );
#if defined(PURE_REAX)
    /* now all forces are computed to their partially-final values
       based on the neighbors information each processor has had.
       final values of force on each atom needs to be computed by adding up
       all partially-final pieces */
    Coll( system, mpi_data, workspace->f, mpi_data->mpi_rvec,
          sizeof(rvec) / sizeof(void), rvec_unpacker );
    for ( i = 0; i < system->n; ++i )
        rvec_Copy( system->my_atoms[i].f, workspace->f[i] );

#if defined(TEST_FORCES)
    Coll( system, mpi_data, workspace->f_ele, mpi_data->mpi_rvec, rvec_unpacker);
    Coll( system, mpi_data, workspace->f_vdw, mpi_data->mpi_rvec, rvec_unpacker);
    Coll( system, mpi_data, workspace->f_be, mpi_data->mpi_rvec, rvec_unpacker );
    Coll( system, mpi_data, workspace->f_lp, mpi_data->mpi_rvec, rvec_unpacker );
    Coll( system, mpi_data, workspace->f_ov, mpi_data->mpi_rvec, rvec_unpacker );
    Coll( system, mpi_data, workspace->f_un, mpi_data->mpi_rvec, rvec_unpacker );
    Coll( system, mpi_data, workspace->f_ang, mpi_data->mpi_rvec, rvec_unpacker);
    Coll( system, mpi_data, workspace->f_coa, mpi_data->mpi_rvec, rvec_unpacker);
    Coll( system, mpi_data, workspace->f_pen, mpi_data->mpi_rvec, rvec_unpacker);
    Coll( system, mpi_data, workspace->f_hb, mpi_data->mpi_rvec, rvec_unpacker );
    Coll( system, mpi_data, workspace->f_tor, mpi_data->mpi_rvec, rvec_unpacker);
    Coll( system, mpi_data, workspace->f_con, mpi_data->mpi_rvec, rvec_unpacker);
#endif

#endif
}


#ifdef HAVE_CUDA
void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
                               simulation_data *data, storage *workspace,
                               reax_list **lists, mpi_datatypes *mpi_data )
{
    rvec *f = (rvec *) host_scratch;
    memset (f, 0, sizeof (rvec) * system->N );

    Cuda_Total_Forces (system, control, data, workspace);

#if defined(PURE_REAX)
    /* now all forces are computed to their partially-final values
       based on the neighbors information each processor has had.
       final values of force on each atom needs to be computed by adding up
       all partially-final pieces */

    //MVAPICH2
    get_from_device (f, dev_workspace->f, sizeof (rvec) * system->N , "total_force:f:get");

    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
          sizeof(rvec) / sizeof(void), rvec_unpacker );

    put_on_device (f, dev_workspace->f, sizeof (rvec) * system->N, "total_force:f:put" );

    Cuda_Total_Forces_PURE (system, dev_workspace);
#endif

}
#endif



// Essentially no-cuda copies of cuda kernels, to be used only in the mpi-not-gpu version
////////////////////////
// HBOND ISSUE
void mpi_not_gpu_update_bonds (reax_atom *my_atoms,
                               reax_list bonds,
                               int n)
{
//    int i = blockIdx.x * blockDim.x + threadIdx.x;
    //  if (i >= n) return;
    int i;
    for (i = 0; i < n; i++)
    {
        my_atoms [i].num_bonds =
            MAX(Num_Entries(i, &bonds) * 2, MIN_BONDS);
    }
}


void mpi_not_gpu_update_hbonds (reax_atom *my_atoms,
                                reax_list hbonds,
                                int n)
{
    int Hindex;
    int i;
    //int i = blockIdx.x * blockDim.x + threadIdx.x;
    //if (i >= n) return;
    for (i = 0; i < n; i++)
    {
        Hindex = my_atoms[i].Hindex;
        my_atoms [i].num_hbonds =
            MAX(Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS);
    }
}

// Essentially a copy of cuda_validate_lists, but with all cuda-dependent kernels turned into serial versions
int MPI_Not_GPU_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control,
                                int step, int n, int N, int numH )
{
    int blocks;
    int i, comp, Hindex;
    int *index, *end_index;
    reax_list *bonds, *hbonds;
    reax_atom *my_atoms;
    reallocate_data *realloc;
    realloc = &( workspace->realloc);

    int max_sp_entries, num_hbonds, num_bonds;
    int total_sp_entries;







    //blocks = system->n / DEF_BLOCK_SIZE +
    //    ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);

    //ker_update_bonds <<< blocks, DEF_BLOCK_SIZE >>>
    //    (system->d_my_atoms, *(*lists + BONDS),
    //   system->n);
    //cudaThreadSynchronize ();
    //cudaCheckError ();
    mpi_not_gpu_update_bonds(system->my_atoms, *(*lists + BONDS), system->n);

    ////////////////////////
    // HBOND ISSUE
    //FIX - 4 - Added this check for hydrogen bond issue
    if ((control->hbond_cut > 0) && (system->numH > 0))
    {
        //ker_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
        //    (system->d_my_atoms, *(*lists + HBONDS),
        //     system->n);
        //cudaThreadSynchronize ();
        //cudaCheckError ();
        mpi_not_gpu_update_hbonds(system->my_atoms, *(*lists + HBONDS), system->n);

    }

    //validate sparse matrix entries.
    //memset (host_scratch, 0, 2 * system->N * sizeof (int));
    //index = (int *) host_scratch;
    //end_index = index + system->N;

    index = workspace->H.start;
    end_index = workspace->H.end;

    // immediately set these to host version since there is no device version.
    //memcpy(index, workspace->H.start, system->N * sizeof (int));
    //memcpy(end_index, workspace->H.end, system->N * sizeof (int));



    // don't need these, everything is already at host
    //copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int),
    //        cudaMemcpyDeviceToHost, "sparse_matrix:start" );
    //copy_host_device (end_index, dev_workspace->H.end, system->N * sizeof (int),
    //        cudaMemcpyDeviceToHost, "sparse_matrix:end" );
    max_sp_entries = total_sp_entries = 0;

    for (i = 0; i < n; i++ )
    {
        //if (i < N-1)
        //    comp = index [i+1];
        //else
        //    comp = dev_workspace->H.m;

        total_sp_entries += end_index [i] - index[i];
        if (end_index [i] - index[i] > system->max_sparse_entries)
        {
            fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
                     step, i, index[i], end_index[i] );
            return FAILURE;
        }
        else if (end_index[i] >= workspace->H.m)
        {
            //SUDHIR_FIX_SPARSE_MATRIX
            //TODO move this carver
            //TODO move this carver
            //TODO move this carver
            fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n",
                     system->my_rank, step, i, index[i], end_index[i]);
            //TODO move this carver
            //TODO move this carver
            //TODO move this carver
            return FAILURE;
        }
        else
        {
            if (max_sp_entries <= end_index[i] - index [i])
                max_sp_entries = end_index[i] - index [i];
        }
    }
    //if (max_sp_entries <= end_index[i] - index [i])
    //    max_sp_entries = end_index[i] - index [i];

    //update the current step max_sp_entries;
    realloc->Htop = max_sp_entries;
    fprintf (stderr, "p:%d - MPI-Not-GPU Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n",
             system->my_rank, workspace->H.n, workspace->H.m, total_sp_entries);

    if (total_sp_entries >= workspace->H.m)
    {
        fprintf (stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n",
                 system->my_rank, step, workspace->H.m, total_sp_entries);

        return FAILURE;
    }


    //validate Bond list
    if (N > 0)
    {
        num_bonds = 0;

        bonds = *lists + BONDS;
        //  memset (host_scratch, 0, 2 * bonds->n * sizeof (int));

        //  index = (int *) host_scratch;
        // end_index = index + bonds->n;
        index = bonds->index;
        end_index = bonds->end_index;


        //  memcpy(index, bonds->index, bonds->n * sizeof (int));
        // memcpy(end_index, bonds->end_index, bonds->n * sizeof (int));
        /*
                copy_host_device (index, bonds->index, bonds->n * sizeof (int),
                        cudaMemcpyDeviceToHost, "bonds:index");
                copy_host_device (end_index, bonds->end_index, bonds->n * sizeof (int),
                        cudaMemcpyDeviceToHost, "bonds:end_index");
        */
        /*
           for (i = 0; i < N; i++) {
           if (i < N-1)
           comp = index [i+1];
           else
           comp = bonds->num_intrs;

           if (end_index [i] > comp) {
           fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d str(i+1)=%d\n",
           step, i, index[i], end_index[i], comp );
           return FAILURE;
           }

           num_bonds += MAX( (end_index[i] - index[i]) * 4, MIN_BONDS);
           }

           if (end_index[N-1] >= bonds->num_intrs) {
           fprintf( stderr, "step%d-bondchk failed(end): i=N-1 start(i)=%d end(i)=%d num_intrs=%d\n",
           step, index[N-1], end_index[N-1], bonds->num_intrs);
           return FAILURE;
           }
           num_bonds = MAX( num_bonds, MIN_CAP*MIN_BONDS );
        //check the condition for reallocation
        realloc->num_bonds = num_bonds;
         */

        int max_bonds = 0;
        for (i = 0; i < N; i++)
        {
            if (end_index[i] - index[i] >= system->max_bonds)
            {
                fprintf( stderr, "MPI-Not-GPU step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n",
                         step, i, index[i], end_index[i], system->max_bonds);
                return FAILURE;
            }
            if (end_index[i] - index[i] >= max_bonds)
                max_bonds = end_index[i] - index[i];
        }
        realloc->num_bonds = max_bonds;

    }

    //validate Hbonds list
    num_hbonds = 0;
    // FIX - 4 - added additional check here
    if ((numH > 0) && (control->hbond_cut > 0))
    {
        hbonds = *lists + HBONDS;
        memset (host_scratch, 0, 2 * hbonds->n * sizeof (int) + sizeof (reax_atom) * system->N);
        index = (int *) host_scratch;
        end_index = index + hbonds->n;
        my_atoms = (reax_atom *)(end_index + hbonds->n);
        /*
                copy_host_device (index, hbonds->index, hbonds->n * sizeof (int),
                        cudaMemcpyDeviceToHost, "hbonds:index");
                copy_host_device (end_index, hbonds->end_index, hbonds->n * sizeof (int),
                        cudaMemcpyDeviceToHost, "hbonds:end_index");
                copy_host_device (my_atoms, system->d_my_atoms, system->N * sizeof (reax_atom),
                        cudaMemcpyDeviceToHost, "system:d_my_atoms");
        */
        //fprintf (stderr, " Total local atoms: %d \n", n);

        /*
           for (i = 0; i < N-1; i++) {
           Hindex = my_atoms [i].Hindex;
           if (Hindex > -1)
           comp = index [Hindex + 1];
           else
           comp = hbonds->num_intrs;

           if (end_index [Hindex] > comp) {
           fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d str(H+1)=%d\n",
           step, i, Hindex, index[Hindex], end_index[Hindex], comp );
           return FAILURE;
           }

           num_hbonds += MAX( (end_index [Hindex] - index [Hindex]) * 2, MIN_HBONDS * 2);
           }
           if (end_index [my_atoms[i].Hindex] > hbonds->num_intrs) {
           fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d num_intrs=%d\n",
           step, i, Hindex, index[Hindex], end_index[Hindex], hbonds->num_intrs);
           return FAILURE;
           }

           num_hbonds += MIN( (end_index [my_atoms[i].Hindex] - index [my_atoms[i].Hindex]) * 2,
           2 * MIN_HBONDS);
           num_hbonds = MAX( num_hbonds, MIN_CAP*MIN_HBONDS );
           realloc->num_hbonds = num_hbonds;
         */

        int max_hbonds = 0;
        for (i = 0; i < N; i++)
        {
            if (end_index[i] - index[i] >= system->max_hbonds)
            {
                fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d end(i)=%d max_hbonds=%d\n",
                         step, i, index[i], end_index[i], system->max_hbonds);
                return FAILURE;
            }
            if (end_index[i] - index[i] >= max_hbonds)
                max_hbonds = end_index[i] - index[i];
        }
        realloc->num_hbonds = max_hbonds;
    }

    return SUCCESS;
}

/*
void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
                     int step, int n, int N, int numH )
{
    int i, comp, Hindex;
    reax_list *bonds, *hbonds;
    reallocate_data *realloc;
    realloc = &(workspace->realloc);

    // bond list
    if ( N > 0 )
    {
        bonds = *lists + BONDS;

        for ( i = 0; i < N; ++i )
        {
            if ( i < n )
                system->my_atoms[i].num_bonds = MAX(Num_Entries(i, bonds) * 2, MIN_BONDS);

            //if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 )
            //workspace->realloc.bonds = 1;

            if ( i < N - 1 )
                comp = Start_Index(i + 1, bonds);
            else comp = bonds->num_intrs;

            if ( End_Index(i, bonds) > comp )
            {
                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
                         step, i, End_Index(i, bonds), comp );
                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
            }
        }
    }


    // hbonds list
    if ( numH > 0 )
    {
        hbonds = *lists + HBONDS;

        for ( i = 0; i < n; ++i )
        {
            Hindex = system->my_atoms[i].Hindex;
            if ( Hindex > -1 )
            {
                system->my_atoms[i].num_hbonds =
                    MAX( Num_Entries(Hindex, hbonds) * SAFER_ZONE, MIN_HBONDS );
*/
//if( Num_Entries(i, hbonds) >=
//(Start_Index(i+1,hbonds)-Start_Index(i,hbonds))*0.90/*DANGER_ZONE*/){
//  workspace->realloc.hbonds = 1;
/*
                //TODO
                if ( Hindex < system->n - 1 )
                    comp = Start_Index(Hindex + 1, hbonds);
                else comp = hbonds->num_intrs;

                if ( End_Index(Hindex, hbonds) > comp )
                {
                    fprintf(stderr, "step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
                            step, Hindex, End_Index(Hindex, hbonds), comp );
                    MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
                }
            }
        }
    }
}*/


void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
                     int step, int n, int N, int numH, MPI_Comm comm )
{
    int i, comp, Hindex;
    reax_list *bonds, *hbonds;
    reallocate_data *realloc;
    realloc = &(workspace->realloc);

    /* bond list */
    if ( N > 0 )
    {
        bonds = *lists + BONDS;

        for ( i = 0; i < N; ++i )
        {
            // if( i < n ) - we need to update ghost estimates for delayed nbrings
            system->my_atoms[i].num_bonds = MAX(Num_Entries(i, bonds) * 2, MIN_BONDS);

            //if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 )
            //workspace->realloc.bonds = 1;

            if ( i < N - 1 )
                comp = Start_Index(i + 1, bonds);
            else comp = bonds->num_intrs;

            if ( End_Index(i, bonds) > comp )
            {
                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
                         step, i, End_Index(i, bonds), comp );
                MPI_Abort( comm, INSUFFICIENT_MEMORY );
            }
        }
    }


    /* hbonds list */
    if ( numH > 0 )
    {
        hbonds = *lists + HBONDS;

        for ( i = 0; i < n; ++i )
        {
            Hindex = system->my_atoms[i].Hindex;
            if ( Hindex > -1 )
            {
                system->my_atoms[i].num_hbonds =
                    (int)(MAX( Num_Entries(Hindex, hbonds) * SAFER_ZONE, MIN_HBONDS ));

                //if( Num_Entries(i, hbonds) >=
                //(Start_Index(i+1,hbonds)-Start_Index(i,hbonds))*0.90/*DANGER_ZONE*/){
                //  workspace->realloc.hbonds = 1;

                if ( Hindex < numH - 1 )
                    comp = Start_Index(Hindex + 1, hbonds);
                else comp = hbonds->num_intrs;

                if ( End_Index(Hindex, hbonds) > comp )
                {
                    fprintf(stderr, "step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
                            step, Hindex, End_Index(Hindex, hbonds), comp );
                    MPI_Abort( comm, INSUFFICIENT_MEMORY );
                }
            }
            /*
                        if ( Hindex > -1 )
                        {
                            system->my_atoms[i].num_hbonds =
                                MAX( Num_Entries(Hindex, hbonds) * SAFER_ZONE, MIN_HBONDS );
            */
            //if( Num_Entries(i, hbonds) >=
            //(Start_Index(i+1,hbonds)-Start_Index(i,hbonds))*0.90/*DANGER_ZONE*/){
            //  workspace->realloc.hbonds = 1;
            /*
                            //TODO
                            if ( Hindex < system->n - 1 )
                                comp = Start_Index(Hindex + 1, hbonds);
                            else comp = hbonds->num_intrs;

                            if ( End_Index(Hindex, hbonds) > comp )
                            {
                                fprintf(stderr, "step%d-hbondchk failed: H=%d end(H)=%d str(H+1)=%d\n",
                                        step, Hindex, End_Index(Hindex, hbonds), comp );
                                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
                            }
                        }

            */



        }
    }
}




#if defined(OLD_VALIDATE)
void Validate_Lists( storage *workspace, reax_list **lists,
                     int step, int n, int N, int numH )
{
    int i, flag;
    reax_list *bonds, *hbonds;
    reallocate_data *realloc;
    realloc = &(workspace->realloc);

    /* bond list */
    if ( N > 0 )
    {
        flag = -1;
        bonds = *lists + BONDS;
        for ( i = 0; i < N - 1; ++i )
            if ( End_Index(i, bonds) >= Start_Index(i + 1, bonds) - 2 )
            {
                workspace->realloc.bonds = 1;
                if ( End_Index(i, bonds) > Start_Index(i + 1, bonds) )
                    flag = i;
            }

        if ( flag > -1 )
        {
            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
                     step, flag, End_Index(flag, bonds), Start_Index(flag + 1, bonds) );
            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
        }

        if ( End_Index(i, bonds) >= bonds->num_intrs - 2 )
        {
            workspace->realloc.bonds = 1;
            if ( End_Index(i, bonds) > bonds->num_intrs )
            {
                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
                         step, flag, End_Index(i, bonds), bonds->num_intrs );
                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
            }
        }
    }


    /* hbonds list */
    if ( numH > 0 )
    {
        flag = -1;
        hbonds = *lists + HBONDS;
        for ( i = 0; i < numH - 1; ++i )
            if ( Num_Entries(i, hbonds) >=
                    (Start_Index(i + 1, hbonds) - Start_Index(i, hbonds)) * 0.90/*DANGER_ZONE*/ )
            {
                workspace->realloc.hbonds = 1;
                if ( End_Index(i, hbonds) > Start_Index(i + 1, hbonds) )
                    flag = i;
            }

        if ( flag > -1 )
        {
            fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
                     step, flag, End_Index(flag, hbonds), Start_Index(flag + 1, hbonds) );
            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
        }

        if ( Num_Entries(i, hbonds) >=
                (hbonds->num_intrs - Start_Index(i, hbonds)) * 0.90/*DANGER_ZONE*/ )
        {
            workspace->realloc.hbonds = 1;
            if ( End_Index(i, hbonds) > hbonds->num_intrs )
            {
                fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
                         step, flag, End_Index(i, hbonds), hbonds->num_intrs );
                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
            }
        }
    }
}
#endif


inline real Compute_H( real r, real gamma, real *ctap )
{
    real taper, dr3gamij_1, dr3gamij_3;

    taper = ctap[7] * r + ctap[6];
    taper = taper * r + ctap[5];
    taper = taper * r + ctap[4];
    taper = taper * r + ctap[3];
    taper = taper * r + ctap[2];
    taper = taper * r + ctap[1];
    taper = taper * r + ctap[0];

    dr3gamij_1 = ( r * r * r + gamma );
    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
    return taper * EV_to_KCALpMOL / dr3gamij_3;
}


inline real Compute_tabH( real r_ij, int ti, int tj, int num_atom_types )
{
    int r, tmin, tmax;
    real val, dif, base;
    LR_lookup_table *t;

    tmin  = MIN( ti, tj );
    tmax  = MAX( ti, tj );
    //SUDHIR
    //t = &( LR[tmin][tmax] );
    t = &( LR[index_lr (tmin, tmax, num_atom_types)] );

    /* cubic spline interpolation */
    r = (int)(r_ij * t->inv_dx);
    if ( r == 0 )  ++r;
    base = (real)(r + 1) * t->dx;
    dif = r_ij - base;
    val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif +
          t->ele[r].a;
    val *= EV_to_KCALpMOL / C_ele;

    return val;
}


void Init_Forces( reax_system *system, control_params *control,
                  simulation_data *data, storage *workspace,
                  reax_list **lists, output_controls *out_control )
{
    int i, j, pj;
    int start_i, end_i;
    int type_i, type_j;
    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
    int ihb, jhb, ihb_top, jhb_top;
    int local, flag, renbr;
    real r_ij, cutoff;
    sparse_matrix *H;
    reax_list *far_nbrs, *bonds, *hbonds;
    single_body_parameters *sbp_i, *sbp_j;
    two_body_parameters *twbp;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;

    far_nbrs = *lists + FAR_NBRS;
    bonds = *lists + BONDS;
    hbonds = *lists + HBONDS;

    //Print_List(*lists + BONDS);


    for ( i = 0; i < system->n; ++i )
        workspace->bond_mark[i] = 0;
    for ( i = system->n; i < system->N; ++i )
    {
        workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
        //workspace->done_after[i] = Start_Index( i, far_nbrs );
    }

    H = &(workspace->H); //MATRIX CHANGES
    H->n = system->n;
    Htop = 0;
    num_bonds = 0;
    num_hbonds = 0;
    btop_i = btop_j = 0;
    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;

    for ( i = 0; i < system->N; ++i )
    {
        atom_i = &(system->my_atoms[i]);
        type_i  = atom_i->type;
        start_i = Start_Index(i, far_nbrs);
        end_i   = End_Index(i, far_nbrs);
        btop_i = End_Index( i, bonds );
        sbp_i = &(system->reax_param.sbp[type_i]);

        if ( i < system->n )
        {
            local = 1;
            cutoff = control->nonb_cut;
        }
        else
        {
            local = 0;
            cutoff = control->bond_cut;
        }

        ihb = -1;
        ihb_top = -1;
        if ( local )
        {
            H->start[i] = Htop;
            H->entries[Htop].j = i;
            H->entries[Htop].val = sbp_i->eta;
            ++Htop;

            if ( control->hbond_cut > 0 )
            {
                ihb = sbp_i->p_hbond;
                if ( ihb == 1 )
                    ihb_top = End_Index( atom_i->Hindex, hbonds );
                else ihb_top = -1;
            }
        }

        /* update i-j distance - check if j is within cutoff */
        for ( pj = start_i; pj < end_i; ++pj )
        {
            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
            j = nbr_pj->nbr;
            atom_j = &(system->my_atoms[j]);
            //fprintf( stderr, "%d%d i=%d x_i: %f %f %f,j=%d x_j: %f %f %f, d=%f\n",
            //     MIN(atom_i->orig_id, atom_j->orig_id),
            //     MAX(atom_i->orig_id, atom_j->orig_id),
            //     i, atom_i->x[0], atom_i->x[1], atom_i->x[2],
            //     j, atom_j->x[0], atom_j->x[1], atom_j->x[2], nbr_pj->d );
            if ( renbr )
            {
                if (nbr_pj->d <= cutoff)
                    flag = 1;
                else flag = 0;
            }
            else
            {
                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
                if ( nbr_pj->d <= SQR(cutoff) )
                {
                    nbr_pj->d = sqrt(nbr_pj->d);
                    flag = 1;
                }
                else
                {
                    flag = 0;
                }
            }

            if ( flag )
            {
                type_j = atom_j->type;
                r_ij = nbr_pj->d;
                sbp_j = &(system->reax_param.sbp[type_j]);
                //SUDHIR
                //twbp = &(system->reax_param.tbp[type_i][type_j]);
                twbp = &(system->reax_param.tbp[ index_tbp (type_i, type_j, system->reax_param.num_atom_types)]);

                if ( local )
                {
                    /* H matrix entry */
                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq||1
                    {
                        H->entries[Htop].j = j;
                        //fprintf( stdout, "%d%d %d %d\n",
                        //     MIN(atom_i->orig_id, atom_j->orig_id),
                        //     MAX(atom_i->orig_id, atom_j->orig_id),
                        //     MIN(atom_i->orig_id, atom_j->orig_id),
                        //     MAX(atom_i->orig_id, atom_j->orig_id) );
                        if ( control->tabulate == 0 )
                            H->entries[Htop].val = Compute_H(r_ij, twbp->gamma, workspace->Tap);
                        else
                            H->entries[Htop].val = Compute_tabH(r_ij, type_i, type_j, system->reax_param.num_atom_types);
                        ++Htop;
                    }

                    /* hydrogen bond lists */
                    if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
                            nbr_pj->d <= control->hbond_cut )
                    {
                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
                        jhb = sbp_j->p_hbond;
                        if ( ihb == 1 && jhb == 2 )
                        {
                            hbonds->select.hbond_list[ihb_top].nbr = j;
                            hbonds->select.hbond_list[ihb_top].scl = 1;
                            hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
                            ++ihb_top;
                            ++num_hbonds;
                        }
                        else if ( j < system->n && ihb == 2 && jhb == 1 )
                        {
                            jhb_top = End_Index( atom_j->Hindex, hbonds );
                            hbonds->select.hbond_list[jhb_top].nbr = i;
                            hbonds->select.hbond_list[jhb_top].scl = -1;
                            hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbonds );
                            ++num_hbonds;
                        }
                    }
                }

                /* uncorrected bond orders */
                if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
                    nbr_pj->d <= control->bond_cut &&
                    BOp( workspace, bonds, control->bo_cut,
                         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) )
                {
                    num_bonds += 2;
                    ++btop_i;

                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                    {
                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
                        //if( workspace->bond_mark[i] == 1000 )
                        //  workspace->done_after[i] = pj;
                    }
                    //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
                    //   i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
                }
            }
        }

//        H->end[i] = Htop;

        Set_End_Index( i, btop_i, bonds );
        if ( local )
        {
            //printf("Htop: %d \n", Htop);
            H->end[i] = Htop;
            if ( ihb == 1 )
                Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
        }
    }

    //fprintf( stderr, "after the first init loop\n" );
    /*for( i = system->n; i < system->N; ++i )
      if( workspace->bond_mark[i] > 3 ) {
        start_i = Start_Index(i, bonds);
        end_i = End_Index(i, bonds);
        num_bonds -= (end_i - start_i);
        Set_End_Index(i, start_i, bonds );
        }*/

    /*for( i = system->n; i < system->N; ++i ) {
      start_i = Start_Index(i, far_nbrs);
      end_i = workspace->done_after[i];

      if( workspace->bond_mark[i] >= 2 && start_i < end_i ) {
        atom_i = &(system->my_atoms[i]);
        type_i = atom_i->type;
        btop_i = End_Index( i, bonds );
        sbp_i = &(system->reax_param.sbp[type_i]);

        for( pj = start_i; pj < end_i; ++pj ) {
    nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
    j = nbr_pj->nbr;

    if( workspace->bond_mark[j] >= 2 && nbr_pj->d <= control->bond_cut ) {
      atom_j = &(system->my_atoms[j]);
      type_j = atom_j->type;
      sbp_j = &(system->reax_param.sbp[type_j]);
      twbp = &(system->reax_param.tbp[type_i][type_j]);

      if( BOp( workspace, bonds, control->bo_cut,
         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) ) {
        num_bonds += 2;
        ++btop_i;

        if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
          workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
        else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
          workspace->bond_mark[i] = workspace->bond_mark[j] + 1;

        //fprintf( stdout, "%d%d - %d(%d) %d(%d) new\n",
        // i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
      }
    }
        }
        Set_End_Index( i, btop_i, bonds );
      }
      }*/

    workspace->realloc.Htop = Htop;
    workspace->realloc.num_bonds = num_bonds;
    workspace->realloc.num_hbonds = num_hbonds;

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: Htop = %d num_bonds = %d num_hbonds = %d\n",
             system->my_rank, data->step, Htop, num_bonds, num_hbonds );
    MPI_Barrier( MPI_COMM_WORLD );
#endif
#if defined( DEBUG )
    // Print_Bonds( system, bonds, "debugbonds.out" );
    //  Print_Bond_List2( system, bonds, "pbonds.out" );
    // Print_Sparse_Matrix( system, H );
    /*    for ( i = 0; i < H->n; ++i )
            for ( j = H->start[i]; j < H->end[i]; ++j )
                fprintf( stderr, "%d %d %.15e\n",
                         MIN(system->my_atoms[i].orig_id,
                             system->my_atoms[H->entries[j].j].orig_id),
                         MAX(system->my_atoms[i].orig_id,
                             system->my_atoms[H->entries[j].j].orig_id),
                         H->entries[j].val );*/
#endif
    //Print_List(*lists + BONDS);


//reax_system *system, storage *workspace, reax_list **lists,
    //                   int step, int n, int N, int numH )

    /*
        Validate_Lists( system, workspace, lists, control,
                        data->step, system->n, system->N, system->numH );*/

    MPI_Not_GPU_Validate_Lists( system, workspace, lists, control,
                                data->step, system->n, system->N, system->numH );


}


void Init_Forces_noQEq( reax_system *system, control_params *control,
        simulation_data *data, storage *workspace, reax_list **lists,
        output_controls *out_control )
{
    int i, j, pj;
    int start_i, end_i;
    int type_i, type_j;
    int btop_i, btop_j, num_bonds, num_hbonds;
    int ihb, jhb, ihb_top, jhb_top;
    int local, flag, renbr;
    real r_ij, cutoff;
    reax_list *far_nbrs, *bonds, *hbonds;
    single_body_parameters *sbp_i, *sbp_j;
    two_body_parameters *twbp;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;

    far_nbrs = *lists + FAR_NBRS;
    bonds = *lists + BONDS;
    hbonds = *lists + HBONDS;

    for ( i = 0; i < system->n; ++i )
        workspace->bond_mark[i] = 0;
    for ( i = system->n; i < system->N; ++i )
    {
        workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance
        //workspace->done_after[i] = Start_Index( i, far_nbrs );
    }

    num_bonds = 0;
    num_hbonds = 0;
    btop_i = btop_j = 0;
    renbr = (data->step - data->prev_steps) % control->reneighbor == 0;

    for ( i = 0; i < system->N; ++i )
    {
        atom_i = &(system->my_atoms[i]);
        type_i  = atom_i->type;
        start_i = Start_Index(i, far_nbrs);
        end_i   = End_Index(i, far_nbrs);
        btop_i = End_Index( i, bonds );
        sbp_i = &(system->reax_param.sbp[type_i]);

        if ( i < system->n )
        {
            local = 1;
            cutoff = MAX( control->hbond_cut, control->bond_cut );
        }
        else
        {
            local = 0;
            cutoff = control->bond_cut;
        }

        ihb = -1;
        ihb_top = -1;
        if ( local && control->hbond_cut > 0 )
        {
            ihb = sbp_i->p_hbond;
            if ( ihb == 1 )
                ihb_top = End_Index( atom_i->Hindex, hbonds );
            else ihb_top = -1;
        }

        /* update i-j distance - check if j is within cutoff */
        for ( pj = start_i; pj < end_i; ++pj )
        {
            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
            j = nbr_pj->nbr;
            atom_j = &(system->my_atoms[j]);

            if ( renbr )
            {
                if ( nbr_pj->d <= cutoff )
                    flag = 1;
                else flag = 0;
            }
            else
            {
                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
                if ( nbr_pj->d <= SQR(cutoff) )
                {
                    nbr_pj->d = sqrt(nbr_pj->d);
                    flag = 1;
                }
                else
                {
                    flag = 0;
                }
            }

            if ( flag )
            {
                type_j = atom_j->type;
                r_ij = nbr_pj->d;
                sbp_j = &(system->reax_param.sbp[type_j]);
                //SUDHIR
                //twbp = &(system->reax_param.tbp[type_i][type_j]);
                twbp = &(system->reax_param.tbp[index_tbp(type_i, type_j, system->reax_param.num_atom_types)]);

                if ( local )
                {
                    /* hydrogen bond lists */
                    if ( control->hbond_cut > 0 && (ihb == 1 || ihb == 2) &&
                            nbr_pj->d <= control->hbond_cut )
                    {
                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
                        jhb = sbp_j->p_hbond;
                        if ( ihb == 1 && jhb == 2 )
                        {
                            hbonds->select.hbond_list[ihb_top].nbr = j;
                            hbonds->select.hbond_list[ihb_top].scl = 1;
                            hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
                            ++ihb_top;
                            ++num_hbonds;
                        }
                        else if ( j < system->n && ihb == 2 && jhb == 1 )
                        {
                            jhb_top = End_Index( atom_j->Hindex, hbonds );
                            hbonds->select.hbond_list[jhb_top].nbr = i;
                            hbonds->select.hbond_list[jhb_top].scl = -1;
                            hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
                            Set_End_Index( atom_j->Hindex, jhb_top + 1, hbonds );
                            ++num_hbonds;
                        }
                    }
                }


                /* uncorrected bond orders */
                if ( //(workspace->bond_mark[i] < 3 || workspace->bond_mark[j] < 3) &&
                    nbr_pj->d <= control->bond_cut &&
                    BOp( workspace, bonds, control->bo_cut,
                         i , btop_i, nbr_pj, sbp_i, sbp_j, twbp ) )
                {
                    num_bonds += 2;
                    ++btop_i;

                    if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
                        workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
                    else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
                    {
                        workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
                        //if( workspace->bond_mark[i] == 1000 )
                        //  workspace->done_after[i] = pj;
                    }
                    //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n",
                    //   i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] );
                }
            }
        }

        Set_End_Index( i, btop_i, bonds );
        if ( local && ihb == 1 )
        {
            Set_End_Index( atom_i->Hindex, ihb_top, hbonds );
        }
    }

    for ( i = system->n; i < system->N; ++i )
    {
        if ( workspace->bond_mark[i] > 3 )
        {
            start_i = Start_Index(i, bonds);
            end_i = End_Index(i, bonds);
            num_bonds -= (end_i - start_i);
            Set_End_Index(i, start_i, bonds );
        }
    }

    workspace->realloc.num_bonds = num_bonds;
    workspace->realloc.num_hbonds = num_hbonds;

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: num_bonds = %d num_hbonds = %d\n",
             system->my_rank, data->step, num_bonds, num_hbonds );
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#if defined( DEBUG )
    Print_Bonds( system, bonds, "debugbonds.out" );
    Print_Bond_List2( system, bonds, "pbonds.out" );
#endif

    MPI_Not_GPU_Validate_Lists( system, workspace, lists, control,
            data->step, system->n, system->N, system->numH );
}

void Host_Estimate_Sparse_Matrix(reax_atom *my_atoms, control_params *control,
                                  reax_list p_far_nbrs, int n, int N, int renbr, int *indices)
{
    int i, j, pj;
    int start_i, end_i;
    int flag;
    real cutoff;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;
    reax_list *far_nbrs = &( p_far_nbrs );

    //i = blockIdx.x * blockDim.x + threadIdx.x;
    //if (i >= N) return;
    for (i = 0; i < N; i++)
    {
        atom_i = &(my_atoms[i]);
        start_i = Start_Index(i, far_nbrs);
        end_i   = End_Index(i, far_nbrs);

        cutoff = control->nonb_cut;

        //++Htop;
        if ( i < n)
            indices [i] ++;

        /* update i-j distance - check if j is within cutoff */
        for ( pj = start_i; pj < end_i; ++pj )
        {
            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
            j = nbr_pj->nbr;
            atom_j = &(my_atoms[j]);
            if ( renbr )
            {
                if (nbr_pj->d <= cutoff)
                    flag = 1;
                else flag = 0;
            }
            else
            {
                if (i < j)
                {
                    nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
                    nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
                    nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
                }
                else
                {
                    nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
                    nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
                    nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
                }
                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
                //TODO
                //TODO
                //TODO
                //if( nbr_pj->d <= (cutoff) ) {
                if ( nbr_pj->d <= SQR(cutoff) )
                {
                    nbr_pj->d = sqrt(nbr_pj->d);
                    flag = 1;
                }
                else
                {
                    flag = 0;
                }
            }

            if ( flag )
            {
                /* H matrix entry */
                //if( j < n || atom_i->orig_id < atom_j->orig_id )
                //++Htop;
                //    indices [i] ++;
                //else if (j < n || atom_i->orig_id > atom_j->orig_id )
                //    indices [i] ++;

                //if ((i < n) || (j < n))
                //    indices [i] ++;
                //if ((i < n) && (i < j) && ((j < n) || atom_i->orig_id < atom_j->orig_id))
                //    indices [i] ++;
                //if ( i >= n && j < n && atom_i->orig_id > atom_j->orig_id)
                //    indices [i] ++;
                //else if ((i >=n) && (i > j) && ((j < n) || (atom_i->orig_id > atom_j->orig_id)))
                //    indices [i] ++;
                //THIS IS THE HOST CONDITION
                //if (i < n && i < j && ( j < n || atom_i->orig_id < atom_j->orig_id ))
                //if (i < n && i < j && atom_i->orig_id < atom_j->orig_id && j >=n)
                //    indices [i] ++;
                //THIS IS THE DEVICE CONDITION
                //if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
                //    indices [i] ++;

                //this is the working condition
                if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
                    indices [i]++;
                else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
                    indices [i] ++;
                else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
                    indices [i] ++;
            }
        }
    }
}


#ifdef HAVE_CUDA
void Estimate_Storages( reax_system *system, control_params *control,
                        reax_list **lists, int *Htop,
                        int *hb_top, int *bond_top, int *num_3body )
{
    int i, j, pj;
    int start_i, end_i;
    int type_i, type_j;
    int ihb, jhb;
    int local;
    real cutoff;
    real r_ij, r2;
    real C12, C34, C56;
    real BO, BO_s, BO_pi, BO_pi2;
    reax_list *far_nbrs;
    single_body_parameters *sbp_i, *sbp_j;
    two_body_parameters *twbp;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;

    far_nbrs = *lists + FAR_NBRS;
    *Htop = 0;
    //printf("Hcap: %d \n", system->Hcap);
    memset( hb_top, 0, sizeof(int) * system->Hcap );
    memset( bond_top, 0, sizeof(int) * system->total_cap );
    *num_3body = 0;

    for ( i = 0; i < system->N; ++i )
    {
        atom_i = &(system->my_atoms[i]);
        type_i  = atom_i->type;
        start_i = Start_Index(i, far_nbrs);
        end_i   = End_Index(i, far_nbrs);
        sbp_i = &(system->reax_param.sbp[type_i]);

        if ( i < system->n )
        {
            local = 1;
            cutoff = control->nonb_cut;
            ++(*Htop);
            ihb = sbp_i->p_hbond;
        }
        else
        {
            local = 0;
            cutoff = control->bond_cut;
            ihb = -1;
        }

        for ( pj = start_i; pj < end_i; ++pj )
        {
            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
            j = nbr_pj->nbr;
            atom_j = &(system->my_atoms[j]);

            if (nbr_pj->d <= cutoff)
            {
                type_j = system->my_atoms[j].type;
                r_ij = nbr_pj->d;
                sbp_j = &(system->reax_param.sbp[type_j]);
                //SUDHIR
                //twbp = &(system->reax_param.tbp[type_i][type_j]);
                twbp = &(system->reax_param.tbp[index_tbp (type_i, type_j, system->reax_param.num_atom_types)]);

                if ( local )
                {
                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
                        ++(*Htop);


                    if ( control->hbond_cut > 0.1 && (ihb == 1 || ihb == 2) &&
                            nbr_pj->d <= control->hbond_cut )
                    {
                        jhb = sbp_j->p_hbond;
                        if ( ihb == 1 && jhb == 2 )
                            ++hb_top[i];
                        else if ( j < system->n && ihb == 2 && jhb == 1 )
                        {
                            ++hb_top[j];

                        }
                    }
                }

                // uncorrected bond orders
                if ( nbr_pj->d <= control->bond_cut )
                {
                    r2 = SQR(r_ij);

                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
                    {
                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
                    }
                    else BO_s = C12 = 0.0;

                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
                    {
                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                        BO_pi = EXP( C34 );
                    }
                    else BO_pi = C34 = 0.0;

                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
                    {
                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
                        BO_pi2 = EXP( C56 );
                    }
                    else BO_pi2 = C56 = 0.0;

                    // Initially BO values are the uncorrected ones, page 1
                    BO = BO_s + BO_pi + BO_pi2;

                    if ( BO >= control->bo_cut )
                    {
                        ++bond_top[i];
                        ++bond_top[j];
                    }
                }
            }
        }
    }

    fprintf (stderr, "HOST SPARSE MATRIX ENTRIES: %d \n",  *Htop );
    *Htop = MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES );


    int hbond_count = 0;
    for ( i = 0; i < system->n; ++i )
    {
        hbond_count += hb_top[i];
        hb_top[i] = MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS );
    }
    fprintf (stderr, "HOST HBOND COUNT: %d \n", hbond_count);

    int bond_count = 0;
    for ( i = 0; i < system->N; ++i )
    {
        bond_count += bond_top[i];
        *num_3body += SQR(bond_top[i]);
        //if( i < system->n )
        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
        //else bond_top[i] = MAX_BONDS;
    }
    fprintf (stderr, "HOST BOND COUNT: %d \n", bond_count);

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ estimate storages: Htop = %d, num_3body = %d\n",
             system->my_rank, *Htop, *num_3body );
    MPI_Barrier( MPI_COMM_WORLD );
#endif
}


#else
void Estimate_Storages( reax_system *system, control_params *control,
                        reax_list **lists, int *Htop, int *hb_top,
                        int *bond_top, int *num_3body)
{

    int i, j, pj;
    int start_i, end_i;
    int type_i, type_j;
    int ihb, jhb;
    int local;
    real cutoff;
    real r_ij, r2;
    real C12, C34, C56;
    real BO, BO_s, BO_pi, BO_pi2;
    reax_list *far_nbrs;
    single_body_parameters *sbp_i, *sbp_j;
    two_body_parameters *twbp;
    far_neighbor_data *nbr_pj;
    reax_atom *atom_i, *atom_j;

    far_nbrs = *lists + FAR_NBRS;
    *Htop = 0;
    memset( hb_top, 0, sizeof(int) * system->local_cap );
    memset( bond_top, 0, sizeof(int) * system->total_cap );
    *num_3body = 0;

    for ( i = 0; i < system->N; ++i )
    {
        atom_i = &(system->my_atoms[i]);
        type_i  = atom_i->type;
        start_i = Start_Index(i, far_nbrs);
        end_i   = End_Index(i, far_nbrs);
        sbp_i = &(system->reax_param.sbp[type_i]);

        if ( i < system->n )
        {
            local = 1;
            cutoff = control->nonb_cut;
            ++(*Htop);
            ihb = sbp_i->p_hbond;
        }
        else
        {
            local = 0;
            cutoff = control->bond_cut;
            ihb = -1;
        }

        for ( pj = start_i; pj < end_i; ++pj )
        {
            //nbr_pj = &( far_nbrs->far_nbr_list[pj] );
            nbr_pj = &( far_nbrs->select.far_nbr_list[pj]);
            j = nbr_pj->nbr;
            atom_j = &(system->my_atoms[j]);

            if (nbr_pj->d <= cutoff)
            {
                type_j = system->my_atoms[j].type;
                r_ij = nbr_pj->d;
                sbp_j = &(system->reax_param.sbp[type_j]);
                //twbp = &(system->reax_param.tbp[type_i][type_j]);
                twbp = &(system->reax_param.tbp[index_tbp (type_i, type_j, system->reax_param.num_atom_types)]);

                if ( local )
                {
                    if ( j < system->n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
                        ++(*Htop);

                    /* hydrogen bond lists */
                    if ( control->hbond_cut > 0.1 && (ihb == 1 || ihb == 2) &&
                            nbr_pj->d <= control->hbond_cut )
                    {
                        jhb = sbp_j->p_hbond;
                        if ( ihb == 1 && jhb == 2 )
                            ++hb_top[i];
                        else if ( j < system->n && ihb == 2 && jhb == 1 )
                            ++hb_top[j];
                    }
                }

                /* uncorrected bond orders */
                if ( nbr_pj->d <= control->bond_cut )
                {
                    r2 = SQR(r_ij);

                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
                    {
                        C12 = twbp->p_bo1 * pow( r_ij / twbp->r_s, twbp->p_bo2 );
                        BO_s = (1.0 + control->bo_cut) * exp( C12 );
                    }
                    else BO_s = C12 = 0.0;

                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
                    {
                        C34 = twbp->p_bo3 * pow( r_ij / twbp->r_p, twbp->p_bo4 );
                        BO_pi = exp( C34 );
                    }
                    else BO_pi = C34 = 0.0;

                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
                    {
                        C56 = twbp->p_bo5 * pow( r_ij / twbp->r_pp, twbp->p_bo6 );
                        BO_pi2 = exp( C56 );
                    }
                    else BO_pi2 = C56 = 0.0;

                    /* Initially BO values are the uncorrected ones, page 1 */
                    BO = BO_s + BO_pi + BO_pi2;

                    if ( BO >= control->bo_cut )
                    {
                        ++bond_top[i];
                        ++bond_top[j];
                    }
                }
            }
        }
    }

    *Htop = (int)(MAX( *Htop * SAFE_ZONE, MIN_CAP * MIN_HENTRIES ));

    // Set max sparse entries, needed for first iteration of validate_list
    system->max_sparse_entries = *Htop * SAFE_ZONE;

    for ( i = 0; i < system->n; ++i )
        hb_top[i] = (int)(MAX( hb_top[i] * SAFER_ZONE, MIN_HBONDS ));

    for ( i = 0; i < system->N; ++i )
    {
        *num_3body += SQR(bond_top[i]);
        //if( i < system->n )
        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
        //else bond_top[i] = MAX_BONDS;
    }

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ estimate storages: Htop = %d, num_3body = %d\n",
             system->my_rank, *Htop, *num_3body );
    MPI_Barrier( MPI_COMM_WORLD );
#endif
}
#endif

void Compute_Forces( reax_system *system, control_params *control,
                     simulation_data *data, storage *workspace,
                     reax_list **lists, output_controls *out_control,
                     mpi_datatypes *mpi_data )
{
    int qeq_flag;
#if defined(LOG_PERFORMANCE)
    real t_start = 0;

    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        t_start = Get_Time( );
#endif

    /********* init forces ************/
    if ( control->qeq_freq && (data->step - data->prev_steps) % control->qeq_freq == 0 )
        qeq_flag = 1;
    else qeq_flag = 0;

    if ( qeq_flag )
        Init_Forces( system, control, data, workspace, lists, out_control );
    else
        Init_Forces_noQEq( system, control, data, workspace, lists, out_control );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.init_forces) );
#endif


    /********* bonded interactions ************/
    Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.bonded) );
#endif
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: completed bonded\n",
             system->my_rank, data->step );
    MPI_Barrier( MPI_COMM_WORLD );
#endif



    /**************** qeq ************************/
#if defined(PURE_REAX)
    if ( qeq_flag )
        QEq( system, control, data, workspace, out_control, mpi_data );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.qEq) );
#endif
#if defined(DEBUG_FOCUS)
    fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
    MPI_Barrier( MPI_COMM_WORLD );
#endif
#endif //PURE_REAX




    /********* nonbonded interactions ************/
    Compute_NonBonded_Forces( system, control, data, workspace,
                              lists, out_control, mpi_data );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.nonb) );
#endif
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
             system->my_rank, data->step );
    MPI_Barrier( MPI_COMM_WORLD );
#endif


    /*********** total force ***************/
    Compute_Total_Force( system, control, data, workspace, lists, mpi_data );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.bonded) );
#endif
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: total forces computed\n",
             system->my_rank, data->step );
    //Print_Total_Force( system, data, workspace );
    MPI_Barrier( MPI_COMM_WORLD );
#endif

#if defined(TEST_FORCES)
    Print_Force_Files( system, control, data, workspace,
                       lists, out_control, mpi_data );
#endif
}


#ifdef HAVE_CUDA
void Cuda_Compute_Forces( reax_system *system, control_params *control,
                          simulation_data *data, storage *workspace, reax_list **lists,
                          output_controls *out_control, mpi_datatypes *mpi_data )
{
    int qeq_flag, retVal = SUCCESS;

#if defined(LOG_PERFORMANCE)
    real t_start = 0;

    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
    {
        t_start = Get_Time( );
    }
#endif

    /********* init forces ************/
    if ( control->qeq_freq && (data->step - data->prev_steps) % control->qeq_freq == 0 )
    {
        qeq_flag = 1;
    }
    else
    {
        qeq_flag = 0;
    }

    if ( qeq_flag )
    {
        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
    }
    else
    {
        retVal = Cuda_Init_Forces_noQEq( system, control, data, workspace, lists, out_control );
    }

    if ( retVal == FAILURE )
    {
        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
    }
    //validate_sparse_matrix (system, workspace);

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
    {
        Update_Timing_Info( &t_start, &(data->timing.init_forces) );
    }
#endif


    /********* bonded interactions ************/
    retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
    if (retVal == FAILURE)
    {
        MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
    }

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
    {
        Update_Timing_Info( &t_start, &(data->timing.bonded) );
    }
#endif

#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: completed bonded\n",
             system->my_rank, data->step );
    MPI_Barrier( MPI_COMM_WORLD );
#endif

    /**************** qeq ************************/
#if defined(PURE_REAX)
    if ( qeq_flag )
    {
        Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
    }

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
    {
        Update_Timing_Info( &t_start, &(data->timing.qEq) );
    }
#endif

#if defined(DEBUG_FOCUS)
    fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
    MPI_Barrier( MPI_COMM_WORLD );
#endif
#endif //PURE_REAX


    /********* nonbonded interactions ************/
    Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
                                   lists, out_control, mpi_data );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.nonb) );
#endif
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
             system->my_rank, data->step );
    MPI_Barrier( MPI_COMM_WORLD );
#endif

    /*********** total force ***************/
    Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );

#if defined(LOG_PERFORMANCE)
    //MPI_Barrier( MPI_COMM_WORLD );
    if ( system->my_rank == MASTER_NODE )
        Update_Timing_Info( &t_start, &(data->timing.bonded) );
#endif
#if defined(DEBUG_FOCUS)
    fprintf( stderr, "p%d @ step%d: total forces computed\n",
             system->my_rank, data->step );
    //Print_Total_Force( system, data, workspace );
    MPI_Barrier( MPI_COMM_WORLD );
#endif

}
#endif


int validate_device( reax_system *system, simulation_data *data,
        storage *workspace, reax_list **lists )
{
    int retval = FAILURE;

#ifdef __CUDA_DEBUG__


    //retval |= validate_neighbors (system, lists);
    //retval |= validate_sym_dbond_indices (system, workspace, lists);
    //retval |= validate_hbonds (system, workspace, lists);
    //retval |= validate_workspace (system, workspace);
    //retval |= validate_bonds (system, workspace, lists);
    //retval |= validate_three_bodies (system, workspace, lists );
    retval |= validate_sparse_matrix (system, workspace);
    //retval |= validate_data (system, data);
    //retval |= validate_atoms (system, lists);
    //analyze_hbonds (system, workspace, lists);

    if (!retval)
    {
        fprintf (stderr, "Results *DOES NOT* mattch between device and host \n");
    }
#endif

    return retval;

}