-
Kurt A. O'Hearn authored
PG-PuReMD: improve memory management around temporary storage space on both the host and device (i.e., scratch space).
Kurt A. O'Hearn authoredPG-PuReMD: improve memory management around temporary storage space on both the host and device (i.e., scratch space).
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
cuda_forces.cu 58.67 KiB
#include "cuda_forces.h"
#include "cuda_bonds.h"
#include "cuda_bond_orders.h"
#include "cuda_charges.h"
#include "cuda_helpers.h"
#include "cuda_hydrogen_bonds.h"
#include "cuda_list.h"
#include "cuda_multi_body.h"
#include "cuda_neighbors.h"
#include "cuda_nonbonded.h"
#include "cuda_reduction.h"
#include "cuda_spar_lin_alg.h"
#include "cuda_torsion_angles.h"
#include "cuda_utils.h"
#include "cuda_valence_angles.h"
#include "../basic_comm.h"
#include "../forces.h"
#include "../index_utils.h"
#include "../tool_box.h"
#include "../vector.h"
typedef enum
{
DIAGONAL = 0,
OFF_DIAGONAL = 1,
} MATRIX_ENTRY_POSITION;
CUDA_DEVICE real Init_Charge_Matrix_Entry( single_body_parameters *sbp_i, real *workspace_Tap,
control_params *control, int i, int j, real r_ij, real gamma, MATRIX_ENTRY_POSITION pos )
{
real Tap, dr3gamij_1, dr3gamij_3, ret;
ret = 0.0;
switch ( control->charge_method )
{
case QEQ_CM:
case EE_CM:
case ACKS2_CM:
switch ( pos )
{
case OFF_DIAGONAL:
Tap = workspace_Tap[7] * r_ij + workspace_Tap[6];
Tap = Tap * r_ij + workspace_Tap[5];
Tap = Tap * r_ij + workspace_Tap[4];
Tap = Tap * r_ij + workspace_Tap[3];
Tap = Tap * r_ij + workspace_Tap[2];
Tap = Tap * r_ij + workspace_Tap[1];
Tap = Tap * r_ij + workspace_Tap[0];
/* shielding */
dr3gamij_1 = r_ij * r_ij * r_ij
+ POW( gamma, -3.0 );
dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
/* i == j: periodic self-interaction term
* i != j: general interaction term */
ret = ((i == j) ? 0.5 : 1.0) * Tap * EV_to_KCALpMOL / dr3gamij_3;
break;
case DIAGONAL:
ret = sbp_i->eta;
break;
default:
// fprintf( stderr, "[ERROR] Invalid matrix position. Terminating...\n" );
// exit( INVALID_INPUT );
break;
}
break;
default:
// fprintf( stderr, "[ERROR] Invalid charge method. Terminating...\n" );
// exit( INVALID_INPUT );
break;
}
return ret;
}
CUDA_DEVICE real Init_Charge_Matrix_Entry_Tab( LR_lookup_table *t_LR, real r_ij,
int ti, int tj, int num_atom_types )
{
int r, tmin, tmax;
real val, dif, base;
LR_lookup_table *t;
tmin = MIN( ti, tj );
tmax = MAX( ti, tj );
t = &t_LR[ index_lr(tmin,tmax, num_atom_types) ];
/* cubic spline interpolation */
r = (int)(r_ij * t->inv_dx);
if ( r == 0 )
{
++r;
}
base = (real)(r + 1) * t->dx;
dif = r_ij - base;
val = ((t->ele[r].d * dif + t->ele[r].c) * dif + t->ele[r].b) * dif
+ t->ele[r].a;
val *= EV_to_KCALpMOL / C_ELE;
return val;
}
CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
{
control->hbond_cut = 0.0;
}
CUDA_GLOBAL void k_init_end_index( int * intr_cnt, int *indices, int *end_indices, int N )
{
int i;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
end_indices[i] = indices[i] + intr_cnt[i];
}
CUDA_GLOBAL void k_setup_hindex( reax_atom *my_atoms, int N )
{
int i;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
my_atoms[i].Hindex = i;
}
CUDA_GLOBAL void k_init_hbond_indices( reax_atom * atoms, single_body_parameters *sbp,
int *hbonds, int *max_hbonds, int *indices, int *end_indices, int N )
{
int i, hindex, my_hbonds;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
hindex = atoms[i].Hindex;
if ( sbp[ atoms[i].type ].p_hbond == H_ATOM
|| sbp[ atoms[i].type ].p_hbond == H_BONDING_ATOM )
{
my_hbonds = hbonds[i];
indices[hindex] = max_hbonds[i];
end_indices[hindex] = indices[hindex] + hbonds[i];
}
else
{
my_hbonds = 0;
indices[hindex] = 0;
end_indices[hindex] = 0;
}
atoms[i].num_hbonds = my_hbonds;
}
CUDA_GLOBAL void k_print_hbond_info( reax_atom *my_atoms, single_body_parameters *sbp,
control_params *control, reax_list hbond_list, int N )
{
int i;
int type_i;
int ihb, ihb_top;
single_body_parameters *sbp_i;
reax_atom *atom_i;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
atom_i = &my_atoms[i];
type_i = atom_i->type;
sbp_i = &sbp[type_i];
if ( control->hbond_cut > 0.0 )
{
ihb = sbp_i->p_hbond;
if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
{
ihb_top = Start_Index( atom_i->Hindex, &hbond_list );
}
else
{
ihb_top = -1;
}
}
printf( "atom %6d: ihb = %2d, ihb_top = %2d\n", i, ihb, ihb_top );
}
/* Compute the distances and displacement vectors for entries
* in the far neighbors list if it's a NOT re-neighboring step */
CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, int N )
{
int i, j, pj;
int start_i, end_i;
reax_atom *atom_i, *atom_j;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
atom_i = &my_atoms[i];
start_i = Start_Index( i, &far_nbr_list );
end_i = End_Index( i, &far_nbr_list );
/* update distance and displacement vector between atoms i and j (i-j) */
for ( pj = start_i; pj < end_i; ++pj )
{
j = far_nbr_list.far_nbr_list.nbr[pj];
atom_j = &my_atoms[j];
if ( i < j )
{
far_nbr_list.far_nbr_list.dvec[pj][0] = atom_j->x[0] - atom_i->x[0];
far_nbr_list.far_nbr_list.dvec[pj][1] = atom_j->x[1] - atom_i->x[1];
far_nbr_list.far_nbr_list.dvec[pj][2] = atom_j->x[2] - atom_i->x[2];
}
else
{
far_nbr_list.far_nbr_list.dvec[pj][0] = atom_i->x[0] - atom_j->x[0];
far_nbr_list.far_nbr_list.dvec[pj][1] = atom_i->x[1] - atom_j->x[1];
far_nbr_list.far_nbr_list.dvec[pj][2] = atom_i->x[2] - atom_j->x[2];
}
far_nbr_list.far_nbr_list.d[pj] = rvec_Norm( far_nbr_list.far_nbr_list.dvec[pj] );
}
}
/* Compute the charge matrix entries and store the matrix in full format
* using the far neighbors list (stored in full format) and according to
* the full shell communication method */
CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters *sbp,
two_body_parameters *tbp, storage workspace, control_params *control,
reax_list far_nbr_list, LR_lookup_table *t_LR, int n, int N, int num_atom_types,
int *max_cm_entries, int *realloc_cm_entries )
{
int i, j, pj;
int start_i, end_i;
int type_i, type_j;
int cm_top;
int num_cm_entries;
real r_ij;
single_body_parameters *sbp_i;
two_body_parameters *twbp;
reax_atom *atom_i, *atom_j;
sparse_matrix *H;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
H = &workspace.H;
cm_top = H->start[i];
atom_i = &my_atoms[i];
type_i = atom_i->type;
start_i = Start_Index( i, &far_nbr_list );
end_i = End_Index( i, &far_nbr_list );
sbp_i = &sbp[type_i];
if ( i < n )
{
/* diagonal entry in the matrix */
H->j[cm_top] = i;
H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
i, i, 0.0, 0.0, DIAGONAL );
++cm_top;
/* update i-j distance - check if j is within cutoff */
for ( pj = start_i; pj < end_i; ++pj )
{
j = far_nbr_list.far_nbr_list.nbr[pj];
if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
{
atom_j = &my_atoms[j];
type_j = atom_j->type;
twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
r_ij = far_nbr_list.far_nbr_list.d[pj];
H->j[cm_top] = j;
if ( control->tabulate == 0 )
{
H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL );
}
else
{
H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij, type_i, type_j,num_atom_types );
}
++cm_top;
}
}
}
H->end[i] = cm_top;
num_cm_entries = cm_top - H->start[i];
/* reallocation checks */
if ( num_cm_entries > max_cm_entries[i] )
{
*realloc_cm_entries = TRUE;
}
}
CUDA_GLOBAL void k_init_bond( reax_atom *my_atoms, single_body_parameters *sbp,
two_body_parameters *tbp, storage workspace, control_params *control,
reax_list far_nbr_list, reax_list bond_list, reax_list hbond_list,
LR_lookup_table *t_LR, int n, int N, int num_atom_types,
int *max_bonds, int *realloc_bonds,
int *max_hbonds, int *realloc_hbonds )
{
int i, j, pj;
int start_i, end_i;
int type_i, type_j;
int btop_i, ihb, jhb, ihb_top;
int num_bonds, num_hbonds;
real cutoff;
single_body_parameters *sbp_i, *sbp_j;
two_body_parameters *twbp;
reax_atom *atom_i, *atom_j;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
atom_i = &my_atoms[i];
type_i = atom_i->type;
start_i = Start_Index( i, &far_nbr_list );
end_i = End_Index( i, &far_nbr_list );
btop_i = Start_Index( i, &bond_list );
sbp_i = &sbp[type_i];
ihb = NON_H_BONDING_ATOM;
ihb_top = -1;
if ( i < n )
{
cutoff = control->nonb_cut;
workspace.bond_mark[i] = 0;
}
else
{
cutoff = control->bond_cut;
/* put ghost atoms to an infinite distance (i.e., 1000) */
workspace.bond_mark[i] = 1000;
}
if ( control->hbond_cut > 0.0 )
{
ihb = sbp_i->p_hbond;
if ( ihb == H_ATOM || ihb == H_BONDING_ATOM )
{
ihb_top = Start_Index( atom_i->Hindex, &hbond_list );
}
else
{
ihb_top = -1;
}
}
/* update i-j distance - check if j is within cutoff */
for ( pj = start_i; pj < end_i; ++pj )
{
j = far_nbr_list.far_nbr_list.nbr[pj];
atom_j = &my_atoms[j];
if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
{
type_j = atom_j->type;
sbp_j = &sbp[type_j];
ihb = sbp_i->p_hbond;
jhb = sbp_j->p_hbond;
/* atom i: H bonding, ghost
* atom j: H atom, native */
if ( control->hbond_cut > 0.0 && i >= n && j < n
&& ihb == H_BONDING_ATOM && jhb == H_ATOM
&& far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
{
hbond_list.hbond_list[ihb_top].nbr = j;
hbond_list.hbond_list[ihb_top].scl = -1;
hbond_list.hbond_list[ihb_top].ptr = pj;
/* CUDA-specific */
hbond_list.hbond_list[ihb_top].sym_index = -1;
rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
++ihb_top;
}
}
if ( far_nbr_list.far_nbr_list.d[pj] <= cutoff )
{
type_j = atom_j->type;
sbp_j = &sbp[type_j];
twbp = &tbp[ index_tbp(type_i, type_j, num_atom_types) ];
if ( i < n )
{
/* hydrogen bond lists */
if ( control->hbond_cut > 0.0
&& (ihb == H_ATOM || ihb == H_BONDING_ATOM)
&& far_nbr_list.far_nbr_list.d[pj] <= control->hbond_cut )
{
jhb = sbp_j->p_hbond;
/* atom i: H atom, native
* atom j: H bonding atom */
if ( ihb == H_ATOM && jhb == H_BONDING_ATOM )
{
hbond_list.hbond_list[ihb_top].nbr = j;
if ( i < j )
{
hbond_list.hbond_list[ihb_top].scl = 1;
}
else
{
hbond_list.hbond_list[ihb_top].scl = -1;
}
hbond_list.hbond_list[ihb_top].ptr = pj;
/* CUDA-specific */
hbond_list.hbond_list[ihb_top].sym_index = -1;
rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
++ihb_top;
}
/* atom i: H bonding atom, native
* atom j: H atom, native */
else if ( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
{
//jhb_top = End_Index( atom_j->Hindex, &hbond_list );
hbond_list.hbond_list[ihb_top].nbr = j;
hbond_list.hbond_list[ihb_top].scl = -1;
hbond_list.hbond_list[ihb_top].ptr = pj;
/* CUDA-specific */
hbond_list.hbond_list[ihb_top].sym_index = -1;
rvec_MakeZero( hbond_list.hbond_list[ihb_top].hb_f );
++ihb_top;
}
}
}
/* uncorrected bond orders */
if ( far_nbr_list.far_nbr_list.d[pj] <= control->bond_cut
&& Cuda_BOp( bond_list, control->bo_cut,
i, btop_i, far_nbr_list.far_nbr_list.nbr[pj],
&far_nbr_list.far_nbr_list.rel_box[pj], far_nbr_list.far_nbr_list.d[pj],
&far_nbr_list.far_nbr_list.dvec[pj], far_nbr_list.format,
sbp_i, sbp_j, twbp, workspace.dDeltap_self,
workspace.total_bond_order ) == TRUE )
{
++btop_i;
/* TODO: Need to do later... since i and j are parallel */
// if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
// {
// workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
// }
// else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 )
// {
// workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
// }
}
}
}
Set_End_Index( i, btop_i, &bond_list );
if ( control->hbond_cut > 0.0 && ihb_top > 0
&& (ihb == H_ATOM || ihb == H_BONDING_ATOM) )
{
Set_End_Index( atom_i->Hindex, ihb_top, &hbond_list );
}
num_bonds = btop_i - Start_Index( i, &bond_list );
num_hbonds = ihb_top - Start_Index( atom_i->Hindex, &hbond_list );
/* copy (h)bond info to atom structure
* (needed for atom ownership transfer via MPI) */
my_atoms[i].num_bonds = num_bonds;
my_atoms[i].num_hbonds = num_hbonds;
/* reallocation checks */
if ( num_bonds > max_bonds[i] )
{
*realloc_bonds = TRUE;
}
if ( num_hbonds > max_hbonds[i] )
{
*realloc_hbonds = TRUE;
}
}
CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
single_body_parameters *sbp, two_body_parameters *tbp,
control_params *control, reax_list p_far_nbr_list,
int num_atom_types, int n, int N, int total_cap,
int *cm_entries, int *max_cm_entries,
int *bonds, int *max_bonds,
int *hbonds, int *max_hbonds )
{
int i, j, pj;
int start_i, end_i;
int type_i, type_j;
int ihb, jhb;
int local;
int num_bonds, num_hbonds, num_cm_entries;
real cutoff;
real r_ij;
real C12, C34, C56;
real BO, BO_s, BO_pi, BO_pi2;
single_body_parameters *sbp_i, *sbp_j;
two_body_parameters *twbp;
reax_atom *atom_i, *atom_j;
reax_list *far_nbr_list;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= total_cap )
{
return;
}
far_nbr_list = &p_far_nbr_list;
num_bonds = 0;
num_hbonds = 0;
num_cm_entries = 0;
if ( i < N )
{
atom_i = &my_atoms[i];
type_i = atom_i->type;
start_i = Start_Index( i, far_nbr_list );
end_i = End_Index( i, far_nbr_list );
sbp_i = &sbp[type_i];
if ( i < n )
{
local = TRUE;
cutoff = control->nonb_cut;
/* diagonal entry */
++num_cm_entries;
// ihb = sbp_i->p_hbond;
}
else
{
local = FALSE;
cutoff = control->bond_cut;
// ihb = NON_H_BONDING_ATOM;
}
ihb = NON_H_BONDING_ATOM;
for ( pj = start_i; pj < end_i; ++pj )
{
j = far_nbr_list->far_nbr_list.nbr[pj];
atom_j = &my_atoms[j];
if ( far_nbr_list->far_nbr_list.d[pj] <= control->nonb_cut )
{
type_j = my_atoms[j].type;
sbp_j = &sbp[type_j];
ihb = sbp_i->p_hbond;
jhb = sbp_j->p_hbond;
//TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
if ( local == TRUE )
{
++num_cm_entries;
}
/* atom i: H bonding, ghost
* atom j: H atom, native */
if ( control->hbond_cut > 0.0
&& far_nbr_list->far_nbr_list.d[pj] <= control->hbond_cut
&& ihb == H_BONDING_ATOM && jhb == H_ATOM && i >= n && j < n )
{
++num_hbonds;
}
// if ( i >= n )
// {
// ihb = NON_H_BONDING_ATOM;
// }
}
if ( far_nbr_list->far_nbr_list.d[pj] <= cutoff )
{
type_j = my_atoms[j].type;
r_ij = far_nbr_list->far_nbr_list.d[pj];
sbp_j = &sbp[type_j];
twbp = &tbp[ index_tbp(type_i ,type_j, num_atom_types) ];
if ( local == TRUE )
{
/* atom i: H atom OR H bonding atom, native */
if ( control->hbond_cut > 0.0
&& (ihb == H_ATOM || ihb == H_BONDING_ATOM)
&& far_nbr_list->far_nbr_list.d[pj] <= control->hbond_cut )
{
jhb = sbp_j->p_hbond;
/* atom i: H atom, native
* atom j: H bonding atom */
if( ihb == H_ATOM && jhb == H_BONDING_ATOM )
{
++num_hbonds;
}
/* atom i: H bonding atom, native
* atom j: H atom, native */
else if( ihb == H_BONDING_ATOM && jhb == H_ATOM && j < n )
{
++num_hbonds;
}
}
}
/* uncorrected bond orders */
if ( far_nbr_list->far_nbr_list.d[pj] <= control->bond_cut )
{
if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
{
C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
BO_s = (1.0 + control->bo_cut) * EXP( C12 );
}
else
{
C12 = 0.0;
BO_s = 0.0;
}
if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
{
C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
BO_pi = EXP( C34 );
}
else
{
C34 = 0.0;
BO_pi = 0.0;
}
if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
{
C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
BO_pi2= EXP( C56 );
}
else
{
C56 = 0.0;
BO_pi2 = 0.0;
}
/* initially BO values are the uncorrected ones, page 1 */
BO = BO_s + BO_pi + BO_pi2;
if ( BO >= control->bo_cut )
{
++num_bonds;
}
}
}
}
}
bonds[i] = num_bonds;
max_bonds[i] = MAX( (int)(num_bonds * 2), MIN_BONDS );
hbonds[i] = num_hbonds;
max_hbonds[i] = MAX( (int)(num_hbonds * SAFE_ZONE), MIN_HBONDS );
cm_entries[i] = num_cm_entries;
max_cm_entries[i] = MAX( (int)(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
}
CUDA_GLOBAL void k_init_bond_mark( int offset, int n, int *bond_mark )
{
int i;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= n )
{
return;
}
bond_mark[offset + threadIdx.x] = 1000;
}
CUDA_GLOBAL void k_update_sym_dbond_indices( reax_list bond_list, int N )
{
int i, pj, pk, nbr_ij, nbr_jk;
bond_data *ibond, *jbond;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= N )
{
return;
}
/* i-j bonds */
for ( pj = Start_Index(i, &bond_list); pj < End_Index(i, &bond_list); ++pj )
{
ibond = &bond_list.bond_list[pj];
nbr_ij = ibond->nbr;
/* j-k bonds */
for ( pk = Start_Index(nbr_ij, &bond_list); pk < End_Index(nbr_ij, &bond_list); ++pk )
{
jbond = &bond_list.bond_list[pk];
nbr_jk = jbond->nbr;
if ( i == nbr_jk && i > nbr_ij )
{
ibond->dbond_index = pj;
jbond->dbond_index = pj;
ibond->sym_index = pk;
jbond->sym_index = pj;
}
}
}
}
CUDA_GLOBAL void k_update_sym_hbond_indices( reax_atom *my_atoms, reax_list hbond_list, int N )
{
int i, j, k;
int nbr, nbrstart, nbrend;
int start, end;
hbond_data *ihbond, *jhbond;
int __THREADS_PER_ATOM__;
int thread_id;
int warp_id;
int lane_id;
__THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
thread_id = blockIdx.x * blockDim.x + threadIdx.x;
warp_id = thread_id / __THREADS_PER_ATOM__;
if ( warp_id > N )
{
return;
}
lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
i = warp_id;
start = Start_Index( my_atoms[i].Hindex, &hbond_list );
end = End_Index( my_atoms[i].Hindex, &hbond_list );
j = start + lane_id;
while ( j < end )
{
ihbond = &hbond_list.hbond_list[j];
nbr = ihbond->nbr;
nbrstart = Start_Index( my_atoms[nbr].Hindex, &hbond_list );
nbrend = End_Index( my_atoms[nbr].Hindex, &hbond_list );
for ( k = nbrstart; k < nbrend; k++ )
{
jhbond = &hbond_list.hbond_list[k];
if ( jhbond->nbr == i )
{
ihbond->sym_index = k;
jhbond->sym_index = j;
break;
}
}
j += __THREADS_PER_ATOM__;
}
}
#if defined(DEBUG_FOCUS)
CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
{
int i;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= n )
{
return;
}
printf( "%8d: %24.15f, %24.15f, %24.15f\n",
my_atoms[i].orig_id,
f[i][0],
f[i][1],
f[i][2] );
}
CUDA_GLOBAL void k_print_hbonds( reax_atom *my_atoms, reax_list hbond_list, int n, int rank, int step )
{
int i, k, pj, start, end;
hbond_data *hbond_jk;
i = blockIdx.x * blockDim.x + threadIdx.x;
if ( i >= n )
{
return;
}
start = Start_Index( my_atoms[i].Hindex, &hbond_list );
end = End_Index( my_atoms[i].Hindex, &hbond_list );
for ( pj = start; pj < end; ++pj )
{
k = hbond_list.hbond_list[pj].nbr;
hbond_jk = &hbond_list.hbond_list[pj];
printf( "p%03d, step %05d: %8d: %8d, %24.15f, %24.15f, %24.15f\n",
rank, step, my_atoms[i].Hindex, k,
hbond_jk->hb_f[0],
hbond_jk->hb_f[1],
hbond_jk->hb_f[2] );
}
}
#endif
CUDA_GLOBAL void k_bond_mark( reax_list p_bond_list, storage p_workspace, int N )
{
int i, j, k;
reax_list *bond_list;
storage *workspace;
// i = blockIdx.x * blockDim.x + threadIdx.x;
// if ( i >= N )
// {
// return;
// }
bond_list = &p_bond_list;
workspace = &p_workspace;
for ( i = 0; i < N; i++ )
{
for ( k = Start_Index( i, bond_list ); k < End_Index( i, bond_list ); k++ )
{
j = bond_list->bond_list[k].nbr;
if ( i < j )
{
if ( workspace->bond_mark[j] > (workspace->bond_mark[i] + 1) )
{
workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
}
else if ( workspace->bond_mark[i] > (workspace->bond_mark[j] + 1) )
{
workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
}
}
}
}
}
static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *control,
storage *workspace, int step, reax_list **lists, int *thbody )
{
int ret;
ret = SUCCESS;
cuda_memset( thbody, 0, system->total_bonds * sizeof(int),
"Cuda_Estimate_Storage_Three_Body::thbody" );
Estimate_Cuda_Valence_Angles <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, (control_params *)control->d_control_params,
*(lists[BONDS]), system->n, system->N, thbody );
cudaDeviceSynchronize( );
cudaCheckError( );
Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );
copy_host_device( &system->total_thbodies, system->d_total_thbodies, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" );
if ( step == 0 )
{
system->total_thbodies = MAX( (int)(system->total_thbodies * SAFE_ZONE), MIN_3BODIES );
system->total_thbodies_indices = system->total_bonds;
/* create Three-body list */
Cuda_Make_List( system->total_thbodies_indices, system->total_thbodies,
TYP_THREE_BODY, lists[THREE_BODIES] );
}
if ( system->total_thbodies > lists[THREE_BODIES]->max_intrs ||
system->total_bonds > lists[THREE_BODIES]->n )
{
if ( system->total_thbodies > lists[THREE_BODIES]->max_intrs )
{
system->total_thbodies = MAX( (int)(lists[THREE_BODIES]->max_intrs * SAFE_ZONE),
system->total_thbodies );
}
if ( system->total_bonds > lists[THREE_BODIES]->n )
{
system->total_thbodies_indices = MAX( (int)(lists[THREE_BODIES]->n * SAFE_ZONE),
system->total_bonds );
}
workspace->d_workspace->realloc.thbody = TRUE;
ret = FAILURE;
}
return ret;
}
#if defined(DEBUG_FOCUS)
static void Print_Forces( reax_system *system )
{
int blocks;
blocks = (system->n) / DEF_BLOCK_SIZE +
(((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
k_print_forces <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, workspace->d_workspace->f, system->n );
cudaDeviceSynchronize( );
cudaCheckError( );
}
static void Print_HBonds( reax_system *system, int step )
{
int blocks;
blocks = (system->n) / DEF_BLOCK_SIZE +
(((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
k_print_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, *(lists[HBONDS]), system->n, system->my_rank, step );
cudaDeviceSynchronize( );
cudaCheckError( );
}
#endif
/* Initialize indices for far neighbors list post reallocation
*
* system: atomic system info. */
void Cuda_Init_Neighbor_Indices( reax_system *system, reax_list **lists )
{
int blocks;
reax_list *far_nbr_list = lists[FAR_NBRS];
/* init indices */
Cuda_Scan_Excl_Sum( system->d_max_far_nbrs, far_nbr_list->index, system->total_cap );
/* init end_indices */
blocks = system->total_cap / DEF_BLOCK_SIZE +
((system->total_cap % DEF_BLOCK_SIZE == 0) ? 0 : 1);
k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_far_nbrs, far_nbr_list->index, far_nbr_list->end_index, system->total_cap );
cudaDeviceSynchronize( );
cudaCheckError( );
}
/* Initialize indices for far hydrogen bonds list post reallocation
*
* system: atomic system info. */
void Cuda_Init_HBond_Indices( reax_system *system, storage *workspace,
reax_list **lists )
{
int blocks, *temp;
reax_list *hbond_list;
blocks = system->N / DEF_BLOCK_SIZE +
((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
hbond_list = lists[HBONDS];
cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
sizeof(int) * system->total_cap,
"Cuda_Init_HBond_Indices::workspace->scratch" );
temp = (int *) workspace->scratch;
/* init Hindices */
k_setup_hindex <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
/* init indices and end_indices */
Cuda_Scan_Excl_Sum( system->d_max_hbonds, temp, system->total_cap );
k_init_hbond_indices <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, system->reax_param.d_sbp, system->d_hbonds, temp,
hbond_list->index, hbond_list->end_index, system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
}
/* Initialize indices for far bonds list post reallocation
*
* system: atomic system info. */
void Cuda_Init_Bond_Indices( reax_system *system, reax_list **lists )
{
int blocks;
reax_list *bond_list;
bond_list = lists[BONDS];
/* init indices */
Cuda_Scan_Excl_Sum( system->d_max_bonds, bond_list->index, system->total_cap );
/* init end_indices */
blocks = system->N / DEF_BLOCK_SIZE +
((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_bonds, bond_list->index, bond_list->end_index, system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
}
/* Initialize indices for charge matrix post reallocation
*
* system: atomic system info.
* H: charge matrix */
void Cuda_Init_Sparse_Matrix_Indices( reax_system *system, sparse_matrix *H )
{
int blocks;
/* init indices */
Cuda_Scan_Excl_Sum( system->d_max_cm_entries, H->start, system->total_cap );
/* init end_indices */
blocks = system->total_cap / DEF_BLOCK_SIZE
+ ((system->total_cap % DEF_BLOCK_SIZE == 0) ? 0 : 1);
k_init_end_index <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_cm_entries, H->start, H->end, system->total_cap );
cudaDeviceSynchronize( );
cudaCheckError( );
}
/* Initialize indices for three body list post reallocation
*
* indices: list indices
* entries: num. of entries in list */
void Cuda_Init_Three_Body_Indices( int *indices, int entries, reax_list **lists )
{
reax_list *thbody;
thbody = lists[THREE_BODIES];
Cuda_Scan_Excl_Sum( indices, thbody->index, entries );
}
void Cuda_Estimate_Storages( reax_system *system, control_params *control,
reax_list **lists, int realloc_bonds, int realloc_hbonds, int realloc_cm,
int step )
{
int blocks;
blocks = system->total_cap / ST_BLOCK_SIZE +
(((system->total_cap % ST_BLOCK_SIZE == 0)) ? 0 : 1);
k_estimate_storages <<< blocks, ST_BLOCK_SIZE >>>
( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
(control_params *)control->d_control_params,
*(lists[FAR_NBRS]), system->reax_param.num_atom_types,
system->n, system->N, system->total_cap,
system->d_cm_entries, system->d_max_cm_entries,
system->d_bonds, system->d_max_bonds,
system->d_hbonds, system->d_max_hbonds );
cudaDeviceSynchronize( );
cudaCheckError( );
if ( realloc_bonds == TRUE )
{
Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds,
system->total_cap );
copy_host_device( &system->total_bonds, system->d_total_bonds, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" );
}
if ( system->numH > 0 && control->hbond_cut > 0.0 )
{
if ( realloc_hbonds == TRUE )
{
Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds,
system->total_cap );
copy_host_device( &system->total_hbonds, system->d_total_hbonds, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
}
}
else
{
if ( step == 0 )
{
#if defined(DEBUG_FOCUS)
if ( system->numH == 0 )
{
fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: NO HYDROGEN ATOMS FOUND\n" );
}
#endif
#if defined(DEBUG_FOCUS)
if ( control->hbond_cut <= 0.0 )
{
fprintf( stderr, "[INFO] DISABLING HYDROGEN BOND COMPUTATION: BOND CUTOFF LENGTH IS ZERO\n" );
}
#endif
control->hbond_cut = 0.0;
k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params );
}
}
if ( realloc_cm == TRUE )
{
Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, system->total_cap );
copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
}
}
int Cuda_Init_Forces( reax_system *system, control_params *control,
simulation_data *data, storage *workspace,
reax_list **lists, output_controls *out_control )
{
int ret, ret_bonds, ret_hbonds, ret_cm;
int blocks, hblocks;
/* init the workspace (bond_mark) */
// cuda_memset( workspace->d_workspace->bond_mark, 0, sizeof(int) * system->n, "bond_mark" );
//
// blocks = (system->N - system->n) / DEF_BLOCK_SIZE +
// (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
// k_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
// ( system->n, (system->N - system->n), workspace->d_workspace->bond_mark );
// cudaDeviceSynchronize( );
// cudaCheckError( );
blocks = (system->N) / DEF_BLOCK_SIZE +
(((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
// k_print_hbond_info <<< blocks, DEF_BLOCK_SIZE >>>
// ( system->d_my_atoms, system->reax_param.d_sbp,
// (control_params *)control->d_control_params,
// *(lists[HBONDS]), system->N );
// cudaDeviceSynchronize( );
// cudaCheckError( );
/* reset reallocation flags on device */
cuda_memset( system->d_realloc_bonds, FALSE, sizeof(int),
"Cuda_Init_Forces::d_realloc_bonds" );
cuda_memset( system->d_realloc_hbonds, FALSE, sizeof(int),
"Cuda_Init_Forces::d_realloc_hbonds" );
cuda_memset( system->d_realloc_cm_entries, FALSE, sizeof(int),
"Cuda_Init_Forces::d_realloc_cm_entries" );
if ( (data->step - data->prev_steps) % control->reneighbor != 0 )
{
k_init_distance <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, *(lists[FAR_NBRS]), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
}
// if ( workspace->H.format == SYM_HALF_MATRIX )
// {
// k_init_cm_half_fs <<< blocks, DEF_BLOCK_SIZE >>>
// ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
// *(workspace->d_workspace), (control_params *)control->d_control_params,
// *(lists[FAR_NBRS]), workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
// system->d_max_cm_entries, system->d_realloc_cm_entries );
// cudaDeviceSynchronize( );
// cudaCheckError( );
// }
// else
// {
k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
*(workspace->d_workspace), (control_params *)control->d_control_params,
*(lists[FAR_NBRS]), workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
system->d_max_cm_entries, system->d_realloc_cm_entries );
cudaDeviceSynchronize( );
cudaCheckError( );
// }
k_init_bond <<< blocks, DEF_BLOCK_SIZE >>>
( system->d_my_atoms, system->reax_param.d_sbp,
system->reax_param.d_tbp, *(workspace->d_workspace), (control_params *)control->d_control_params,
*(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
workspace->d_LR, system->n, system->N, system->reax_param.num_atom_types,
system->d_max_bonds, system->d_realloc_bonds,
system->d_max_hbonds, system->d_realloc_hbonds );
cudaDeviceSynchronize( );
cudaCheckError( );
/* check reallocation flags on device */
copy_host_device( &ret_bonds, system->d_realloc_bonds, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
copy_host_device( &ret_hbonds, system->d_realloc_hbonds, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
copy_host_device( &ret_cm, system->d_realloc_cm_entries, sizeof(int),
cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );
ret = (ret_bonds == FALSE && ret_hbonds == FALSE && ret_cm == FALSE) ? SUCCESS : FAILURE;
#if defined(DEBUG_FOCUS)
fprintf( stderr, "[INFO] p%d, step %d: ret = %d, ret_bonds = %d, ret_hbonds = %d, ret_cm = %d\n",
system->my_rank, data->step, ret, ret_bonds, ret_hbonds, ret_cm );
#endif
if ( ret == SUCCESS )
{
k_update_sym_dbond_indices <<< blocks, control->block_size >>>
( *(lists[BONDS]), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
if ( control->hbond_cut > 0.0 && system->numH > 0 )
{
/* make hbond_list symmetric */
hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM / HB_SYM_BLOCK_SIZE) +
((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1);
k_update_sym_hbond_indices <<< hblocks, HB_BLOCK_SIZE >>>
( system->d_my_atoms, *(lists[HBONDS]), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
}
/* update bond_mark */
// k_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
// k_bond_mark <<< 1, 1 >>>
// ( *(lists[BONDS]), *(workspace->d_workspace), system->N );
// cudaDeviceSynchronize( );
// cudaCheckError( );
}
else
{
Cuda_Estimate_Storages( system, control, lists,
ret_bonds, ret_hbonds, ret_cm, data->step );
workspace->d_workspace->realloc.bonds = ret_bonds;
workspace->d_workspace->realloc.hbonds = ret_hbonds;
workspace->d_workspace->realloc.cm = ret_cm;
}
return ret;
}
int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
simulation_data *data, storage *workspace,
reax_list **lists, output_controls *out_control )
{
//TODO: implement later when figure out bond_mark usage
return FAILURE;
}
int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
simulation_data *data, storage *workspace,
reax_list **lists, output_controls *out_control )
{
int update_energy, ret;
// int hbs, hnbrs_blocks;
int *thbody;
static int compute_bonded_part1 = FALSE;
real *spad;
rvec *rvec_spad;
#if defined(DEBUG_FOCUS)
real t_start, t_elapsed;
#endif
cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
MAX( sizeof(real) * 2 * system->N,
MAX( sizeof(real) * 3 * system->n,
MAX( (sizeof(real) * 6 + sizeof(rvec) * 2) * system->N,
MAX( (sizeof(real) * 4 + sizeof(rvec) * 2) * system->n,
(sizeof(real) + sizeof(rvec)) * 2 * system->n )))),
"Cuda_Compute_Bonded_Forces::workspace->scratch" );
spad = (real *) workspace->scratch;
update_energy = (out_control->energy_update_freq > 0
&& data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
ret = SUCCESS;
if ( compute_bonded_part1 == FALSE )
{
/* 1. Bond Order Interactions */
#if defined(DEBUG_FOCUS)
t_start = Get_Time( );
fprintf( stderr, " Begin Bonded Forces ... %d x %d\n",
control->blocks_n, control->block_size );
#endif
Cuda_BO_Part1 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, system->reax_param.d_sbp,
*(workspace->d_workspace), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
Cuda_BO_Part2 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp,
system->reax_param.d_tbp, *(workspace->d_workspace),
*(lists[BONDS]),
system->reax_param.num_atom_types, system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
Cuda_BO_Part3 <<< control->blocks_n, control->block_size >>>
( *(workspace->d_workspace), *(lists[BONDS]), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
Cuda_BO_Part4 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp,
*(workspace->d_workspace), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
#if defined(DEBUG_FOCUS)
t_elapsed = Get_Timing_Info( t_start );
fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n",
cudaGetLastError( ), t_elapsed );
fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
#endif
/* 2. Bond Energy Interactions */
#if defined(DEBUG_FOCUS)
t_start = Get_Time( );
#endif
cuda_memset( spad, 0, sizeof(real) * 2 * system->N,
"Compute_Bonded_Forces::spad" );
Cuda_Bonds <<< control->blocks, control->block_size, sizeof(real) * control->block_size >>>
( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
*(workspace->d_workspace), *(lists[BONDS]),
system->n, system->reax_param.num_atom_types, spad );
cudaDeviceSynchronize( );
cudaCheckError( );
/* reduction for E_BE */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_bond,
system->n );
}
#if defined(DEBUG_FOCUS)
t_elapsed = Get_Timing_Info( t_start );
fprintf( stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n",
cudaGetLastError( ), t_elapsed );
fprintf( stderr, "Cuda_Bond_Energy Done... \n" );
#endif
/* 3. Atom Energy Interactions */
#if defined(DEBUG_FOCUS)
t_start = Get_Time( );
#endif
cuda_memset( spad, 0, sizeof(real) * 3 * system->n,
"Compute_Bonded_Forces::spad" );
Cuda_Atom_Energy_Part1 <<< control->blocks, control->block_size >>>
( system->d_my_atoms, system->reax_param.d_gp,
system->reax_param.d_sbp, system->reax_param.d_tbp, *(workspace->d_workspace),
*(lists[BONDS]), system->n, system->reax_param.num_atom_types,
spad, &spad[system->n], &spad[2 * system->n] );
cudaDeviceSynchronize( );
cudaCheckError( );
Cuda_Atom_Energy_Part2 <<< control->blocks, control->block_size >>>
( *(lists[BONDS]), *(workspace->d_workspace), system->n );
cudaDeviceSynchronize( );
cudaCheckError( );
/* reduction for E_Lp */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_lp,
system->n );
}
/* reduction for E_Ov */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( &spad[system->n],
&((simulation_data *)data->d_simulation_data)->my_en.e_ov,
system->n );
}
/* reduction for E_Un */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( &spad[2 * system->n],
&((simulation_data *)data->d_simulation_data)->my_en.e_un,
system->n );
}
#if defined(DEBUG_FOCUS)
t_elapsed = Get_Timing_Info( t_start );
fprintf( stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n",
cudaGetLastError( ), t_elapsed );
fprintf( stderr, "test_LonePair_postprocess Done... \n");
#endif
compute_bonded_part1 = TRUE;
}
/* 4. Valence Angles Interactions */
#if defined(DEBUG_FOCUS)
t_start = Get_Time( );
#endif
cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
sizeof(int) * system->total_bonds,
"Cuda_Compute_Bonded_Forces::workspace->scratch" );
thbody = (int *) workspace->scratch;
ret = Cuda_Estimate_Storage_Three_Body( system, control, workspace,
data->step, lists, thbody );
#if defined(DEBUG_FOCUS)
fprintf( stderr, "system->total_thbodies = %d, lists:THREE_BODIES->max_intrs = %d,\n",
system->total_thbodies, lists[THREE_BODIES]->max_intrs );
fprintf( stderr, "lists:THREE_BODIES->n = %d, lists:BONDS->max_intrs = %d,\n",
lists[THREE_BODIES]->n, lists[BONDS]->max_intrs );
fprintf( stderr, "system->total_thbodies = %d\n", system->total_thbodies );
#endif
if ( ret == SUCCESS )
{
Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices, lists );
cuda_memset( spad, 0, (sizeof(real) * 6 + sizeof(rvec) * 2) * system->N,
"Cuda_Compute_Bonded_Forces::spad" );
Cuda_Valence_Angles_Part1 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, system->reax_param.d_gp,
system->reax_param.d_sbp, system->reax_param.d_thbp,
(control_params *)control->d_control_params,
*(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]),
system->n, system->N, system->reax_param.num_atom_types,
spad, &spad[2 * system->N], &spad[4 * system->N], (rvec *)(&spad[6 * system->N]) );
cudaDeviceSynchronize( );
cudaCheckError( );
/* reduction for E_Ang */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
system->N );
}
/* reduction for E_Pen */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( &spad[2 * system->N],
&((simulation_data *)data->d_simulation_data)->my_en.e_pen,
system->N );
}
/* reduction for E_Coa */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( &spad[4 * system->N],
&((simulation_data *)data->d_simulation_data)->my_en.e_coa,
system->N );
}
/* reduction for ext_pres */
rvec_spad = (rvec *) (&spad[6 * system->N]);
k_reduction_rvec <<< control->blocks_n, control->block_size, sizeof(rvec) * control->block_size >>>
( rvec_spad, rvec_spad + system->N, system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
k_reduction_rvec <<< 1, control->blocks_pow_2_n, sizeof(rvec) * control->blocks_pow_2_n >>>
( rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks_n );
cudaDeviceSynchronize ();
cudaCheckError( );
// Cuda_Reduction_Sum( rvec_spad,
// &((simulation_data *)data->d_simulation_data)->my_ext_press,
// system->N );
Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, (control_params *)control->d_control_params,
*(workspace->d_workspace), *(lists[BONDS]), system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
#if defined(DEBUG_FOCUS)
t_elapsed = Get_Timing_Info( t_start );
fprintf( stderr, "Three_Body_Interactions ... Timing %lf \n",
t_elapsed );
fprintf( stderr, "Three_Body_Interactions Done... \n" );
#endif
/* 5. Torsion Angles Interactions */
#if defined(DEBUG_FOCUS)
t_start = Get_Time( );
#endif
cuda_memset( spad, 0, (sizeof(real) * 4 + sizeof(rvec) * 2) * system->n,
"Cuda_Compute_Bonded_Forces::spad" );
Cuda_Torsion_Angles_Part1 <<< control->blocks, control->block_size >>>
( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_fbp,
(control_params *) control->d_control_params, *(lists[BONDS]),
*(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
system->reax_param.num_atom_types,
spad, &spad[2 * system->n], (rvec *) (&spad[4 * system->n]) );
cudaDeviceSynchronize( );
cudaCheckError( );
/* reduction for E_Tor */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
system->n );
}
/* reduction for E_Con */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( &spad[2 * system->n],
&((simulation_data *)data->d_simulation_data)->my_en.e_con,
system->n );
}
/* reduction for ext_pres */
rvec_spad = (rvec *) (&spad[4 * system->n]);
k_reduction_rvec <<< control->blocks, control->block_size, sizeof(rvec) * control->block_size >>>
( rvec_spad, rvec_spad + system->n, system->n );
cudaDeviceSynchronize( );
cudaCheckError( );
k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
( rvec_spad + system->n,
&((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks );
cudaDeviceSynchronize( );
cudaCheckError( );
// Cuda_Reduction_Sum( rvec_spad,
// &((simulation_data *)data->d_simulation_data)->my_ext_press,
// system->n );
Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]),
system->N );
cudaDeviceSynchronize( );
cudaCheckError( );
#if defined(DEBUG_FOCUS)
t_elapsed = Get_Timing_Info( t_start );
fprintf( stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n",
cudaGetLastError( ), t_elapsed );
fprintf( stderr, " Four_Body_ Done... \n");
#endif
/* 6. Hydrogen Bonds Interactions */
if ( control->hbond_cut > 0.0 && system->numH > 0 )
{
#if defined(DEBUG_FOCUS)
t_start = Get_Time( );
#endif
cuda_memset( spad, 0, (sizeof(real) + sizeof(rvec)) * 2 * system->n,
"Cuda_Compute_Bonded_Forces::spad" );
// hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) +
// (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
Cuda_Hydrogen_Bonds_Part1 <<< control->blocks, control->block_size >>>
// Cuda_Hydrogen_Bonds_Part1_opt <<< hbs, HB_BLOCK_SIZE,
// HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>>
( system->d_my_atoms, system->reax_param.d_sbp,
system->reax_param.d_hbp, system->reax_param.d_gp,
(control_params *) control->d_control_params,
*(workspace->d_workspace),
*(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
system->n, system->reax_param.num_atom_types,
spad, (rvec *) (&spad[2 * system->n]), system->my_rank, data->step );
cudaDeviceSynchronize( );
cudaCheckError( );
/* reduction for E_HB */
if ( update_energy == TRUE )
{
Cuda_Reduction_Sum( spad,
&((simulation_data *)data->d_simulation_data)->my_en.e_hb,
system->n );
}
/* reduction for ext_pres */
rvec_spad = (rvec *) (&spad[2 * system->n]);
k_reduction_rvec <<< control->blocks, control->block_size, sizeof(rvec) * control->block_size >>>
(rvec_spad, rvec_spad + system->n, system->n);
cudaDeviceSynchronize( );
cudaCheckError( );
k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
( rvec_spad + system->n,
&((simulation_data *)data->d_simulation_data)->my_ext_press,
control->blocks );
cudaDeviceSynchronize( );
cudaCheckError( );
// Cuda_Reduction_Sum( rvec_spad,
// &((simulation_data *)data->d_simulation_data)->my_ext_press,
// system->n );
Cuda_Hydrogen_Bonds_Part2 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, *(workspace->d_workspace),
*(lists[BONDS]), system->n );
cudaDeviceSynchronize( );
cudaCheckError( );
// hnbrs_blocks = (system->n * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
// (((system->n * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
Cuda_Hydrogen_Bonds_Part3 <<< control->blocks_n, control->block_size >>>
( system->d_my_atoms, *(workspace->d_workspace), *(lists[HBONDS]), system->n );
// Cuda_Hydrogen_Bonds_Part3_opt <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE,
// HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
// ( system->d_my_atoms, *(workspace->d_workspace), *(lists[HBONDS]), system->n );
cudaDeviceSynchronize( );
cudaCheckError( );
#if defined(DEBUG_FOCUS)
t_elapsed = Get_Timing_Info( t_start );
fprintf( stderr,
"Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n",
cudaGetLastError( ), t_elapsed );
fprintf( stderr, "Hydrogen_Bond Done... \n" );
#endif
}
compute_bonded_part1 = FALSE;
}
return ret;
}
void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control,
simulation_data *data, storage *workspace,
reax_list **lists, output_controls *out_control,
mpi_datatypes *mpi_data )
{
Cuda_NonBonded_Energy( system, control, workspace, data, lists, out_control );
}
void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
simulation_data *data, storage *workspace,
reax_list **lists, mpi_datatypes *mpi_data )
{
rvec *f;
check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
sizeof(rvec) * system->N,
"Cuda_Compute_Total_Force::workspace->host_scratch" );
f = (rvec *) workspace->host_scratch;
memset( f, 0, sizeof(rvec) * system->N );
Cuda_Total_Forces( system, control, data, workspace, lists );
#if defined(PURE_REAX)
/* now all forces are computed to their partially-final values
* based on the neighbors information each processor has had.
* final values of force on each atom needs to be computed by adding up
* all partially-final pieces */
copy_host_device( f, workspace->d_workspace->f, sizeof(rvec) * system->N ,
cudaMemcpyDeviceToHost, "Cuda_Compute_Total_Force::workspace->d_workspace->f" );
Coll( system, mpi_data, f, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
copy_host_device( f, workspace->d_workspace->f, sizeof(rvec) * system->N,
cudaMemcpyHostToDevice, "Cuda_Compute_Total_Force::workspace->d_workspace->f" );
Cuda_Total_Forces_PURE( system, workspace );
#endif
}
int Cuda_Compute_Forces( reax_system *system, control_params *control,
simulation_data *data, storage *workspace, reax_list **lists,
output_controls *out_control, mpi_datatypes *mpi_data )
{
int charge_flag, retVal;
static int init_forces_done = FALSE;
#if defined(LOG_PERFORMANCE)
real t_start = 0;
if ( system->my_rank == MASTER_NODE )
{
t_start = Get_Time( );
}
#endif
retVal = SUCCESS;
if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
{
charge_flag = TRUE;
}
else
{
charge_flag = FALSE;
}
if ( init_forces_done == FALSE )
{
if ( charge_flag == TRUE )
{
retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
}
else
{
retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
}
if ( retVal == SUCCESS )
{
init_forces_done = TRUE;
}
}
if ( retVal == SUCCESS )
{
#if defined(LOG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &data->timing.init_forces );
}
#endif
retVal = Cuda_Compute_Bonded_Forces( system, control, data,
workspace, lists, out_control );
#if defined(LOG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &data->timing.bonded );
}
#endif
}
if ( retVal == SUCCESS )
{
#if defined(PURE_REAX)
if ( charge_flag == TRUE )
{
Cuda_Compute_Charges( system, control, data, workspace, out_control, mpi_data );
}
#if defined(LOG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &data->timing.cm );
}
#endif
#endif //PURE_REAX
Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
lists, out_control, mpi_data );
#if defined(LOG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &data->timing.nonb );
}
#endif
Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
#if defined(LOG_PERFORMANCE)
if ( system->my_rank == MASTER_NODE )
{
Update_Timing_Info( &t_start, &data->timing.bonded );
}
#endif
init_forces_done = FALSE;
}
return retVal;
}