From 68a1ce1999edb5c25f9d5f96bd93f160bfeb7e9b Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Wed, 5 Aug 2020 21:24:33 -0400
Subject: [PATCH] PG-PuReMD: rework custom reduction functions to use less
 shared memory and to have the correct thread and block counts. Fix bug in
 charge matrix initialization (under-allocated space previously). Split charge
 matrix and bonds/hydrogen bonds memory management routines to mirror the
 similar splitting of the initialization routines. Code clean-up related to
 half vs. full list and sparse matrix formats.

---
 PG-PuReMD/src/cuda/cuda_bond_orders.cu    |   4 +-
 PG-PuReMD/src/cuda/cuda_bonds.cu          |   6 +-
 PG-PuReMD/src/cuda/cuda_charges.cu        |  16 +-
 PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu  |  14 +-
 PG-PuReMD/src/cuda/cuda_environment.cu    |  12 +-
 PG-PuReMD/src/cuda/cuda_forces.cu         | 417 +++++++++++++++-------
 PG-PuReMD/src/cuda/cuda_init_md.cu        |   6 +-
 PG-PuReMD/src/cuda/cuda_nonbonded.cu      |  59 +--
 PG-PuReMD/src/cuda/cuda_reduction.cu      |   4 +-
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu   | 224 +++++++-----
 PG-PuReMD/src/cuda/cuda_system_props.cu   | 230 ++++++------
 PG-PuReMD/src/cuda/cuda_torsion_angles.cu |  12 +-
 12 files changed, 600 insertions(+), 404 deletions(-)

diff --git a/PG-PuReMD/src/cuda/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 64c94417..0c11e58f 100644
--- a/PG-PuReMD/src/cuda/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -773,12 +773,12 @@ void Cuda_Total_Forces( reax_system *system, control_params *control,
     if ( control->virial != 0 )
     {
         /* reduction for ext_press */
-        k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * DEF_BLOCK_SIZE >>> 
+        k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec) * (DEF_BLOCK_SIZE / 32) >>> 
             ( spad_rvec, &spad_rvec[system->N], system->N );
         cudaDeviceSynchronize( ); 
         cudaCheckError( ); 
 
-        k_reduction_rvec <<< 1, control->blocks_pow_2_n, sizeof(rvec) * control->blocks_pow_2_n>>>
+        k_reduction_rvec <<< 1, ((blocks + 31) / 32) * 32, sizeof(rvec) * ((blocks + 31) / 32) >>>
             ( &spad_rvec[system->N], &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks );
         cudaDeviceSynchronize( ); 
         cudaCheckError( ); 
diff --git a/PG-PuReMD/src/cuda/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
index bae45dbc..9cd4e0a4 100644
--- a/PG-PuReMD/src/cuda/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -85,7 +85,7 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
             ebond = -twbp->De_s * bo_ij->BO_s * exp_be12
                 - twbp->De_p * bo_ij->BO_pi
                 - twbp->De_pp * bo_ij->BO_pi2;
-            e_bond[ i ] += ebond;
+            e_bond[i] += ebond;
 
             /* calculate derivatives of bond orders */
             bo_ij->Cdbo += CEbo;
@@ -111,8 +111,8 @@ CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp,
             /* Stabilisation terminal triple bond */
             if ( bo_ij->BO >= 1.00 )
             {
-                if ( gp37 == 2 ||
-                        ( (Cuda_strncmp( sbp_i->name, "C", sizeof(sbp_i->name) ) == 0
+                if ( gp37 == 2
+                        || ( (Cuda_strncmp( sbp_i->name, "C", sizeof(sbp_i->name) ) == 0
                             && Cuda_strncmp( sbp_j->name, "O", sizeof(sbp_j->name) ) == 0)
                         || (Cuda_strncmp( sbp_i->name, "O", sizeof(sbp_i->name) ) == 0
                             && Cuda_strncmp( sbp_j->name, "C", sizeof(sbp_j->name) ) == 0) ) )
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index e063ed9a..ce45043e 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -492,24 +492,26 @@ static void Calculate_Charges_QEq( reax_system const * const system,
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(rvec2) * 2 * system->n,
+            sizeof(rvec2) * (blocks + 1),
             "Calculate_Charges_QEq::workspace->scratch" );
     spad_rvec2 = (rvec2 *) workspace->scratch;
-    cuda_memset( spad_rvec2, 0, sizeof(rvec2) * 2 * system->n,
-            "Calculate_Charges_QEq::spad_rvec2," );
+    cuda_memset( spad_rvec2, 0, sizeof(rvec2) * (blocks + 1),
+            "Calculate_Charges_QEq::spad_rvec2" );
 
     /* compute local sums of pseudo-charges in s and t on device */
-    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
+                      sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>>
         ( workspace->d_workspace->x, spad_rvec2, system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    k_reduction_rvec2 <<< 1, control->blocks_pow_2, sizeof(rvec2) * control->blocks_pow_2 >>>
-        ( spad_rvec2, &spad_rvec2[system->n], blocks );
+    k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
+                      sizeof(rvec2) * ((blocks + 31) / 32) >>>
+        ( spad_rvec2, &spad_rvec2[blocks], blocks );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    copy_host_device( &my_sum, &spad_rvec2[system->n],
+    copy_host_device( &my_sum, &spad_rvec2[blocks],
             sizeof(rvec2), cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," );
 #else
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index a957f939..7a0354c8 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -69,7 +69,7 @@ CUDA_GLOBAL void k_vector_copy_from_rvec2( real * const dst, rvec2 const * const
 
 
 CUDA_GLOBAL void k_vector_copy_to_rvec2( rvec2 * const dst, real const * const src,
-        int index, int n)
+        int index, int n )
 {
     int i;
 
@@ -631,17 +631,19 @@ void Dot_local_rvec2( control_params const * const control,
             sizeof(rvec2) * (k + blocks + 1), "Dot_local_rvec2::workspace->scratch" );
     spad = (rvec2 *) workspace->scratch;
 
-    Vector_Mult_rvec2( &spad[0], v1, v2, k );
+    Vector_Mult_rvec2( spad, v1, v2, k );
 
     /* local reduction (sum) on device */
-//    Cuda_Reduction_Sum( &spad[0], &spad[k], k );
+//    Cuda_Reduction_Sum( spad, &spad[k], k );
 
-    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        ( &spad[0], &spad[k], k );
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
+                      sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( spad, &spad[k], k );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    k_reduction_rvec2 <<< 1, control->blocks_pow_2, sizeof(rvec2) * control->blocks_pow_2 >>>
+    k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
+                      sizeof(rvec2) * ((blocks + 31) / 32) >>>
         ( &spad[k], &spad[k + blocks], blocks );
     cudaDeviceSynchronize( );
     cudaCheckError( );
diff --git a/PG-PuReMD/src/cuda/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu
index 98da574e..e2045a0e 100644
--- a/PG-PuReMD/src/cuda/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda/cuda_environment.cu
@@ -3,16 +3,16 @@
 #include "cuda_utils.h"
 
 
-static void compute_blocks( int *blocks, int *block_size, int count )
+static void compute_blocks( int *blocks, int *block_size, int threads )
 {
     *block_size = DEF_BLOCK_SIZE; // threads per block
-    *blocks = (int) CEIL( (double) count / DEF_BLOCK_SIZE ); // blocks per grid
+    *blocks = (threads + (DEF_BLOCK_SIZE - 1)) / DEF_BLOCK_SIZE; // blocks per grid
 }
 
 
-static void compute_nearest_pow_2( int blocks, int *result )
+static void compute_nearest_multiple_32( int blocks, int *result )
 {
-  *result = (int) EXP2( CEIL( LOG2((double) blocks) ) );
+    *result = ((blocks + 31) / 32) * 32;
 }
 
 
@@ -50,10 +50,10 @@ extern "C" void Cuda_Init_Block_Sizes( reax_system *system,
         control_params *control )
 {
     compute_blocks( &control->blocks, &control->block_size, system->n );
-    compute_nearest_pow_2( control->blocks, &control->blocks_pow_2 );
+    compute_nearest_multiple_32( control->blocks, &control->blocks_pow_2 );
 
     compute_blocks( &control->blocks_n, &control->block_size_n, system->N );
-    compute_nearest_pow_2( control->blocks_n, &control->blocks_pow_2_n );
+    compute_nearest_multiple_32( control->blocks_n, &control->blocks_pow_2_n );
 }
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index aba8c39f..bba38c8b 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -264,7 +264,7 @@ CUDA_GLOBAL void k_init_distance( reax_atom *my_atoms, reax_list far_nbr_list, i
  * the full shell communication method */
 CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters *sbp, 
         two_body_parameters *tbp, storage workspace, control_params *control, 
-        reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types,
+        reax_list far_nbr_list, int num_atom_types,
         int *max_cm_entries, int *realloc_cm_entries )
 {
     int i, j, pj;
@@ -288,14 +288,14 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
     H = &workspace.H;
     cm_top = H->start[i];
 
-    atom_i = &my_atoms[i];
-    type_i = atom_i->type;
-    start_i = Start_Index( i, &far_nbr_list );
-    end_i = End_Index( i, &far_nbr_list );
-    sbp_i = &sbp[type_i];
-
     if ( i < H->n )
     {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+
         /* diagonal entry in the matrix */
         H->j[cm_top] = i;
         H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
@@ -316,21 +316,90 @@ CUDA_GLOBAL void k_init_cm_full_fs( reax_atom *my_atoms, single_body_parameters
                 r_ij = far_nbr_list.far_nbr_list.d[pj];
 
                 H->j[cm_top] = j;
+                H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
+                        control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL );
+                ++cm_top;
+            }
+        }
+    }
 
-                if ( control->tabulate == 0 )
-                {
-                    H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap,
-                            control, i, H->j[cm_top], r_ij, twbp->gamma, OFF_DIAGONAL );
-                }
-                else
-                {
-                    H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij, type_i, type_j,num_atom_types );
-                }
+    __syncthreads( );
+
+    H->end[i] = cm_top;
+    num_cm_entries = cm_top - H->start[i];
+
+    /* reallocation checks */
+    if ( num_cm_entries > max_cm_entries[i] )
+    {
+        *realloc_cm_entries = TRUE;
+    }
+}
+
+
+/* Compute the tabulated charge matrix entries and store the matrix in full format
+ * using the far neighbors list (stored in full format) and according to
+ * the full shell communication method */
+CUDA_GLOBAL void k_init_cm_full_fs_tab( reax_atom *my_atoms, single_body_parameters *sbp, 
+        two_body_parameters *tbp, storage workspace, control_params *control, 
+        reax_list far_nbr_list, LR_lookup_table *t_LR, int num_atom_types,
+        int *max_cm_entries, int *realloc_cm_entries )
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int cm_top;
+    int num_cm_entries;
+    real r_ij;
+    single_body_parameters *sbp_i;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= workspace.H.n_max )
+    {
+        return;
+    }
+
+    H = &workspace.H;
+    cm_top = H->start[i];
+
+    if ( i < H->n )
+    {
+        atom_i = &my_atoms[i];
+        type_i = atom_i->type;
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+        sbp_i = &sbp[type_i];
+
+        /* diagonal entry in the matrix */
+        H->j[cm_top] = i;
+        H->val[cm_top] = Init_Charge_Matrix_Entry( sbp_i, workspace.Tap, control,
+                i, i, 0.0, 0.0, DIAGONAL );
+        ++cm_top;
+
+        /* update i-j distance - check if j is within cutoff */
+        for ( pj = start_i; pj < end_i; ++pj )
+        {
+            j = far_nbr_list.far_nbr_list.nbr[pj];
+
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                atom_j = &my_atoms[j];
+                type_j = atom_j->type;
+
+                r_ij = far_nbr_list.far_nbr_list.d[pj];
+
+                H->j[cm_top] = j;
+                H->val[cm_top] = Init_Charge_Matrix_Entry_Tab( t_LR, r_ij,
+                        type_i, type_j, num_atom_types );
                 ++cm_top;
             }
         }
     }
 
+    __syncthreads( );
+
     H->end[i] = cm_top;
     num_cm_entries = cm_top - H->start[i];
 
@@ -539,26 +608,66 @@ CUDA_GLOBAL void k_init_bonds( reax_atom *my_atoms, single_body_parameters *sbp,
 }
 
 
-CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control,
+        reax_list far_nbr_list, int cm_n, int cm_n_max,
+        int *cm_entries, int *max_cm_entries )
+{
+    int i, pj; 
+    int start_i, end_i;
+    int num_cm_entries;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= cm_n_max )
+    {
+        return;
+    }
+
+    num_cm_entries = 0;
+
+    if ( i < cm_n )
+    {
+        start_i = Start_Index( i, &far_nbr_list );
+        end_i = End_Index( i, &far_nbr_list );
+
+        /* diagonal entry */
+        ++num_cm_entries;
+
+        for ( pj = start_i; pj < end_i; ++pj )
+        { 
+            if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
+            {
+                ++num_cm_entries;
+            }
+        }
+    }
+
+    __syncthreads( );
+
+    cm_entries[i] = num_cm_entries;
+    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+}
+
+
+CUDA_GLOBAL void k_estimate_storages_bonds( reax_atom *my_atoms, 
         single_body_parameters *sbp, two_body_parameters *tbp,
         control_params *control, reax_list far_nbr_list, 
-        int num_atom_types, int n, int N, int total_cap, int cm_n_max,
+        int num_atom_types, int n, int N, int total_cap,
         int *bonds, int *max_bonds,
-        int *hbonds, int *max_hbonds,
-        int *cm_entries, int *max_cm_entries )
+        int *hbonds, int *max_hbonds  )
 {
     int i, j, pj; 
     int start_i, end_i;
     int type_i, type_j;
     int ihb, jhb;
-    int num_bonds, num_hbonds, num_cm_entries;
+    int num_bonds, num_hbonds;
     real cutoff;
     real r_ij; 
     real C12, C34, C56;
     real BO, BO_s, BO_pi, BO_pi2;
     single_body_parameters *sbp_i, *sbp_j;
     two_body_parameters *twbp;
-    reax_atom *atom_i, *atom_j;
+    reax_atom *atom_i;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -569,7 +678,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
 
     num_bonds = 0;
     num_hbonds = 0;
-    num_cm_entries = 0;
 
     if ( i < N )
     {
@@ -583,8 +691,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
         if ( i < n )
         { 
             cutoff = control->nonb_cut;
-            /* diagonal entry */
-            ++num_cm_entries;
         }   
         else
         {
@@ -594,7 +700,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
         for ( pj = start_i; pj < end_i; ++pj )
         { 
             j = far_nbr_list.far_nbr_list.nbr[pj];
-            atom_j = &my_atoms[j];
 
             if ( far_nbr_list.far_nbr_list.d[pj] <= control->nonb_cut )
             {
@@ -603,12 +708,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
                 ihb = sbp_i->p_hbond;
                 jhb = sbp_j->p_hbond;
 
-                //TODO: assuming far_nbr_list in FULL_LIST, add conditions for HALF_LIST
-                if ( i < n )
-                {
-                    ++num_cm_entries;
-                }
-
                 /* atom i: H bonding, ghost
                  * atom j: H atom, native */
                 if ( control->hbond_cut > 0.0 && i >= n && j < n
@@ -705,12 +804,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms,
 
     hbonds[i] = num_hbonds;
     max_hbonds[i] = MAX( (int) CEIL(num_hbonds * SAFE_ZONE), MIN_HBONDS );
-
-    if ( i < cm_n_max )
-    {
-        cm_entries[i] = num_cm_entries;
-        max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
-    }
 }
 
 
@@ -1098,20 +1191,51 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 {
     int blocks;
 
-    blocks = system->total_cap / DEF_BLOCK_SIZE
-        + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
+    if ( realloc_cm == TRUE )
+    {
+        blocks = workspace->d_workspace->H.n_max / DEF_BLOCK_SIZE
+            + (workspace->d_workspace->H.n_max % DEF_BLOCK_SIZE == 0 ? 0 : 1);
 
-    k_estimate_storages <<< blocks, DEF_BLOCK_SIZE >>>
-        ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
-          (control_params *) control->d_control_params,
-          *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
-          system->n, system->N, system->total_cap,
-          workspace->d_workspace->H.n_max,
-          system->d_bonds, system->d_max_bonds,
-          system->d_hbonds, system->d_max_hbonds,
-          system->d_cm_entries, system->d_max_cm_entries );
-    cudaDeviceSynchronize( );
-    cudaCheckError( );
+        if ( workspace->d_workspace->H.format == SYM_HALF_MATRIX )
+        {
+//            k_estimate_storages_cm_half <<< blocks, DEF_BLOCK_SIZE >>>
+//                ( (control_params *) control->d_control_params,
+//                  *(lists[FAR_NBRS]), workspace->d_workspace->H.n,
+//                  workspace->d_workspace->H.n_max,
+//                  system->d_cm_entries, system->d_max_cm_entries );
+        }
+        else
+        {
+            k_estimate_storages_cm_full <<< blocks, DEF_BLOCK_SIZE >>>
+                ( (control_params *) control->d_control_params,
+                  *(lists[FAR_NBRS]), workspace->d_workspace->H.n,
+                  workspace->d_workspace->H.n_max,
+                  system->d_cm_entries, system->d_max_cm_entries );
+        }
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+
+        Cuda_Reduction_Sum( system->d_max_cm_entries,
+                system->d_total_cm_entries, workspace->d_workspace->H.n_max );
+        copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int),
+                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
+    }
+
+    if ( realloc_bonds == TRUE || realloc_hbonds == TRUE )
+    {
+        blocks = system->total_cap / DEF_BLOCK_SIZE
+            + (system->total_cap % DEF_BLOCK_SIZE == 0 ? 0 : 1);
+
+        k_estimate_storages_bonds <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
+              (control_params *) control->d_control_params,
+              *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
+              system->n, system->N, system->total_cap,
+              system->d_bonds, system->d_max_bonds,
+              system->d_hbonds, system->d_max_hbonds );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+    }
 
     if ( realloc_bonds == TRUE )
     {
@@ -1143,15 +1267,7 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 #endif
 
         control->hbond_cut = 0.0;
-        k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *)control->d_control_params );
-    }
-
-    if ( realloc_cm == TRUE )
-    {
-        Cuda_Reduction_Sum( system->d_max_cm_entries,
-                system->d_total_cm_entries, workspace->d_workspace->H.n_max );
-        copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int),
-                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
+        k_disable_hydrogen_bonding <<< 1, 1 >>> ( (control_params *) control->d_control_params );
     }
 }
 
@@ -1222,11 +1338,22 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         }
         else
         {
-            k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE >>>
-                ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
-                  *(workspace->d_workspace), (control_params *) control->d_control_params,
-                  *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
-                  system->d_max_cm_entries, system->d_realloc_cm_entries );
+            if ( control->tabulate <= 0 )
+            {
+                k_init_cm_full_fs <<< blocks, DEF_BLOCK_SIZE >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
+                      *(workspace->d_workspace), (control_params *) control->d_control_params,
+                      *(lists[FAR_NBRS]), system->reax_param.num_atom_types,
+                      system->d_max_cm_entries, system->d_realloc_cm_entries );
+            }
+            else
+            {
+                k_init_cm_full_fs_tab <<< blocks, DEF_BLOCK_SIZE >>>
+                    ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp,
+                      *(workspace->d_workspace), (control_params *) control->d_control_params,
+                      *(lists[FAR_NBRS]), workspace->d_LR, system->reax_param.num_atom_types,
+                      system->d_max_cm_entries, system->d_realloc_cm_entries );
+            }
             cudaDeviceSynchronize( );
             cudaCheckError( );
         }
@@ -1340,11 +1467,11 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 #endif
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            MAX( sizeof(real) * 2 * system->N,
+            MAX( sizeof(real) * system->n,
                 MAX( sizeof(real) * 3 * system->n,
-                    MAX( (sizeof(real) * 6 + sizeof(rvec) * 2) * system->N + sizeof(rvec) * control->blocks_n,
-                        MAX( (sizeof(real) * 4 + sizeof(rvec) * 2) * system->n + sizeof(rvec) * control->blocks,
-                            (sizeof(real) + sizeof(rvec)) * 2 * system->n + sizeof(rvec) * control->blocks )))),
+                    MAX( (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n,
+                        MAX( (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
+                            (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks )))),
             "Cuda_Compute_Bonded_Forces::workspace->scratch" );
     spad = (real *) workspace->scratch;
     update_energy = (out_control->energy_update_freq > 0
@@ -1380,7 +1507,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         cudaCheckError( );
 
         /* 2. Bond Energy Interactions */
-        cuda_memset( spad, 0, sizeof(real) * 2 * system->N,
+        cuda_memset( spad, 0, sizeof(real) * system->n,
                 "Compute_Bonded_Forces::spad" );
 
         Cuda_Bonds <<< control->blocks, control->block_size, sizeof(real) * control->block_size >>>
@@ -1438,7 +1565,10 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
             sizeof(int) * system->total_bonds,
             "Cuda_Compute_Bonded_Forces::workspace->scratch" );
+
     thbody = (int *) workspace->scratch;
+    spad = (real *) workspace->scratch; /* in case scratch gets reallocated above, changing the pointer */
+
     ret = Cuda_Estimate_Storage_Three_Body( system, control, data, workspace,
             lists, thbody );
 
@@ -1446,63 +1576,69 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     {
         Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies_indices, lists );
 
-        cuda_memset( spad, 0, (sizeof(real) * 6 + sizeof(rvec) * 2) * system->N,
+        cuda_memset( spad, 0,
+                (sizeof(real) * 3 + sizeof(rvec)) * system->N + sizeof(rvec) * control->blocks_n,
                 "Cuda_Compute_Bonded_Forces::spad" );
 
         Cuda_Valence_Angles_Part1 <<< control->blocks_n, control->block_size_n >>>
             ( system->d_my_atoms, system->reax_param.d_gp, 
               system->reax_param.d_sbp, system->reax_param.d_thbp, 
-              (control_params *)control->d_control_params,
+              (control_params *) control->d_control_params,
               *(workspace->d_workspace), *(lists[BONDS]), *(lists[THREE_BODIES]),
               system->n, system->N, system->reax_param.num_atom_types, 
-              spad, &spad[2 * system->N], &spad[4 * system->N], (rvec *)(&spad[6 * system->N]) );
+              spad, &spad[system->N], &spad[2 * system->N], (rvec *) (&spad[3 * system->N]) );
         cudaDeviceSynchronize( );
         cudaCheckError( );
 
-        /* reduction for E_Ang */
         if ( update_energy == TRUE )
         {
-            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
+            /* reduction for E_Ang */
+            Cuda_Reduction_Sum( spad,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_ang,
                     system->N );
-        }
 
-        if ( update_energy == TRUE )
-        {
             /* reduction for E_Pen */
-            Cuda_Reduction_Sum( &spad[2 * system->N],
+            Cuda_Reduction_Sum( &spad[system->N],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_pen,
                     system->N );
 
             /* reduction for E_Coa */
-            Cuda_Reduction_Sum( &spad[4 * system->N],
+            Cuda_Reduction_Sum( &spad[2 * system->N],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_coa,
                     system->N );
         }
 
-        /* reduction for ext_pres */
-        rvec_spad = (rvec *) (&spad[6 * system->N]);
-        k_reduction_rvec <<< control->blocks_n, control->block_size_n,
-                         sizeof(rvec) * control->block_size_n >>>
-            ( rvec_spad, &rvec_spad[system->N], system->N );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
+        if ( control->virial == 1 )
+        {
+            rvec_spad = (rvec *) (&spad[3 * system->N]);
 
-        k_reduction_rvec <<< 1, control->blocks_pow_2_n, sizeof(rvec) * control->blocks_pow_2_n >>>
-            ( &rvec_spad[system->N], &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks_n );
-        cudaDeviceSynchronize ();
-        cudaCheckError( );
-//        Cuda_Reduction_Sum( rvec_spad,
-//                &((simulation_data *)data->d_simulation_data)->my_ext_press,
-//                system->N );
+            /* reduction for ext_pres */
+            k_reduction_rvec <<< control->blocks_n, control->block_size_n,
+                             sizeof(rvec) * (control->block_size_n / 32) >>>
+                ( rvec_spad, &rvec_spad[system->N], system->N );
+            cudaDeviceSynchronize( );
+            cudaCheckError( );
+
+            k_reduction_rvec <<< 1, control->blocks_pow_2_n,
+                             sizeof(rvec) * (control->blocks_pow_2_n / 32) >>>
+                ( &rvec_spad[system->N],
+                  &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                  control->blocks_n );
+            cudaDeviceSynchronize ();
+            cudaCheckError( );
+//            Cuda_Reduction_Sum( rvec_spad,
+//                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
+//                    system->N );
+        }
 
         Cuda_Valence_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
-            ( system->d_my_atoms, (control_params *)control->d_control_params,
+            ( system->d_my_atoms, (control_params *) control->d_control_params,
               *(workspace->d_workspace), *(lists[BONDS]), system->N );
         cudaDeviceSynchronize( );
         cudaCheckError( );
 
         /* 5. Torsion Angles Interactions */
-        cuda_memset( spad, 0, (sizeof(real) * 4 + sizeof(rvec) * 2) * system->n,
+        cuda_memset( spad, 0, (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
                 "Cuda_Compute_Bonded_Forces::spad" );
 
         Cuda_Torsion_Angles_Part1 <<< control->blocks, control->block_size >>>
@@ -1510,40 +1646,47 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               (control_params *) control->d_control_params, *(lists[BONDS]),
               *(lists[THREE_BODIES]), *(workspace->d_workspace), system->n,
               system->reax_param.num_atom_types, 
-              spad, &spad[2 * system->n], (rvec *) (&spad[4 * system->n]) );
+              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
         cudaDeviceSynchronize( );
         cudaCheckError( );
 
         if ( update_energy == TRUE )
         {
             /* reduction for E_Tor */
-            Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
+            Cuda_Reduction_Sum( spad,
+                    &((simulation_data *)data->d_simulation_data)->my_en.e_tor,
                     system->n );
 
             /* reduction for E_Con */
-            Cuda_Reduction_Sum( &spad[2 * system->n],
+            Cuda_Reduction_Sum( &spad[system->n],
                     &((simulation_data *)data->d_simulation_data)->my_en.e_con,
                     system->n );
         }
 
-        /* reduction for ext_pres */
-        rvec_spad = (rvec *) (&spad[4 * system->n]);
-        k_reduction_rvec <<< control->blocks, control->block_size,
-                         sizeof(rvec) * control->block_size >>>
-            ( rvec_spad, &rvec_spad[system->n], system->n );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
+        if ( control->virial == 1 )
+        {
+            rvec_spad = (rvec *) (&spad[2 * system->n]);
 
-        k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
-                ( &rvec_spad[system->n],
-                  &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks );
-        cudaDeviceSynchronize( );
-        cudaCheckError( );
-//        Cuda_Reduction_Sum( rvec_spad,
-//                &((simulation_data *)data->d_simulation_data)->my_ext_press,
-//                system->n );
+            /* reduction for ext_pres */
+            k_reduction_rvec <<< control->blocks, control->block_size,
+                             sizeof(rvec) * (control->block_size / 32) >>>
+                ( rvec_spad, &rvec_spad[system->n], system->n );
+            cudaDeviceSynchronize( );
+            cudaCheckError( );
+
+            k_reduction_rvec <<< 1, control->blocks_pow_2,
+                             sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                    ( &rvec_spad[system->n],
+                      &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                      control->blocks );
+            cudaDeviceSynchronize( );
+            cudaCheckError( );
+//            Cuda_Reduction_Sum( rvec_spad,
+//                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
+//                    system->n );
+        }
 
-        Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size >>>
+        Cuda_Torsion_Angles_Part2 <<< control->blocks_n, control->block_size_n >>>
                 ( system->d_my_atoms, *(workspace->d_workspace), *(lists[BONDS]),
                   system->N );
         cudaDeviceSynchronize( );
@@ -1552,7 +1695,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
         /* 6. Hydrogen Bonds Interactions */
         if ( control->hbond_cut > 0.0 && system->numH > 0 )
         {
-            cuda_memset( spad, 0, (sizeof(real) + sizeof(rvec)) * 2 * system->n,
+            cuda_memset( spad, 0,
+                    (sizeof(real) + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
                     "Cuda_Compute_Bonded_Forces::spad" );
 
 //            hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
@@ -1567,7 +1711,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                       *(workspace->d_workspace),
                       *(lists[FAR_NBRS]), *(lists[BONDS]), *(lists[HBONDS]),
                       system->n, system->reax_param.num_atom_types,
-                      spad, (rvec *) (&spad[2 * system->n]), system->my_rank );
+                      spad, (rvec *) (&spad[system->n]), system->my_rank );
             cudaDeviceSynchronize( );
             cudaCheckError( );
 
@@ -1579,23 +1723,28 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                         system->n );
             }
 
-            /* reduction for ext_pres */
-            rvec_spad = (rvec *) (&spad[2 * system->n]);
-            k_reduction_rvec <<< control->blocks, control->block_size,
-                             sizeof(rvec) * control->block_size >>>
-                ( rvec_spad, &rvec_spad[system->n], system->n );
-            cudaDeviceSynchronize( );
-            cudaCheckError( );
-
-            k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
-                ( &rvec_spad[system->n],
-                  &((simulation_data *)data->d_simulation_data)->my_ext_press,
-                  control->blocks );
-            cudaDeviceSynchronize( );
-            cudaCheckError( );
-//            Cuda_Reduction_Sum( rvec_spad,
-//                    &((simulation_data *)data->d_simulation_data)->my_ext_press,
-//                    system->n );
+            if ( control->virial == 1 )
+            {
+                rvec_spad = (rvec *) (&spad[system->n]);
+
+                /* reduction for ext_pres */
+                k_reduction_rvec <<< control->blocks, control->block_size,
+                                 sizeof(rvec) * (control->block_size / 32) >>>
+                    ( rvec_spad, &rvec_spad[system->n], system->n );
+                cudaDeviceSynchronize( );
+                cudaCheckError( );
+
+                k_reduction_rvec <<< 1, control->blocks_pow_2,
+                                 sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+                    ( &rvec_spad[system->n],
+                      &((simulation_data *)data->d_simulation_data)->my_ext_press,
+                      control->blocks );
+                cudaDeviceSynchronize( );
+                cudaCheckError( );
+//                Cuda_Reduction_Sum( rvec_spad,
+//                        &((simulation_data *)data->d_simulation_data)->my_ext_press,
+//                        system->n );
+            }
 
             Cuda_Hydrogen_Bonds_Part2 <<< control->blocks, control->block_size >>>
                 ( system->d_my_atoms, *(workspace->d_workspace),
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index 0cacf504..b46989af 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -198,8 +198,12 @@ void Cuda_Init_Lists( reax_system *system, control_params *control,
 
     Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
 
+    /* first call to Cuda_Estimate_Storages requires setting these manually before allocation */
+    workspace->d_workspace->H.n = system->n;
+    workspace->d_workspace->H.n_max = system->local_cap;
+    workspace->d_workspace->H.format = SYM_FULL_MATRIX;
+
     /* estimate storage for bonds, hbonds, and sparse matrix */
-    workspace->d_workspace->H.n_max = system->local_cap; // first call requires setting this manually before allocation
     Cuda_Estimate_Storages( system, control, workspace, lists,
             TRUE, TRUE, TRUE, data->step - data->prev_steps );
 
diff --git a/PG-PuReMD/src/cuda/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 02cad354..37e0e28b 100644
--- a/PG-PuReMD/src/cuda/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -525,7 +525,7 @@ CUDA_GLOBAL void k_vdW_coulomb_energy_opt( reax_atom *my_atoms,
 
 
 /* one thread per atom implementation */
-CUDA_GLOBAL void k_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, 
+CUDA_GLOBAL void k_vdW_coulomb_energy_tab( reax_atom *my_atoms, 
         global_parameters gp, control_params *control, 
         storage workspace, reax_list far_nbr_list, 
         LR_lookup_table *t_LR, int n, int num_atom_types, 
@@ -693,53 +693,47 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
         storage *workspace, simulation_data *data, reax_list **lists,
         output_controls *out_control )
 {
-//    int blocks;
     int update_energy;
     real *spad;
     rvec *spad_rvec;
 
-//    blocks = (system->n * VDW_KER_THREADS_PER_ATOM / DEF_BLOCK_SIZE) 
-//        + ((system->n * VDW_KER_THREADS_PER_ATOM % DEF_BLOCK_SIZE == 0) ? 0 : 1);
     update_energy = (out_control->energy_update_freq > 0
             && data->step % out_control->energy_update_freq == 0) ? TRUE : FALSE;
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
             (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * control->blocks,
             "Cuda_NonBonded_Energy::workspace->scratch" );
-//    cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-//            (sizeof(real) * 2 + sizeof(rvec)) * system->n + sizeof(rvec) * blocks,
-//            "Cuda_NonBonded_Energy::workspace->scratch" );
     spad = (real *) workspace->scratch;
 
     if ( control->tabulate == 0 )
     {
         k_vdW_coulomb_energy <<< control->blocks, control->block_size >>>
             ( system->d_my_atoms, system->reax_param.d_tbp, 
-              system->reax_param.d_gp, (control_params *)control->d_control_params, 
+              system->reax_param.d_gp, (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
               system->n, system->reax_param.num_atom_types, 
-              spad, &spad[system->n], (rvec *)(&spad[2 * system->n]) );
+              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
 //        k_vdW_coulomb_energy_opt <<< blocks, DEF_BLOCK_SIZE,
 //                             (2 * sizeof(real) + sizeof(rvec)) * DEF_BLOCK_SIZE >>>
 //            ( system->d_my_atoms, system->reax_param.d_tbp, 
-//              system->reax_param.d_gp, (control_params *)control->d_control_params, 
+//              system->reax_param.d_gp, (control_params *) control->d_control_params, 
 //              *(workspace->d_workspace), *(lists[FAR_NBRS]), 
 //              system->n, system->reax_param.num_atom_types, 
-//              spad, &spad[system->n], (rvec *)(&spad[2 * system->n]) );
+//              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]) );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
     else
     {
-        k_tabulated_vdW_coulomb_energy <<< control->blocks, control->block_size >>>
+        k_vdW_coulomb_energy_tab <<< control->blocks, control->block_size >>>
             ( system->d_my_atoms, system->reax_param.d_gp, 
-              (control_params *)control->d_control_params, 
+              (control_params *) control->d_control_params, 
               *(workspace->d_workspace), *(lists[FAR_NBRS]), 
               workspace->d_LR, system->n,
               system->reax_param.num_atom_types, 
               data->step, data->prev_steps, 
               out_control->energy_update_freq,
-              spad, &spad[system->n], (rvec *)(&spad[2 * system->n]));
+              spad, &spad[system->n], (rvec *) (&spad[2 * system->n]));
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -747,28 +741,35 @@ void Cuda_NonBonded_Energy( reax_system *system, control_params *control,
     if ( update_energy == TRUE )
     {
         /* reduction for vdw */
-        Cuda_Reduction_Sum( spad, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
+        Cuda_Reduction_Sum( spad,
+                &((simulation_data *)data->d_simulation_data)->my_en.e_vdW,
                 system->n );
 
         /* reduction for ele */
-        Cuda_Reduction_Sum( &spad[system->n], &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
+        Cuda_Reduction_Sum( &spad[system->n],
+                &((simulation_data *)data->d_simulation_data)->my_en.e_ele,
                 system->n );
     }
 
-    /* reduction for ext_press */
-    spad_rvec = (rvec *) (&spad[2 * system->n]);
-    k_reduction_rvec <<< control->blocks, control->block_size,
-                     sizeof(rvec) * control->block_size >>>
-        ( spad_rvec, &spad_rvec[system->n], system->n );
-    cudaDeviceSynchronize( );
-    cudaCheckError( );
+    if ( control->virial == 1 )
+    {
+        spad_rvec = (rvec *) (&spad[2 * system->n]);
 
-    k_reduction_rvec <<< 1, control->blocks_pow_2,
-                     sizeof(rvec) * control->blocks_pow_2 >>>
-        ( &spad_rvec[system->n],
-          &((simulation_data *)data->d_simulation_data)->my_ext_press, control->blocks );
-    cudaDeviceSynchronize( );
-    cudaCheckError( );
+        /* reduction for ext_press */
+        k_reduction_rvec <<< control->blocks, control->block_size,
+                         sizeof(rvec) * (control->block_size / 32) >>>
+            ( spad_rvec, &spad_rvec[system->n], system->n );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+
+        k_reduction_rvec <<< 1, control->blocks_pow_2,
+                         sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+            ( &spad_rvec[system->n],
+              &((simulation_data *)data->d_simulation_data)->my_ext_press,
+              control->blocks );
+        cudaDeviceSynchronize( );
+        cudaCheckError( );
+    }
 
     if ( update_energy == TRUE )
     {
diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
index 3e44b842..71dce725 100644
--- a/PG-PuReMD/src/cuda/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -198,7 +198,7 @@ CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n )
         rvec_Copy( data, input[i] );
 
         /* warp-level sum using registers within a warp */
-        for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
+        for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
         {
             data[0] += __shfl_down_sync( mask, data[0], offset );
             data[1] += __shfl_down_sync( mask, data[1], offset );
@@ -249,7 +249,7 @@ CUDA_GLOBAL void k_reduction_rvec2( rvec2 *input, rvec2 *results, size_t n )
         data[1] = input[i][1];
 
         /* warp-level sum using registers within a warp */
-        for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
+        for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
         {
             data[0] += __shfl_down_sync( mask, data[0], offset );
             data[1] += __shfl_down_sync( mask, data[1], offset );
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index fdb63082..385a163d 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -147,7 +147,7 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start,
         const real * const x, real * const b, int N )
 {
     int i, pj, si, ei;
-    real b_local;
+    real sum;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -158,20 +158,20 @@ CUDA_GLOBAL void k_sparse_matvec_half_csr( int *row_ptr_start,
 
     si = row_ptr_start[i];
     ei = row_ptr_end[i];
-    b_local = 0.0;
+    sum = 0.0;
 
     for ( pj = si; pj < ei; ++pj )
     {
-        b_local += vals[pj] * x[col_ind[pj]];
+        sum += vals[pj] * x[col_ind[pj]];
         /* symmetric contribution (A is symmetric, lower half stored) */
         atomicAdd( (double*) &b[col_ind[pj]], (double) (vals[pj] * x[i]) );
     }
 
     /* diagonal entry */
-    b_local += vals[ei] * x[i];
+    sum += vals[ei] * x[i];
 
     /* local contribution to row i for this thread */
-    atomicAdd( (double*) &b[i], (double) b_local );
+    atomicAdd( (double*) &b[i], (double) sum );
 }
 
 
@@ -225,42 +225,89 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         const real * const x, real * const b, int n )
 {
-    int i, pj, thread_id, warp_id, lane_id, offset;
+    int pj, thread_id, warp_id, lane_id, offset;
     int si, ei;
-    unsigned int mask;
     real sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     warp_id = thread_id >> 5;
+
+    if ( warp_id >= n )
+    {
+        return;
+    }
+
     lane_id = thread_id & 0x0000001F; 
-    mask = __ballot_sync( FULL_MASK, warp_id < n );
+    si = row_ptr_start[warp_id];
+    ei = row_ptr_end[warp_id];
+    sum = 0.0;
 
-    if ( warp_id < n )
+    /* partial sums per thread */
+    for ( pj = si + lane_id; pj < ei; pj += warpSize )
     {
-        i = warp_id;
-        si = row_ptr_start[i];
-        ei = row_ptr_end[i];
-        sum = 0.0;
+        sum += vals[pj] * x[col_ind[pj]];
+    }
 
-        /* partial sums per thread */
-        for ( pj = si + lane_id; pj < ei; pj += warpSize )
-        {
-            sum += vals[pj] * x[ col_ind[pj] ];
-        }
+    /* warp-level reduction of partial sums
+     * using registers within a warp */
+    for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
+    {
+        sum += __shfl_down_sync( FULL_MASK, sum, offset );
+    }
 
-        /* warp-level reduction of partial sums
-         * using registers within a warp */
-        for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
-        {
-            sum += __shfl_down_sync( mask, sum, offset );
-        }
+    __syncthreads( );
 
-        /* first thread within a warp writes sum to global memory */
-        if ( lane_id == 0 )
-        {
-            b[i] = sum;
-        }
+    /* first thread within a warp writes sum to global memory */
+    if ( lane_id == 0 )
+    {
+        b[warp_id] = sum;
+    }
+}
+
+
+/* sparse matrix, dense vector multiplication Ax = b,
+ * where one GPU thread multiplies a row
+ *
+ * A: symmetric (lower triangular portion only stored), square matrix,
+ *    stored in CSR format
+ * X: 2 dense vectors, size equal to num. columns in A
+ * B (output): 2 dense vectors, size equal to num. columns in A
+ * N: number of rows in A */
+CUDA_GLOBAL void k_dual_sparse_matvec_half_csr( int *row_ptr_start,
+        int *row_ptr_end, int *col_ind, real *vals,
+        const rvec2 * const x, rvec2 * const b, int N )
+{
+    int i, pj, si, ei;
+    rvec2 sum;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= N )
+    {
+        return;
+    }
+
+    si = row_ptr_start[i];
+    ei = row_ptr_end[i];
+    sum[0] = 0.0;
+    sum[1] = 0.0;
+
+    for ( pj = si; pj < ei; ++pj )
+    {
+        sum[0] += vals[pj] * x[col_ind[pj]][0];
+        sum[1] += vals[pj] * x[col_ind[pj]][1];
+        /* symmetric contribution (A is symmetric, lower half stored) */
+        atomicAdd( (double*) &b[col_ind[pj]][0], (double) (vals[pj] * x[i][0]) );
+        atomicAdd( (double*) &b[col_ind[pj]][1], (double) (vals[pj] * x[i][1]) );
     }
+
+    /* diagonal entry */
+    sum[0] += vals[ei] * x[i][0];
+    sum[1] += vals[ei] * x[i][1];
+
+    /* local contribution to row i for this thread */
+    atomicAdd( (double*) &b[i][0], (double) sum[0] );
+    atomicAdd( (double*) &b[i][1], (double) sum[1] );
 }
 
 
@@ -285,8 +332,8 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
 
     for ( pj = si; pj < ei; ++pj )
     {
-        sum[0] += A.val[pj] * x[ A.j[pj] ][0];
-        sum[1] += A.val[pj] * x[ A.j[pj] ][1];
+        sum[0] += A.val[pj] * x[A.j[pj]][0];
+        sum[1] += A.val[pj] * x[A.j[pj]][1];
     }
 
     b[i][0] = sum[0];
@@ -302,50 +349,50 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
  *    stored in CSR format
  * X: 2 dense vectors, size equal to num. columns in A
  * B (output): 2 dense vectors, size equal to num. columns in A
- * N: number of rows in A */
+ * n: number of rows in A */
 CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         rvec2 const * const x, rvec2 * const b, int n )
 {
-    int i, pj, thread_id, warp_id, lane_id, offset;
-    int si, ei;
-    unsigned int mask;
+    int pj, si, ei, thread_id, warp_id, lane_id, offset;
     rvec2 sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     warp_id = thread_id >> 5;
+
+    if ( warp_id >= n )
+    {
+        return;
+    }
+
     lane_id = thread_id & 0x0000001F; 
-    mask = __ballot_sync( FULL_MASK, warp_id < n );
+    si = row_ptr_start[warp_id];
+    ei = row_ptr_end[warp_id];
+    sum[0] = 0.0;
+    sum[1] = 0.0;
 
-    if ( warp_id < n )
+    /* partial sums per thread */
+    for ( pj = si + lane_id; pj < ei; pj += warpSize )
     {
-        i = warp_id;
-        si = row_ptr_start[i];
-        ei = row_ptr_end[i];
-        sum[0] = 0.0;
-        sum[1] = 0.0;
-
-        /* partial sums per thread */
-        for ( pj = si + lane_id; pj < ei; pj += warpSize )
-        {
-            sum[0] += vals[pj] * x[ col_ind[pj] ][0];
-            sum[1] += vals[pj] * x[ col_ind[pj] ][1];
-        }
+        sum[0] += vals[pj] * x[col_ind[pj]][0];
+        sum[1] += vals[pj] * x[col_ind[pj]][1];
+    }
 
-        /* warp-level reduction of partial sums
-         * using registers within a warp */
-        for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
-        {
-            sum[0] += __shfl_down_sync( mask, sum[0], offset );
-            sum[1] += __shfl_down_sync( mask, sum[1], offset );
-        }
+    /* warp-level reduction of partial sums
+     * using registers within a warp */
+    for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
+    {
+        sum[0] += __shfl_down_sync( FULL_MASK, sum[0], offset );
+        sum[1] += __shfl_down_sync( FULL_MASK, sum[1], offset );
+    }
 
-        /* first thread within a warp writes sum to global memory */
-        if ( lane_id == 0 )
-        {
-            b[i][0] = sum[0];
-            b[i][1] = sum[1];
-        }
+    __syncthreads( );
+
+    /* first thread within a warp writes sum to global memory */
+    if ( lane_id == 0 )
+    {
+        b[warp_id][0] = sum[0];
+        b[warp_id][1] = sum[1];
     }
 }
 
@@ -408,6 +455,7 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system,
             sizeof(rvec2) * n, TRUE, SAFE_ZONE,
             "Dual_Sparse_MatVec_Comm_Part1::workspace->host_scratch" );
     spad = (rvec2 *) workspace->host_scratch;
+
     copy_host_device( spad, (void *) x, sizeof(rvec2) * n,
             cudaMemcpyDeviceToHost, "Dual_Sparse_MatVec_Comm_Part1::x" );
 
@@ -439,18 +487,18 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         /* half-format requires entries of b be initialized to zero */
         cuda_memset( b, 0, sizeof(rvec2) * n, "Dual_Sparse_MatVec_local::b" );
 
-        //TODO: implement half-format dual SpMV
-//        blocks = n * 32 / DEF_BLOCK_SIZE + 
-//            (n * 32 % DEF_BLOCK_SIZE == 0 ? 0 : 1);
-
         /* 1 thread per row implementation */
-//        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
-//            ( A->start, A->end, A->j, A->val, x, b, n );
+        k_dual_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
+            ( A->start, A->end, A->j, A->val, x, b, A->n );
 
+        //TODO: implement half-format dual SpMV
+//        blocks = (A->n * 32) / DEF_BLOCK_SIZE
+//            + ((A->n * 32) % DEF_BLOCK_SIZE == 0 ? 0 : 1);
+        
         /* multiple threads per row implementation,
          * with shared memory to accumulate partial row sums */
 //        k_dual_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
-//             ( A->start, A->end, A->j, A->val, x, b, n );
+//             ( A->start, A->end, A->j, A->val, x, b, A->n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -458,15 +506,15 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
     {
         /* 1 thread per row implementation */
 //        k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>>
-//             ( *A, x, b, n );
+//             ( *A, x, b, A->n );
 
-        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+        blocks = ((A->n * 32) / DEF_BLOCK_SIZE)
+            + (((A->n * 32) % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
         
-        /* multiple threads per row implementation
+        /* 32 threads per row implementation
          * using registers to accumulate partial row sums */
         k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
-                ( A->start, A->end, A->j, A->val, x, b, n );
+                ( A->start, A->end, A->j, A->val, x, b, A->n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -498,7 +546,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
     rvec2 *spad;
 
     /* reduction required for symmetric half matrix */
-//    if ( mat_format == SYM_HALF_MATRIX )
+    if ( mat_format == SYM_HALF_MATRIX )
     {
 //#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 //        Coll( system, mpi_data, b, buf_type, mpi_type );
@@ -619,20 +667,20 @@ static void Sparse_MatVec_local( control_params const * const control,
 
     if ( A->format == SYM_HALF_MATRIX )
     {
-//        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-//            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
         /* half-format requires entries of b be initialized to zero */
         cuda_memset( b, 0, sizeof(real) * n, "Sparse_MatVec_local::b" );
 
         /* 1 thread per row implementation */
         k_sparse_matvec_half_csr <<< control->blocks, control->block_size >>>
-            ( A->start, A->end, A->j, A->val, x, b, n );
+            ( A->start, A->end, A->j, A->val, x, b, A->n );
+
+//        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+//            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
         /* multiple threads per row implementation,
          * with shared memory to accumulate partial row sums */
 //        k_sparse_matvec_half_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
-//             ( A->start, A->end, A->j, A->val, x, b, n );
+//             ( A->start, A->end, A->j, A->val, x, b, A->n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -640,15 +688,15 @@ static void Sparse_MatVec_local( control_params const * const control,
     {
         /* 1 thread per row implementation */
 //        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>>
-//             ( A->start, A->end, A->j, A->val, x, b, n );
+//             ( A->start, A->end, A->j, A->val, x, b, A->n );
 
-        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+        blocks = ((A->n * 32) / DEF_BLOCK_SIZE)
+            + (((A->n * 32) % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
         /* multiple threads per row implementation
          * using registers to accumulate partial row sums */
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
-             ( A->start, A->end, A->j, A->val, x, b, n );
+             ( A->start, A->end, A->j, A->val, x, b, A->n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -678,7 +726,7 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
     real *spad;
 
     /* reduction required for symmetric half matrix */
-//    if ( mat_format == SYM_HALF_MATRIX )
+    if ( mat_format == SYM_HALF_MATRIX )
     {
 //#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 //        Coll( system, mpi_data, b, buf_type, mpi_type );
@@ -761,8 +809,6 @@ int Cuda_dual_CG( reax_system const * const system,
     time = Get_Time( );
 #endif
 
-    matvecs = 0;
-
     Dual_Sparse_MatVec( system, control, data, workspace, mpi_data,
             H, x, system->N, workspace->d_workspace->q2 );
 
@@ -907,6 +953,10 @@ int Cuda_dual_CG( reax_system const * const system,
         Vector_Copy_To_rvec2( workspace->d_workspace->x,
                 workspace->d_workspace->s, 0, system->n );
     }
+    else
+    {
+        matvecs = 0;
+    }
 
     if ( i >= control->cm_solver_max_iters )
     {
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index cf3337fa..e13bb8b9 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -176,39 +176,29 @@ CUDA_GLOBAL void k_center_of_mass_blocks_amcm( single_body_parameters *sbp,
 
 CUDA_GLOBAL void k_compute_inertial_tensor_blocks( real *input, real *output, size_t n )
 {
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-    unsigned int i, index, xx_i, xy_i, xz_i, yy_i, yz_i, zz_i;
+    extern __shared__ real t_s[];
+    unsigned int i, index;
     int offset;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    xx_i = threadIdx.x;
-    xy_i = blockDim.x;
-    xz_i = 2 * blockDim.x;
-    yy_i = 3 * blockDim.x;
-    yz_i = 4 * blockDim.x;
-    zz_i = 5 * blockDim.x;
-
-    xx[xx_i] = 0.0;
-    xy[xy_i + threadIdx.x] = 0.0;
-    xz[xz_i + threadIdx.x] = 0.0;
-    yy[yy_i + threadIdx.x] = 0.0;
-    yz[yz_i + threadIdx.x] = 0.0;
-    zz[zz_i + threadIdx.x] = 0.0;
-
     if ( i < n )
     {
-        xx[ xx_i ] = input[ threadIdx.x * 6 ];
-        xy[ xy_i + threadIdx.x ] = input[ threadIdx.x * 6 + 1 ];
-        xz[ xz_i + threadIdx.x ] = input[ threadIdx.x * 6 + 2 ];
-        yy[ yy_i + threadIdx.x ] = input[ threadIdx.x * 6 + 3 ];
-        yz[ yz_i + threadIdx.x ] = input[ threadIdx.x * 6 + 4 ];
-        zz[ zz_i + threadIdx.x ] = input[ threadIdx.x * 6 + 5 ];
+        t_s[ 6 * i ] = input[ i * 6 ];
+        t_s[ 6 * i + 1 ] = input[ i * 6 + 1 ];
+        t_s[ 6 * i + 2 ] = input[ i * 6 + 2 ];
+        t_s[ 6 * i + 3 ] = input[ i * 6 + 3 ];
+        t_s[ 6 * i + 4 ] = input[ i * 6 + 4 ];
+        t_s[ 6 * i + 5 ] = input[ i * 6 + 5 ];
+    }
+    else
+    {
+        t_s[ 6 * i ] = 0.0;
+        t_s[ 6 * i + 1 ] = 0.0;
+        t_s[ 6 * i + 2 ] = 0.0;
+        t_s[ 6 * i + 3 ] = 0.0;
+        t_s[ 6 * i + 4 ] = 0.0;
+        t_s[ 6 * i + 5 ] = 0.0;
     }
     __syncthreads( );
 
@@ -216,25 +206,25 @@ CUDA_GLOBAL void k_compute_inertial_tensor_blocks( real *input, real *output, si
     {
         if ( threadIdx.x < offset )
         {
-            index = threadIdx.x + offset;
-            xx[ threadIdx.x ] += xx[ index ];
-            xy[ xy_i + threadIdx.x ] += xy[ xy_i + index ];
-            xz[ xz_i + threadIdx.x ] += xz[ xz_i + index ];
-            yy[ yy_i + threadIdx.x ] += yy[ yy_i + index ];
-            yz[ yz_i + threadIdx.x ] += yz[ yz_i + index ];
-            zz[ zz_i + threadIdx.x ] += zz[ zz_i + index ];
+            index = 6 * (threadIdx.x + offset);
+            t_s[ 6 * threadIdx.x ] += t_s[ index ];
+            t_s[ 6 * threadIdx.x + 1 ] += t_s[ index + 1 ];
+            t_s[ 6 * threadIdx.x + 2 ] += t_s[ index + 2 ];
+            t_s[ 6 * threadIdx.x + 3 ] += t_s[ index + 3 ];
+            t_s[ 6 * threadIdx.x + 4 ] += t_s[ index + 4 ];
+            t_s[ 6 * threadIdx.x + 5 ] += t_s[ index + 5 ];
         }
         __syncthreads( );
     }
 
     if ( threadIdx.x == 0 )
     {
-        output[0] = xx[0];
-        output[1] = xy[xy_i];
-        output[2] = xz[xz_i];
-        output[3] = xz[yy_i];
-        output[4] = xz[yz_i];
-        output[5] = xz[zz_i];
+        output[0] = t_s[0];
+        output[1] = t_s[1];
+        output[2] = t_s[2];
+        output[3] = t_s[3];
+        output[4] = t_s[4];
+        output[5] = t_s[5];
     }
 }
 
@@ -242,16 +232,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_blocks( real *input, real *output, si
 CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp,
         reax_atom *atoms, real *t_g, real xcm0, real xcm1, real xcm2, size_t n )
 {
-    extern __shared__ real xx_s[];
-    extern __shared__ real xy_s[];
-    unsigned int xy_i, i, index, mask;
+    extern __shared__ real xx_xy_s[];
+    unsigned int i, index, mask;
     int offset;
     real xx, xy, m;
     rvec diff, xcm;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
     mask = __ballot_sync( FULL_MASK, i < n );
-    xy_i = blockDim.x;
     xcm[0] = xcm0;
     xcm[1] = xcm1;
     xcm[2] = xcm2;
@@ -273,8 +261,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp,
         /* first thread within a warp writes warp-level sum to shared memory */
         if ( threadIdx.x % warpSize == 0 )
         {
-            xx_s[threadIdx.x >> 5] = xx;    
-            xy_s[threadIdx.x >> 5] = xy;    
+            xx_xy_s[2 * (threadIdx.x >> 5)] = xx;    
+            xx_xy_s[2 * (threadIdx.x >> 5) + 1] = xy;    
         }
     }
     __syncthreads( );
@@ -284,9 +272,9 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp,
     {
         if ( threadIdx.x < offset )
         {
-            index = threadIdx.x + offset;
-            xx_s[ threadIdx.x ] += xx_s[ index ];
-            xy_s[ xy_i + threadIdx.x ] += xy_s[ xy_i + index ];
+            index = 2 * (threadIdx.x + offset);
+            xx_xy_s[ 2 * threadIdx.x ] += xx_xy_s[ index ];
+            xx_xy_s[ 2 * threadIdx.x + 1 ] += xx_xy_s[ index + 1 ];
         }
         __syncthreads( );
     }
@@ -295,8 +283,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp,
      * of the reduction back to global memory */
     if ( threadIdx.x == 0 )
     {
-        t_g[ blockIdx.x * 6 ] = xx_s[ 0 ];
-        t_g[ blockIdx.x * 6 + 1 ] = xy_s[ xy_i ];
+        t_g[ blockIdx.x * 6 ] = xx_xy_s[ 0 ];
+        t_g[ blockIdx.x * 6 + 1 ] = xx_xy_s[ 1 ];
     }
 }
 
@@ -304,16 +292,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xx_xy( single_body_parameters *sbp,
 CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp,
         reax_atom *atoms, real *t_g, real xcm0, real xcm1, real xcm2, size_t n )
 {
-    extern __shared__ real xz_s[];
-    extern __shared__ real yy_s[];
-    unsigned int yy_i, i, index, mask;
+    extern __shared__ real xz_yy_s[];
+    unsigned int i, index, mask;
     int offset;
     real xz, yy, m;
     rvec diff, xcm;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
     mask = __ballot_sync( FULL_MASK, i < n );
-    yy_i = blockDim.x;
     xcm[0] = xcm0;
     xcm[1] = xcm1;
     xcm[2] = xcm2;
@@ -335,8 +321,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp,
         /* first thread within a warp writes warp-level sum to shared memory */
         if ( threadIdx.x % warpSize == 0 )
         {
-            xz_s[threadIdx.x >> 5] = xz;    
-            yy_s[threadIdx.x >> 5] = yy;    
+            xz_yy_s[2 * (threadIdx.x >> 5)] = xz;    
+            xz_yy_s[2 * (threadIdx.x >> 5) + 1] = yy;    
         }
     }
     __syncthreads( );
@@ -346,9 +332,9 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp,
     {
         if ( threadIdx.x < offset )
         {
-            index = threadIdx.x + offset;
-            xz_s[ threadIdx.x ] += xz_s[ index ];
-            yy_s[ yy_i + threadIdx.x ] += yy_s[ yy_i + index ];
+            index = 2 * (threadIdx.x + offset);
+            xz_yy_s[ 2 * threadIdx.x ] += xz_yy_s[ index ];
+            xz_yy_s[ 2 * threadIdx.x + 1 ] += xz_yy_s[ index + 1 ];
         }
         __syncthreads( );
     }
@@ -357,8 +343,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp,
      * of the reduction back to global memory */
     if ( threadIdx.x == 0 )
     {
-        t_g[ blockIdx.x * 6 + 2 ] = xz_s[ 0 ];
-        t_g[ blockIdx.x * 6 + 3 ] = yy_s[ yy_i ];
+        t_g[ blockIdx.x * 6 + 2 ] = xz_yy_s[ 0 ];
+        t_g[ blockIdx.x * 6 + 3 ] = xz_yy_s[ 1 ];
     }
 }
 
@@ -366,16 +352,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_xz_yy( single_body_parameters *sbp,
 CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp,
         reax_atom *atoms, real *t_g, real xcm0, real xcm1, real xcm2, size_t n )
 {
-    extern __shared__ real yz_s[];
-    extern __shared__ real zz_s[];
-    unsigned int i, zz_i, index, mask;
+    extern __shared__ real yz_zz_s[];
+    unsigned int i, index, mask;
     int offset;
     real yz, zz, m;
     rvec diff, xcm;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
     mask = __ballot_sync( FULL_MASK, i < n );
-    zz_i = blockDim.x;
     xcm[0] = xcm0;
     xcm[1] = xcm1;
     xcm[2] = xcm2;
@@ -397,8 +381,8 @@ CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp,
         /* first thread within a warp writes warp-level sum to shared memory */
         if ( threadIdx.x % warpSize == 0 )
         {
-            yz_s[threadIdx.x >> 5] = yz;    
-            zz_s[threadIdx.x >> 5] = zz;    
+            yz_zz_s[2 * (threadIdx.x >> 5)] = yz;    
+            yz_zz_s[2 * (threadIdx.x >> 5)] = zz;    
         }
     }
     __syncthreads( );
@@ -408,9 +392,9 @@ CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp,
     {
         if ( threadIdx.x < offset )
         {
-            index = threadIdx.x + offset;
-            yz_s[ threadIdx.x ] += yz_s[ index ];
-            zz_s[ zz_i + threadIdx.x ] += zz_s[ zz_i + index ];
+            index = 2 * (threadIdx.x + offset);
+            yz_zz_s[ 2 * threadIdx.x ] += yz_zz_s[ index ];
+            yz_zz_s[ 2 * threadIdx.x + 1 ] += yz_zz_s[ index + 1 ];
         }
         __syncthreads( );
     }
@@ -419,14 +403,14 @@ CUDA_GLOBAL void k_compute_inertial_tensor_yz_zz( single_body_parameters *sbp,
      * of the reduction back to global memory */
     if ( threadIdx.x == 0 )
     {
-        t_g[ blockIdx.x * 6 + 4 ] = yz_s[ 0 ];
-        t_g[ blockIdx.x * 6 + 5 ] = zz_s[ zz_i ];
+        t_g[ blockIdx.x * 6 + 4 ] = yz_zz_s[ 0 ];
+        t_g[ blockIdx.x * 6 + 5 ] = yz_zz_s[ 1 ];
     }
 }
 
 
 CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n )
+        real *results, int n )
 {
     extern __shared__ real M_s[];
     unsigned int i, mask;
@@ -464,13 +448,13 @@ CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *m
 
     if ( threadIdx.x == 0 )
     {
-        block_results[blockIdx.x] = M_s[0];
+        results[blockIdx.x] = M_s[0];
     }
 }
 
 
 CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n )
+        real *results, int n )
 {
     extern __shared__ real e_kin_s[];
     unsigned int i, mask;
@@ -515,7 +499,7 @@ CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_ato
      * of the reduction back to global memory */
     if ( threadIdx.x == 0 )
     {
-        block_results[blockIdx.x] = e_kin_s[0];
+        results[blockIdx.x] = e_kin_s[0];
     }
 }
 
@@ -581,63 +565,66 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
     rvec *spad;
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(rvec) * (control->blocks_pow_2 + 1),
+            sizeof(rvec) * (control->blocks + 1),
             "Cuda_Compute_Momentum::workspace->scratch" );
     spad = (rvec *) workspace->scratch;
 
     // xcm
-    cuda_memset( spad, 0, sizeof(rvec) * (control->blocks_pow_2 + 1),
-            "Cuda_Compute_Momentum::tmp" );
+    cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1),
+            "Cuda_Compute_Momentum::spad" );
     
     k_center_of_mass_blocks_xcm <<< control->blocks, control->block_size,
-                                sizeof(rvec) * control->block_size >>>
+                                sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
     
-    k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
-            ( spad, &spad[control->blocks_pow_2], control->blocks );
+    k_reduction_rvec <<< 1, control->blocks_pow_2,
+                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+            ( spad, &spad[control->blocks], control->blocks );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    copy_host_device( xcm, &spad[control->blocks_pow_2],
-            sizeof(rvec), cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::xcm" );
+    copy_host_device( xcm, &spad[control->blocks], sizeof(rvec),
+            cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::xcm" );
     
     // vcm
-    cuda_memset( spad, 0, sizeof(rvec) * (control->blocks_pow_2 + 1),
-            "Cuda_Compute_Momentum::tmp" );
+    cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1),
+            "Cuda_Compute_Momentum::spad" );
     
     k_center_of_mass_blocks_vcm <<< control->blocks, control->block_size,
-                                sizeof(rvec) * control->block_size >>>
+                                sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
     
-    k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
-        ( spad, &spad[control->blocks_pow_2], control->blocks );
+    k_reduction_rvec <<< 1, control->blocks_pow_2,
+                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+        ( spad, &spad[control->blocks], control->blocks );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    copy_host_device( vcm, &spad[control->blocks_pow_2], sizeof(rvec),
+    copy_host_device( vcm, &spad[control->blocks], sizeof(rvec),
         cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::vcm" );
     
     // amcm
-    cuda_memset( spad, 0,  sizeof (rvec) * (control->blocks_pow_2 + 1),
-            "Cuda_Compute_Momentum::tmp");
+    cuda_memset( spad, 0,  sizeof(rvec) * (control->blocks + 1),
+            "Cuda_Compute_Momentum::spad");
     
     k_center_of_mass_blocks_amcm <<< control->blocks, control->block_size,
-                                 sizeof(rvec) * control->block_size >>>
+                                 sizeof(rvec) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad, system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
     
-    k_reduction_rvec <<< 1, control->blocks_pow_2, sizeof(rvec) * control->blocks_pow_2 >>>
-        ( spad, &spad[control->blocks_pow_2], control->blocks );
+    k_reduction_rvec <<< 1, control->blocks_pow_2,
+                     sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
+        ( spad, &spad[control->blocks], control->blocks );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    copy_host_device( amcm, &spad[control->blocks_pow_2], sizeof(rvec),
-        cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::amcm" );
+    copy_host_device( amcm, &spad[control->blocks], sizeof(rvec),
+        cudaMemcpyDeviceToHost,"Cuda_Compute_Momentum::amcm" );
 }
 
 
@@ -647,28 +634,28 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
     real *spad;
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(real) * 6 * (control->blocks_pow_2 + 1),
+            sizeof(real) * 6 * (control->blocks + 1),
             "Cuda_Compute_Inertial_Tensor::workspace->scratch" );
     spad = (real *) workspace->scratch;
-    cuda_memset( spad, 0, sizeof(real) * 6 * (control->blocks_pow_2 + 1),
+    cuda_memset( spad, 0, sizeof(real) * 6 * (control->blocks + 1),
             "Cuda_Compute_Intertial_Tensor::tmp" );
 
     k_compute_inertial_tensor_xx_xy <<< control->blocks, control->block_size,
-                                sizeof(real) * 2 * control->block_size >>>
+                                sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_compute_inertial_tensor_xz_yy <<< control->blocks, control->block_size,
-                                sizeof(real) * 2 * control->block_size >>>
+                                sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
     k_compute_inertial_tensor_yz_zz <<< control->blocks, control->block_size,
-                                sizeof(real) * 2 * control->block_size >>>
+                                sizeof(real) * 2 * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad,
           my_xcm[0], my_xcm[1], my_xcm[2], system->n );
     cudaDeviceSynchronize( );
@@ -677,11 +664,11 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
     /* reduction of block-level partial sums for inertial tensor */
     k_compute_inertial_tensor_blocks <<< 1, control->blocks_pow_2,
                               sizeof(real) * 6 * control->blocks_pow_2 >>>
-        ( spad, &spad[control->blocks_pow_2 * 6], control->blocks );
+        ( spad, &spad[6 * control->blocks], control->blocks );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
-    copy_host_device( t, &spad[6 * control->blocks_pow_2],
+    copy_host_device( t, &spad[6 * control->blocks],
         sizeof(real) * 6, cudaMemcpyDeviceToHost,
         "Cuda_Compute_Intertial_Tensor::t" );
 }
@@ -712,26 +699,27 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system,
     real *block_energy;
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(real) * (control->blocks_pow_2 + 1),
+            sizeof(real) * (control->blocks + 1),
             "Cuda_Compute_Kinetic_Energy::workspace->scratch" );
     block_energy = (real *) workspace->scratch;
-    cuda_memset( block_energy, 0, sizeof(real) * (control->blocks_pow_2 + 1),
+    cuda_memset( block_energy, 0, sizeof(real) * (control->blocks + 1),
             "Cuda_Compute_Kinetic_Energy::tmp" );
 
     data->my_en.e_kin = 0.0;
 
     k_compute_kinetic_energy <<< control->blocks, control->block_size,
-                             sizeof(real) * control->block_size >>>
+                             sizeof(real) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* note: above kernel sums the kinetic energy contribution within blocks,
      * and this call finishes the global reduction across all blocks */
-    Cuda_Reduction_Sum( block_energy, &block_energy[control->blocks_pow_2], control->blocks_pow_2 );
+    Cuda_Reduction_Sum( block_energy, &block_energy[control->blocks], control->blocks );
 
-    copy_host_device( &data->my_en.e_kin, &block_energy[control->blocks_pow_2],
-            sizeof(real), cudaMemcpyDeviceToHost, "Cuda_Compute_Kinetic_Energy::tmp" );
+    copy_host_device( &data->my_en.e_kin, &block_energy[control->blocks],
+            sizeof(real), cudaMemcpyDeviceToHost,
+            "Cuda_Compute_Kinetic_Energy::tmp" );
 
     ret = MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin,
             1, MPI_DOUBLE, MPI_SUM, comm );
@@ -754,24 +742,23 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control,
     real M_l, *spad_real;
 
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
-            sizeof(real) * (control->blocks_pow_2 + 1),
+            sizeof(real) * (control->blocks + 1),
             "Cuda_Compute_Total_Mass::workspace->scratch" );
     spad_real = (real *) workspace->scratch;
-    cuda_memset( spad_real, 0, sizeof(real) * (control->blocks_pow_2 + 1),
+    cuda_memset( spad_real, 0, sizeof(real) * (control->blocks + 1),
             "Cuda_Compute_Total_Mass::spad_real" );
 
     k_compute_total_mass <<< control->blocks, control->block_size,
-                         sizeof(real) * control->block_size >>>
+                         sizeof(real) * (control->block_size / 32) >>>
         ( system->reax_param.d_sbp, system->d_my_atoms, spad_real, system->n );
     cudaDeviceSynchronize( );
     cudaCheckError( );
 
     /* note: above kernel sums the mass contribution within blocks,
      * and this call finishes the global reduction across all blocks */
-    Cuda_Reduction_Sum( spad_real, &spad_real[control->blocks_pow_2],
-            control->blocks_pow_2 );
+    Cuda_Reduction_Sum( spad_real, &spad_real[control->blocks], control->blocks );
 
-    copy_host_device( &M_l, &spad_real[control->blocks_pow_2], sizeof(real), 
+    copy_host_device( &M_l, &spad_real[control->blocks], sizeof(real), 
             cudaMemcpyDeviceToHost, "total_mass:M_l" );
 
     ret = MPI_Allreduce( &M_l, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
@@ -916,20 +903,21 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control,
               system->n );
 
         k_reduction_rvec <<< control->blocks, control->block_size,
-                         sizeof(rvec) * control->block_size >>>
+                         sizeof(rvec) * (control->block_size / 32) >>>
             ( rvec_spad, &rvec_spad[system->n],  system->n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
 
         k_reduction_rvec <<< 1, control->blocks_pow_2,
-                         sizeof(rvec) * control->blocks_pow_2 >>>
+                         sizeof(rvec) * (control->blocks_pow_2 / 32) >>>
             ( &rvec_spad[system->n], &rvec_spad[system->n + control->blocks],
               control->blocks );
         cudaDeviceSynchronize( );
         cudaCheckError( );
 
-        copy_host_device( &int_press, &rvec_spad[system->n + control->blocks], sizeof(rvec), 
-                cudaMemcpyDeviceToHost, "Cuda_Compute_Pressure::d_int_press" );
+        copy_host_device( &int_press, &rvec_spad[system->n + control->blocks],
+                sizeof(rvec), cudaMemcpyDeviceToHost,
+                "Cuda_Compute_Pressure::int_press" );
     }
 
     /* sum up internal and external pressure */
diff --git a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index f7fdca09..e7911945 100644
--- a/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -454,7 +454,7 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
                                     atomic_rvecAdd( pbond_ij->ta_f, force );
                                     rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data_ext_press [j], ext_press );
+                                    rvec_Add( data_ext_press[j], ext_press );
 
                                     rvec_ScaledAdd( workspace.f[j], 
                                             CEtors7 + CEconj4, p_ijk->dcos_dj );
@@ -471,18 +471,18 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
                                     atomic_rvecAdd( pbond_jk->ta_f, force );
                                     rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press [j], ext_press );
+                                    rvec_Add( data_ext_press[j], ext_press );
 
                                     rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
                                     rvec_Add( pbond_kl->ta_f, force );
                                     rvec_iMultiply( ext_press, rel_box_jl, force );
-                                    rvec_Add( data_ext_press [j], ext_press );
+                                    rvec_Add( data_ext_press[j], ext_press );
 
                                     /* dcos_omega */                      
                                     rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
                                     atomic_rvecAdd( pbond_ij->ta_f, force );
                                     rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data_ext_press [j], ext_press );
+                                    rvec_Add( data_ext_press[j], ext_press );
 
                                     rvec_ScaledAdd( workspace.f[j], 
                                             CEtors9 + CEconj6, dcos_omega_dj );
@@ -490,12 +490,12 @@ CUDA_GLOBAL void Cuda_Torsion_Angles_Part1( reax_atom *my_atoms, global_paramete
                                     rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
                                     rvec_Add( pbond_jk->ta_f, force );
                                     rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data_ext_press [j], ext_press );
+                                    rvec_Add( data_ext_press[j], ext_press );
 
                                     rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
                                     rvec_Add( pbond_kl->ta_f, force );
                                     rvec_iMultiply( ext_press, rel_box_jl, force );
-                                    rvec_Add( data_ext_press [j], ext_press );
+                                    rvec_Add( data_ext_press[j], ext_press );
                                 }
 
 #if defined(TEST_ENERGY)
-- 
GitLab