diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index 6d6f5aec22d3fe984d80863de630b4cf796f1eab..43ea11bf25b6a09a9da27b9c3c4dea991b18c0aa 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -803,7 +803,10 @@ CUDA_GLOBAL void k_estimate_storages_cm_half( reax_atom *my_atoms,
     __syncthreads( );
 
     cm_entries[i] = num_cm_entries;
-    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+    /* round up to the nearest multiple of 32 so that reads along
+     * rows can be coalesced by the one-warp-per-row SpMV kernels */
+    max_cm_entries[i] = MAX( ((int) CEIL( num_cm_entries * SAFE_ZONE )
+                + 0x0000001F) & 0xFFFFFFE0, MIN_CM_ENTRIES );
 }
 
 
@@ -844,7 +847,10 @@ CUDA_GLOBAL void k_estimate_storages_cm_full( control_params *control,
     __syncthreads( );
 
     cm_entries[i] = num_cm_entries;
-    max_cm_entries[i] = MAX( (int) CEIL(num_cm_entries * SAFE_ZONE), MIN_CM_ENTRIES );
+    /* round up to the nearest multiple of 32 so that reads along
+     * rows can be coalesced by the one-warp-per-row SpMV kernels */
+    max_cm_entries[i] = MAX( ((int) CEIL( num_cm_entries * SAFE_ZONE )
+                + 0x0000001F) & 0xFFFFFFE0, MIN_CM_ENTRIES );
 }
 
 
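Note on the hunks above: the new expression is the usual round-up-to-a-multiple-of-32 idiom, where adding 0x1F (31) and then clearing the five low bits with the mask 0xFFFFFFE0 yields the smallest multiple of 32 greater than or equal to the estimate. A minimal host-side sketch of the same arithmetic (the function name is illustrative, not part of the patch):

#include <assert.h>

/* round a non-negative count up to the nearest multiple of 32,
 * mirroring the (x + 0x0000001F) & 0xFFFFFFE0 expression above */
static int round_up_to_warp_multiple( int x )
{
    return (x + 0x0000001F) & 0xFFFFFFE0;
}

int main( void )
{
    assert( round_up_to_warp_multiple( 0 ) == 0 );
    assert( round_up_to_warp_multiple( 1 ) == 32 );
    assert( round_up_to_warp_multiple( 32 ) == 32 );
    assert( round_up_to_warp_multiple( 33 ) == 64 );
    return 0;
}

The MIN_* capacity constants bumped in reax_types.h below follow the same convention: each new value is itself a multiple of 32.
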
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 3a6547ffba774f70455aa254cef232957bbe238c..f7ed486b5b9ce16d337db474033abbc290bd43ff 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -187,8 +187,8 @@ CUDA_GLOBAL void k_sparse_matvec_half_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         const real * const x, real * const b, int N )
 {
-    int pj, si, ei, thread_id, warp_id, lane_id, offset;
-    real sum;
+    int pj, si, ei, thread_id, warp_id, lane_id, offset, itr, col_ind_l;
+    real vals_l, sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -201,24 +201,31 @@ CUDA_GLOBAL void k_sparse_matvec_half_opt_csr( int *row_ptr_start,
     lane_id = thread_id & 0x0000001F; 
     si = row_ptr_start[warp_id];
     ei = row_ptr_end[warp_id];
-
-    /* A symmetric, upper triangular portion stored
-     * => diagonal only contributes once */
-    if ( lane_id == 0 )
-    {
-        sum = vals[si] * x[warp_id];
-    }
-    else
-    {
-        sum = 0.0;
-    }
+    sum = 0.0;
 
     /* partial sums per thread */
-    for ( pj = si + lane_id + 1; pj < ei; pj += warpSize )
+    for ( itr = 0, pj = si + lane_id; itr < (ei - si + 0x0000001F) >> 5; ++itr )
     {
-        sum += vals[pj] * x[col_ind[pj]];
-        /* symmetric contribution to row j */
-        atomicAdd( (double *) &b[col_ind[pj]], (double) (vals[pj] * x[warp_id]) );
+        /* coalesced reads from global memory; rows are padded to a multiple of 32 entries */
+        vals_l = vals[pj];
+        col_ind_l = col_ind[pj];
+
+        /* only lanes whose position maps to a stored non-zero accumulate the result */
+        if ( pj < ei )
+        {
+            /* gather on x from global memory and compute partial sum for this non-zero entry */
+            sum += vals_l * x[col_ind_l];
+
+            /* A is symmetric and only its upper triangle is stored,
+             * so the diagonal entry (pj == si) contributes only once */
+            if ( pj > si )
+            {
+                /* symmetric contribution to row j */
+                atomicAdd( (double *) &b[col_ind_l], (double) (vals_l * x[warp_id]) );
+            }
+        }
+
+        pj += warpSize;
     }
 
     /* warp-level reduction of partial sums
@@ -285,8 +292,8 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
         int *row_ptr_end, int *col_ind, real *vals,
         const real * const x, real * const b, int n )
 {
-    int pj, si, ei, thread_id, warp_id, lane_id, offset;
-    real sum;
+    int pj, si, ei, thread_id, warp_id, lane_id, offset, itr, col_ind_l;
+    real vals_l, sum;
 
     thread_id = blockDim.x * blockIdx.x + threadIdx.x;
     warp_id = thread_id >> 5;
@@ -302,9 +309,20 @@ CUDA_GLOBAL void k_sparse_matvec_full_opt_csr( int *row_ptr_start,
     sum = 0.0;
 
     /* partial sums per thread */
-    for ( pj = si + lane_id; pj < ei; pj += warpSize )
+    for ( itr = 0, pj = si + lane_id; itr < (ei - si + 0x0000001F) >> 5; ++itr )
     {
-        sum += vals[pj] * x[col_ind[pj]];
+        /* coalesced reads from global memory; rows are padded to a multiple of 32 entries */
+        vals_l = vals[pj];
+        col_ind_l = col_ind[pj];
+
+        /* only lanes whose position maps to a stored non-zero accumulate the result */
+        if ( pj < ei )
+        {
+            /* gather on x from global memory and compute partial sum for this non-zero entry */
+            sum += vals_l * x[col_ind_l];
+        }
+
+        pj += warpSize;
     }
 
     /* warp-level reduction of partial sums
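
Note on the two SpMV hunks above: the restructured loops trade the data-dependent bound (pj < ei) for a trip count that is uniform across the warp, so every lane performs the same number of iterations and issues its loads unconditionally (which the row padding introduced in cuda_forces.cu makes safe); only the accumulation is guarded by pj < ei. Below is a minimal standalone sketch of the same one-warp-per-row pattern for a plain (non-symmetric) CSR matrix, assuming double-precision values, a block size that is a multiple of 32, and a shuffle-based reduction; the kernel and variable names are illustrative, not taken from the file.

/* one warp per row of a CSR matrix; assumes each row's capacity has been
 * padded to a multiple of 32 entries, as done in cuda_forces.cu above */
__global__ void k_spmv_warp_per_row_sketch( const int * const row_ptr_start,
        const int * const row_ptr_end, const int * const col_ind,
        const double * const vals, const double * const x,
        double * const b, int n )
{
    int thread_id, row, lane_id, si, ei, pj, itr, offset, col_ind_l;
    double vals_l, sum;

    thread_id = blockDim.x * blockIdx.x + threadIdx.x;
    row = thread_id >> 5;
    lane_id = thread_id & 0x0000001F;

    /* all 32 lanes of a warp share the same row, so this return is warp-uniform */
    if ( row >= n )
    {
        return;
    }

    si = row_ptr_start[row];
    ei = row_ptr_end[row];
    sum = 0.0;

    /* warp-uniform trip count: ceil( (ei - si) / 32 ) iterations per lane */
    for ( itr = 0, pj = si + lane_id; itr < ((ei - si + 0x0000001F) >> 5); ++itr )
    {
        /* unconditional, coalesced loads; padding keeps pj inside the row's storage */
        vals_l = vals[pj];
        col_ind_l = col_ind[pj];

        /* only lanes sitting on stored non-zeros contribute */
        if ( pj < ei )
        {
            sum += vals_l * x[col_ind_l];
        }

        pj += warpSize;
    }

    /* warp-level reduction of the 32 per-lane partial sums */
    for ( offset = warpSize >> 1; offset > 0; offset >>= 1 )
    {
        sum += __shfl_down_sync( 0xFFFFFFFF, sum, offset );
    }

    if ( lane_id == 0 )
    {
        b[row] = sum;
    }
}

Keeping the loads unconditional is what allows them to coalesce across the warp; the shuffle reduction then collapses the per-lane partial sums so that lane 0 can write the row's result.
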
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index a0c1c88e245d4dc11bf0660c9630d6d474ecc6df..194970ac95f2ee18781c628fa6307075f3d26d20 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -176,19 +176,19 @@
 #define HB_THRESHOLD (1.0e-2)
 
 /* minimum capacity (entries) in various interaction lists */
-#define MIN_CAP (50)
+#define MIN_CAP (64)
 /* minimum number of interactions per entry in the far neighbor list */
-#define MIN_NBRS (100)
+#define MIN_NBRS (96)
 /* minimum number of non-zero entries per row in the charge matrix */
-#define MIN_CM_ENTRIES (100)
+#define MIN_CM_ENTRIES (96)
 /* minimum number of interactions per entry in the bond list */
-#define MIN_BONDS (15)
+#define MIN_BONDS (32)
 /* minimum number of interactions per entry in the hydrogen bond list */
-#define MIN_HBONDS (25)
+#define MIN_HBONDS (32)
 /* minimum capacity (entries) in 3-body interaction list */
-#define MIN_3BODIES (1000)
+#define MIN_3BODIES (1024)
 /* minimum number of atoms per grid cell */
-#define MIN_GCELL_POPL (50)
+#define MIN_GCELL_POPL (64)
 /* ??? */
 #define MIN_SEND (100)
 /* over-allocation factor for various data structures */