diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index baec5a9f8ca3570709cb9362c4e7f9cdb1e601bb..7379596ffeaa83061704997a0136e746b75aff0d 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -335,10 +335,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 
         /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
-        q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
+        atom->q = workspace->x[i][0] - u * workspace->x[i][1];
 #else
-        q[i] = atom->q = workspace->s[i] - u * workspace->t[i];
+        atom->q = workspace->s[i] - u * workspace->t[i];
 #endif
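+        /* copy the final charge into the output array q */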
+        q[i] = atom->q;
 
         /* update previous solutions in s & t */
         atom->s[3] = atom->s[2];
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index eeee26df7024798abd7d0a8d6355021d88f541da..4128585242230a9b46c9ce65f919bbe45f4bec1a 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -364,8 +364,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 {
     int total_real, total_rvec, local_rvec;
 
-    workspace->allocated = TRUE;
-
     total_real = sizeof(real) * total_cap;
     total_rvec = sizeof(rvec) * total_cap;
     local_rvec = sizeof(rvec) * local_cap;
@@ -515,13 +513,6 @@ void Cuda_Allocate_Workspace( reax_system *system, control_params *control,
 
 void Cuda_Deallocate_Workspace( control_params *control, storage *workspace )
 {
-    if ( workspace->allocated == FALSE )
-    {
-        return;
-    }
-
-    workspace->allocated = FALSE;
-
     /* bond order related storage  */
     cuda_free( workspace->total_bond_order, "total_bo" );
     cuda_free( workspace->Deltap, "Deltap" );
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 93884ab48c9b427572e1b2ec0a717840b1d0e4d3..f9c34d87c950fa83de880cc674d8025ba7551c69 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -58,13 +58,13 @@ static void jacobi( reax_system const * const system,
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE
-        + (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
 
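+    /* one thread per atom fills its diagonal (Jacobi) preconditioner entry */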
     k_jacobi <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -245,8 +245,8 @@ static void Spline_Extrapolate_Charges_QEq( reax_system const * const system,
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           (control_params *)control->d_control_params,
           *(workspace->d_workspace), system->n );
-    cudaDeviceSynchronize();
-    cudaCheckError();
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -376,6 +376,7 @@ CUDA_GLOBAL void k_extrapolate_charges_qeq_part2( reax_atom *my_atoms,
         return;
     }
 
+    /* compute charge based on s & t */
 #if defined(DUAL_SOLVER)
     my_atoms[i].q = workspace.x[i][0] - u * workspace.x[i][1];
 #else
@@ -525,7 +526,8 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 #endif
 
     /* global reduction on pseudo-charges for s and t */
-    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    ret = MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE,
+            MPI_SUM, MPI_COMM_WORLD );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
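+    /* u enforces net charge neutrality: sum(q) = sum(s) - u * sum(t) = 0 */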
     u = all_sum[0] / all_sum[1];
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index b0e7f70c15733a604d449252aea81bb423dcc100..fdb63082f0b21cd7fec7942581d50f2f3c45d899 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -294,7 +294,17 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_csr( sparse_matrix A,
 }
 
 
-CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A,
+/* sparse matrix, dense vector multiplication AX = B,
+ * where warps of 32 threads
+ * collaborate to multiply each row
+ *
+ * A: symmetric, full, square matrix, stored in CSR format
+ *    as the arrays row_ptr_start, row_ptr_end, col_ind, and vals
+ * X: 2 dense vectors, size equal to num. columns in A
+ * B (output): 2 dense vectors, size equal to num. rows in A
+ * n: number of rows in A */
+CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( int *row_ptr_start,
+        int *row_ptr_end, int *col_ind, real *vals,
         rvec2 const * const x, rvec2 * const b, int n )
 {
     int i, pj, thread_id, warp_id, lane_id, offset;
@@ -310,23 +320,27 @@ CUDA_GLOBAL void k_dual_sparse_matvec_full_opt_csr( sparse_matrix A,
     if ( warp_id < n )
     {
         i = warp_id;
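+        /* row i occupies entries [si, ei) of col_ind and vals */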
+        si = row_ptr_start[i];
+        ei = row_ptr_end[i];
         sum[0] = 0.0;
         sum[1] = 0.0;
-        si = A.start[i];
-        ei = A.end[i];
 
+        /* partial sums per thread */
         for ( pj = si + lane_id; pj < ei; pj += warpSize )
         {
-            sum[0] += A.val[pj] * x[ A.j[pj] ][0];
-            sum[1] += A.val[pj] * x[ A.j[pj] ][1];
+            sum[0] += vals[pj] * x[ col_ind[pj] ][0];
+            sum[1] += vals[pj] * x[ col_ind[pj] ][1];
         }
 
+        /* warp-level reduction of partial sums
+         * via register shuffles */
         for ( offset = warpSize >> 1; offset > 0; offset /= 2 )
         {
             sum[0] += __shfl_down_sync( mask, sum[0], offset );
             sum[1] += __shfl_down_sync( mask, sum[1], offset );
         }
 
+        /* first thread within a warp writes sum to global memory */
         if ( lane_id == 0 )
         {
             b[i][0] = sum[0];
@@ -418,7 +432,7 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         sparse_matrix const * const A, rvec2 const * const x,
         rvec2 * const b, int n )
 {
-//    int blocks;
+    int blocks;
 
     if ( A->format == SYM_HALF_MATRIX )
     {
@@ -445,11 +459,14 @@ static void Dual_Sparse_MatVec_local( control_params const * const control,
         /* 1 thread per row implementation */
 //        k_dual_sparse_matvec_full_csr <<< control->blocks_n, control->blocks_size_n >>>
 //             ( *A, x, b, n );
+
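+        /* one warp (32 threads) is assigned per matrix row */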
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
         
-        /* one warp per row implementation,
-         * with shared memory to accumulate partial row sums */
-        k_dual_sparse_matvec_full_opt_csr <<< control->blocks_n, control->block_size_n >>>
-                ( *A, x, b, n );
+        /* multiple threads per row implementation
+         * using registers to accumulate partial row sums */
+        k_dual_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
+                ( A->start, A->end, A->j, A->val, x, b, n );
         cudaDeviceSynchronize( );
         cudaCheckError( );
     }
@@ -621,13 +638,13 @@ static void Sparse_MatVec_local( control_params const * const control,
     }
     else if ( A->format == SYM_FULL_MATRIX )
     {
-        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
-            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
         /* 1 thread per row implementation */
 //        k_sparse_matvec_full_csr <<< control->blocks, control->blocks_size >>>
 //             ( A->start, A->end, A->j, A->val, x, b, n );
 
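+        /* one warp (32 threads) is assigned per matrix row */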
+        blocks = (A->n * 32 / DEF_BLOCK_SIZE)
+            + ((A->n * 32 % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
         /* multiple threads per row implementation
          * using registers to accumulate partial row sums */
         k_sparse_matvec_full_opt_csr <<< blocks, DEF_BLOCK_SIZE >>>
@@ -672,12 +689,12 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
                 "Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (real *) workspace->host_scratch;
         copy_host_device( spad, b, sizeof(real) * n1,
-                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::b" );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
         copy_host_device( spad, b, sizeof(real) * n2,
-                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::q" );
+                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::b" );
 //#endif
     }
 }
@@ -939,8 +956,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 
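+    /* local partial dot products: (r, d), (d, d), and (b, b) */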
     redux[0] = Dot_local( workspace, workspace->d_workspace->r,
             workspace->d_workspace->d, system->n );
-    redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-            workspace->d_workspace->r, system->n );
+    redux[1] = Dot_local( workspace, workspace->d_workspace->d,
+            workspace->d_workspace->d, system->n );
     redux[2] = Dot_local( workspace, b, b, system->n );
 
 #if defined(LOG_PERFORMANCE)
@@ -988,8 +1005,8 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont
 
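+        /* local partial dot products: (r, p) and (p, p) */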
         redux[0] = Dot_local( workspace, workspace->d_workspace->r,
                 workspace->d_workspace->p, system->n );
-        redux[1] = Dot_local( workspace, workspace->d_workspace->r,
-                workspace->d_workspace->r, system->n );
+        redux[1] = Dot_local( workspace, workspace->d_workspace->p,
+                workspace->d_workspace->p, system->n );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );