diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 6688eb854b91e42efb7e511295b9b133a00d5ac8..10ab72c6bbd423de03e0cf8b68bdcff4b78a0057 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -473,15 +473,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
     rvec2 my_sum, all_sum;
 #if defined(DUAL_SOLVER)
     int blocks;
-    rvec2 *spad_rvec2;
+    rvec2 *spad;
 #else
     real *spad;
 #endif
 
-    check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
-            sizeof(real) * system->n, TRUE, SAFE_ZONE,
-            "Calculate_Charges_QEq::workspace->host_scratch" );
-    q = (real *) workspace->host_scratch;
 #if defined(DUAL_SOLVER)
     blocks = system->n / DEF_BLOCK_SIZE
         + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
@@ -489,22 +485,22 @@ static void Calculate_Charges_QEq( reax_system const * const system,
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
             sizeof(rvec2) * (blocks + 1),
             "Calculate_Charges_QEq::workspace->scratch" );
-    spad_rvec2 = (rvec2 *) workspace->scratch;
-    cuda_memset( spad_rvec2, 0, sizeof(rvec2) * (blocks + 1),
-            "Calculate_Charges_QEq::spad_rvec2" );
+    spad = (rvec2 *) workspace->scratch;
+    cuda_memset( spad, 0, sizeof(rvec2) * (blocks + 1),
+            "Calculate_Charges_QEq::spad" );
 
     /* compute local sums of pseudo-charges in s and t on device */
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
                       sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>>
-        ( workspace->d_workspace->x, spad_rvec2, system->n );
+        ( workspace->d_workspace->x, spad, system->n );
     cudaCheckError( );
 
     k_reduction_rvec2 <<< 1, ((blocks + 31) / 32) * 32,
                       sizeof(rvec2) * ((blocks + 31) / 32) >>>
-        ( spad_rvec2, &spad_rvec2[blocks], blocks );
+        ( spad, &spad[blocks], blocks );
     cudaCheckError( );
 
-    copy_host_device( &my_sum, &spad_rvec2[blocks],
+    copy_host_device( &my_sum, &spad[blocks],
             sizeof(rvec2), cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," );
 #else
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
@@ -527,6 +523,11 @@ static void Calculate_Charges_QEq( reax_system const * const system,
 
     u = all_sum[0] / all_sum[1];
 
+    check_smalloc( &workspace->host_scratch, &workspace->host_scratch_size,
+            sizeof(real) * system->n, TRUE, SAFE_ZONE,
+            "Calculate_Charges_QEq::workspace->host_scratch" );
+    q = (real *) workspace->host_scratch;
+
     /* derive atomic charges from pseudo-charges
      * and set up extrapolation for next time step */
     Extrapolate_Charges_QEq_Part2( system, workspace, q, u );
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index 2dcd614bcc2c8ced5fb9a2486fa0827eca6ee9e8..df7c9451926fedfe5d0ce68569ed32c3d8f8352e 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -673,7 +673,7 @@ void Dot_local_rvec2( control_params const * const control,
 //    Cuda_Reduction_Sum( spad, &spad[k], k );
 
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE,
-                      sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+                      sizeof(rvec2) * (DEF_BLOCK_SIZE / 32) >>>
         ( spad, &spad[k], k );
     cudaCheckError( );
 
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index 2964a08199a509727bd33bc8c5eb158574bf3064..3a6547ffba774f70455aa254cef232957bbe238c 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -113,8 +113,8 @@ CUDA_GLOBAL void k_dual_jacobi_apply( real const * const Hdia_inv, rvec2 const *
         return;
     }
 
-    x[i][0] = y[i][0] * Hdia_inv[i];
-    x[i][1] = y[i][1] * Hdia_inv[i];
+    x[i][0] = Hdia_inv[i] * y[i][0];
+    x[i][1] = Hdia_inv[i] * y[i][1];
 }