From 036214bea373f1ff103f448ec7bedacebdc6087c Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu>
Date: Mon, 21 Nov 2016 11:11:36 -0500
Subject: [PATCH] PG-PuReMD: code clean-up.

---
 PG-PuReMD/src/cuda_linear_solvers.cu | 197 +++++++++++++++++---------
 PG-PuReMD/src/cuda_qEq.cu            |  81 +++++++----
 PG-PuReMD/src/linear_solvers.c       | 151 ++++++++++++++++----
 PG-PuReMD/src/parallelreax.c         | 203 ++++++++++++++++-----------
 PG-PuReMD/src/qEq.c                  |  51 +++++--
 5 files changed, 462 insertions(+), 221 deletions(-)

diff --git a/PG-PuReMD/src/cuda_linear_solvers.cu b/PG-PuReMD/src/cuda_linear_solvers.cu
index 1b1f510c..0bc4d4e3 100644
--- a/PG-PuReMD/src/cuda_linear_solvers.cu
+++ b/PG-PuReMD/src/cuda_linear_solvers.cu
@@ -28,225 +28,287 @@
 #include "matvec.h"
 
 
-
-void get_from_device (real *host, real *device, unsigned int bytes, char *msg)
+void get_from_device(real *host, real *device, unsigned int bytes, char *msg)
 {
-    copy_host_device (host, device, bytes, cudaMemcpyDeviceToHost, msg);
+    copy_host_device( host, device, bytes, cudaMemcpyDeviceToHost, msg );
 }
 
-void put_on_device (real *host, real *device, unsigned int bytes, char *msg)
+
+void put_on_device(real *host, real *device, unsigned int bytes, char *msg)
 {
-    copy_host_device (host, device, bytes, cudaMemcpyHostToDevice, msg);
+    copy_host_device( host, device, bytes, cudaMemcpyHostToDevice, msg );
 }
 
-void Cuda_Vector_Sum (real *res, real a, real *x, real b, real *y, int count)
+
+void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count)
 {
     //res = ax + by
     //use the cublas here
     int blocks;
+
     blocks = (count / DEF_BLOCK_SIZE) + 
         ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
         ( res, a, x, b, y, count );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-void Cuda_CG_Preconditioner (real *res, real *a, real *b, int count)
+
+void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count)
 {
     //res = a*b - vector multiplication
     //use the cublas here.
     int blocks;
+
     blocks = (count / DEF_BLOCK_SIZE) + 
         ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
         ( res, a, b, count );
-    cudaThreadSynchronize ();
+
+    cudaThreadSynchronize();
 }
 
-CUDA_GLOBAL void k_diagnol_preconditioner (storage p_workspace, rvec2 *b, int n)
+
+CUDA_GLOBAL void k_diagnol_preconditioner(storage p_workspace, rvec2 *b, int n)
 {
     storage *workspace = &( p_workspace );
     int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= n) return;
+
+    if (j >= n)
+    {
+        return;
+    }
 
     //for( j = 0; j < system->n; ++j ) {
     // residual 
     workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
     workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
+
     // apply diagonal pre-conditioner
     workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
     workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
     //}
 }
 
-void Cuda_CG_Diagnol_Preconditioner (storage *workspace, rvec2 *b, int n)
+
+void Cuda_CG_Diagnol_Preconditioner(storage *workspace, rvec2 *b, int n)
 {
     int blocks;
 
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
         (*workspace, b, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-CUDA_GLOBAL void k_dual_cg_preconditioner (storage p_workspace, rvec2 *x, 
+
+CUDA_GLOBAL void k_dual_cg_preconditioner(storage p_workspace, rvec2 *x, 
         real alpha_0, real alpha_1, int n, rvec2 *my_dot)
 {
     storage *workspace = &( p_workspace );
     rvec2 alpha;
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+
     alpha[0] = alpha_0;
     alpha[1] = alpha_1;
 
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= n) return;
+    if (j >= n)
+    {
+        return;
+    }
+
     my_dot[j][0] = my_dot[j][1] = 0.0;
 
     //for( j = 0; j < system->n; ++j ) {
     // update x 
     x[j][0] += alpha[0] * workspace->d2[j][0];
     x[j][1] += alpha[1] * workspace->d2[j][1];      
+
     // update residual 
     workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
     workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
+
     // apply diagonal pre-conditioner 
     workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
     workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+
     // dot product: r.p 
     my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
     my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
     //}
 }
 
-void Cuda_DualCG_Preconditioer (storage *workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result)
+
+void Cuda_DualCG_Preconditioer(storage *workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result)
 {
     int blocks;
     rvec2 *tmp = (rvec2 *) scratch;
-    cuda_memset (tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner");
+    cuda_memset( tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner" );
 
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
         (*workspace, x, alpha[0], alpha[1], n, tmp);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     //Reduction to calculate my_dot
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
         ( tmp, tmp + n, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
         ( tmp + n, tmp + 2*n, blocks);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
 
-    copy_host_device (result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot");
+    cudaThreadSynchronize();
+    cudaCheckError();
+
+    copy_host_device( result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot" );
 }
 
-void Cuda_Norm (rvec2 *arr, int n, rvec2 result)
+
+void Cuda_Norm(rvec2 *arr, int n, rvec2 result)
 {
     int blocks;
     rvec2 *tmp = (rvec2 *) scratch;
 
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
         (arr, tmp, n, INITIAL);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
         (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
 
-    copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_norm_rvec2");
+    cudaThreadSynchronize();
+    cudaCheckError();
+
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
 }
 
-void Cuda_Dot (rvec2 *a, rvec2 *b, rvec2 result, int n)
+
+void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n)
 {
     int blocks;
     rvec2 *tmp = (rvec2 *) scratch;
 
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
         ( a, b, tmp, n );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> 
         //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> 
         ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
 
-    copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_dot");
+    cudaThreadSynchronize();
+    cudaCheckError();
+
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_dot" );
 }
 
-void Cuda_Vector_Sum_Rvec2 (rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
+
+void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
 {
     int blocks;
 
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
         ( x, a, b[0], b[1], c, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-CUDA_GLOBAL void k_rvec2_to_real_copy ( real *dst, rvec2 *src, int index, int n)
+
+CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if (i >= n)
+    {
+        return;
+    }
 
     dst[i] = src[i][index];
 }
 
-void Cuda_RvecCopy_From (real *dst, rvec2 *src, int index, int n)
+
+void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n)
 {
     int blocks;
+
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
         ( dst, src, index, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-CUDA_GLOBAL void k_real_to_rvec2_copy ( rvec2 *dst, real *src, int index, int n)
+
+CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if (i >= n)
+    {
+        return;
+    }
 
     dst[i][index] = src[i];
 }
 
-void Cuda_RvecCopy_To (rvec2 *dst, real *src, int index, int n)
+
+void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
 {
     int blocks;
+
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
         ( dst, src, index, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-void Cuda_Dual_Matvec (sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
+
+void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
 {
     int blocks;
+
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-    cuda_memset (b, 0, sizeof (rvec2) * size, "dual_matvec:result");
+    cuda_memset( b, 0, sizeof (rvec2) * size, "dual_matvec:result" );
 
     //One thread per row implementation
     //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
@@ -258,21 +320,24 @@ void Cuda_Dual_Matvec (sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
 #if defined(__SM_35__)
     k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
 #else
-        k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, 
-                  sizeof (rvec2) * MATVEC_BLOCK_SIZE >>>
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
+                      sizeof (rvec2) * MATVEC_BLOCK_SIZE >>>
 #endif
                       (*H, a, b, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-void Cuda_Matvec (sparse_matrix *H, real *a, real *b, int n, int size)
+
+void Cuda_Matvec(sparse_matrix *H, real *a, real *b, int n, int size)
 {
     int blocks;
+
     blocks = (n / DEF_BLOCK_SIZE) + 
         (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-    cuda_memset (b, 0, sizeof (real) * size, "dual_matvec:result");
+    cuda_memset( b, 0, sizeof (real) * size, "dual_matvec:result" );
 
     //one thread per row implementation
     //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
@@ -283,11 +348,11 @@ void Cuda_Matvec (sparse_matrix *H, real *a, real *b, int n, int size)
 #if defined(__SM_35__)
     k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
 #else
-        k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, 
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
                  sizeof (real) * MATVEC_BLOCK_SIZE>>>
 #endif
                      (*H, a, b, n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-}
 
+    cudaThreadSynchronize();
+    cudaCheckError();
+}
diff --git a/PG-PuReMD/src/cuda_qEq.cu b/PG-PuReMD/src/cuda_qEq.cu
index b2094583..9405df86 100644
--- a/PG-PuReMD/src/cuda_qEq.cu
+++ b/PG-PuReMD/src/cuda_qEq.cu
@@ -27,15 +27,19 @@
 
 #include "validation.h"
 
+
 CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, 
         single_body_parameters *sbp, 
         storage p_workspace, int n  )
 {
     storage *workspace = &( p_workspace );
     reax_atom *atom;
-
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if (i >= n)
+    {
+        return;
+    }
 
     //for( i = 0; i < system->n; ++i ) {
     atom = &( my_atoms[i] );
@@ -54,7 +58,8 @@ CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms,
     //}
 }
 
-void Cuda_Init_MatVec ( reax_system *system, storage *workspace )
+
+void Cuda_Init_MatVec( reax_system *system, storage *workspace )
 {
     int blocks;
 
@@ -64,39 +69,48 @@ void Cuda_Init_MatVec ( reax_system *system, storage *workspace )
     ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, system->reax_param.d_sbp, 
           *dev_workspace, system->n );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
 
-void cuda_charges_x (reax_system *system, rvec2 my_sum)
+
+void cuda_charges_x(reax_system *system, rvec2 my_sum)
 {
     int blocks;
     rvec2 *output = (rvec2 *) scratch;
-    cuda_memset (output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q");
+    cuda_memset( output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q" );
 
     blocks = system->n / DEF_BLOCK_SIZE + 
         (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
 
     k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
         ( dev_workspace->x, output, system->n );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
         ( output, output + system->n, blocks );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
 
-    copy_host_device (my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x");
+    cudaThreadSynchronize();
+    cudaCheckError();
+
+    copy_host_device( my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x" );
 }
 
+
 CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, 
         real u, real *q, int n)
 {
     storage *workspace = &( p_workspace );
     reax_atom *atom;
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= n) return;
+
+    if (i >= n)
+    {
+        return;
+    }
 
     //for( i = 0; i < system->n; ++i ) {
     atom = &( my_atoms[i] );
@@ -117,7 +131,6 @@ CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace,
     atom->t[0] = workspace->x[i][1];
     //}
 }
-
 //TODO if we use the function argument (output), we are getting 
 //TODO Address not mapped/Invalid permissions error with segmentation fault
 //TODO so using the local argument, which is a global variable anyways. 
@@ -126,24 +139,27 @@ CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace,
 //TODO
 //TODO
 
+
 extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *output, real u)
 {
     int blocks;
     real *tmp = (real *) scratch;
     real *tmp_output = (real *) host_scratch;
 
-    cuda_memset (tmp, 0, sizeof (real) * system->n, "charges:q");
-    memset (tmp_output, 0, sizeof (real) * system->n);
+    cuda_memset( tmp, 0, sizeof (real) * system->n, "charges:q" );
+    memset( tmp_output, 0, sizeof (real) * system->n );
 
     blocks = system->n / DEF_BLOCK_SIZE + 
         (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
     ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, *dev_workspace, u, tmp, system->n);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
 
-    copy_host_device (output, tmp, sizeof (real) * system->n, 
-            cudaMemcpyDeviceToHost, "charges:q");
+    cudaThreadSynchronize();
+    cudaCheckError();
+
+    copy_host_device( output, tmp, sizeof (real) * system->n, 
+            cudaMemcpyDeviceToHost, "charges:q" );
 }
 //TODO
 //TODO
@@ -153,25 +169,34 @@ extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *
 //TODO
 //TODO
 
-CUDA_GLOBAL void ker_update_q (reax_atom *my_atoms, real *q, int n, int N)
+
+CUDA_GLOBAL void ker_update_q(reax_atom *my_atoms, real *q, int n, int N)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= (N-n)) return;
+
+    if (i >= (N-n))
+    {
+        return;
+    }
 
     //for( i = system->n; i < system->N; ++i )
     my_atoms[i + n].q = q[i + n];
 }
 
-void cuda_charges_updateq (reax_system *system, real *q) 
+
+void cuda_charges_updateq(reax_system *system, real *q) 
 {
     int blocks;
     real *dev_q = (real *) scratch;
-    copy_host_device (q, dev_q, system->N * sizeof (real), 
-            cudaMemcpyHostToDevice, "charges:q");
-    blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
+
+    copy_host_device( q, dev_q, system->N * sizeof (real),
+            cudaMemcpyHostToDevice, "charges:q" );
+    blocks = (system->N - system->n) / DEF_BLOCK_SIZE +
         (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
     ker_update_q <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, dev_q, system->n, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+
+    cudaThreadSynchronize();
+    cudaCheckError();
 }
diff --git a/PG-PuReMD/src/linear_solvers.c b/PG-PuReMD/src/linear_solvers.c
index 739c7702..f4ecf2d8 100644
--- a/PG-PuReMD/src/linear_solvers.c
+++ b/PG-PuReMD/src/linear_solvers.c
@@ -40,7 +40,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
     real H;
 
     for ( i = 0; i < N; ++i )
+    {
         b[i][0] = b[i][1] = 0;
+    }
 
     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -96,18 +98,20 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 #ifdef HAVE_CUDA
     check_zeros_host (x, system->N, "x");
 #endif
+
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
+
 #ifdef HAVE_CUDA
     check_zeros_host (x, system->N, "x");
 #endif
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
 
-
 //  if (data->step > 0) return;
 
     // tryQEq
     Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
         Update_Timing_Info( &t_start, &matvec_time );
@@ -118,6 +122,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         /* residual */
         workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
         workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
+
         /* apply diagonal pre-conditioner */
         workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
         workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
@@ -148,12 +153,14 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     //fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
     //fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
 #endif
 
-
     for ( i = 1; i < 300; ++i )
     {
         Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
@@ -163,6 +170,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
         // tryQEq
         Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
             Update_Timing_Info( &t_start, &matvec_time );
@@ -200,12 +208,17 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
         //fprintf( stderr, "HOST:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
             Update_Timing_Info( &t_start, &dot_time );
 #endif
+
         if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
+        {
             break;
+        }
+
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
         for ( j = 0; j < system->n; ++j )
@@ -216,31 +229,39 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         }
     }
 
-
     if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
     {
         for ( j = 0; j < n; ++j )
+        {
             workspace->t[j] = workspace->x[j][1];
+        }
         matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
                       mpi_data, fout );
         fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
         for ( j = 0; j < n; ++j )
+        {
             workspace->x[j][1] = workspace->t[j];
+        }
     }
     else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
     {
         for ( j = 0; j < n; ++j )
+        {
             workspace->s[j] = workspace->x[j][0];
+        }
         matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
                       mpi_data, fout );
         fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->x[j][0] = workspace->s[j];
+        }
     }
 
-
     if ( i >= 300 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
+    }
 
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
@@ -252,10 +273,10 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 }
 
 
-
 #ifdef HAVE_CUDA
 int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-                  rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, simulation_data *data )
+        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
+        simulation_data *data )
 {
     int  i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
@@ -263,7 +284,6 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     rvec2 sig_old, sig_new;
     MPI_Comm comm;
     rvec2 *spad = (rvec2 *) host_scratch;
-
     int a;
 
     n = system->n;
@@ -286,13 +306,13 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
 //#endif
 
-//  check_zeros_device (x, system->N, "x");
+//  check_zeros_device( x, system->N, "x" );
 
-    get_from_device (spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get");
+    get_from_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:get" );
     Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-    put_on_device (spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put");
+    put_on_device( spad, x, sizeof (rvec2) * system->total_cap, "CG:x:put" );
 
-//  check_zeros_device (x, system->N, "x");
+//  check_zeros_device( x, system->N, "x" );
 
 //  compare_rvec2 (workspace->x, x, N, "x");
 //  if (data->step > 0) {
@@ -308,6 +328,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //#endif
     //originally we were using only H->n which was system->n (init_md.c)
     //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
+    
     Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
 
 //  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
@@ -319,6 +340,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //#ifdef __CUDA_DEBUG__
 //  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
 //#endif
+    
     get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:get" );
     Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
     put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "CG:q2:put" );
@@ -338,7 +360,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
 //  }
 //#endif
+    
     Cuda_CG_Diagnol_Preconditioner (dev_workspace, b, system->n);
+
 //  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
 //  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
 
@@ -354,6 +378,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
     my_sum[0] = my_sum[1] = 0;
     Cuda_Norm (b, n, my_sum);
+
 //  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
 
     MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
@@ -373,12 +398,18 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
     my_dot[0] = my_dot[1] = 0;
     Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
+
 // fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+    
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
     //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
 #endif
 
     for ( i = 1; i < 300; ++i )
@@ -387,15 +418,19 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //#ifdef __CUDA_DEBUG__
 //    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
 //#endif
-        get_from_device (spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:get");
-        Dist(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer);
-        put_on_device (spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:put");
+        
+        get_from_device( spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:get" );
+        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+        put_on_device( spad, dev_workspace->d2, sizeof (rvec2) * system->total_cap, "cg:d2:put" );
+
         //print_device_rvec2 (dev_workspace->d2, N);
 
 //#ifdef __CUDA_DEBUG__
 //    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
 //#endif
+        
         Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, system->total_cap );
+
         /*
         fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
         fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
@@ -409,20 +444,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         */
         //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
 
-
         // tryQEq
         // MVAPICH2
 //#ifdef __CUDA_DEBUG__
 //    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
 //#endif
-        get_from_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:get");
-        Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
-        put_on_device (spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:put");
+
+        get_from_device( spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:get" );
+        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
+        put_on_device( spad, dev_workspace->q2, sizeof (rvec2) * system->total_cap, "cg:q2:put" );
 
 //       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif
 
         /* dot product: d.q */
@@ -434,6 +472,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //    }
 //       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
 //#endif
+
         my_dot[0] = my_dot[1] = 0;
         Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
         //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
@@ -462,8 +501,10 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //    }
 //       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
 //#endif
+
         my_dot[0] = my_dot[1] = 0;
-        Cuda_DualCG_Preconditioer (dev_workspace, x, alpha, system->n, my_dot);
+        Cuda_DualCG_Preconditioer( dev_workspace, x, alpha, system->n, my_dot );
+
         //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
 
 //   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
@@ -473,13 +514,20 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         sig_old[0] = sig_new[0];
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
         //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
+
         if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
+        {
             break;
+        }
 
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
@@ -491,8 +539,9 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 //      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
 //    }
 //#endif
-        Cuda_Vector_Sum_Rvec2 (dev_workspace->d2, dev_workspace->p2, beta,
-                               dev_workspace->d2, system->n);
+
+        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
+                dev_workspace->d2, system->n );
 
 //       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
     }
@@ -503,44 +552,54 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         //for( j = 0; j < n; ++j )
         //  workspace->t[j] = workspace->x[j][1];
         //fprintf (stderr, "Getting started with Cuda_CG1 \n");
-        Cuda_RvecCopy_From (dev_workspace->t, dev_workspace->x, 1, system->n);
+
+        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
 
         //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
         //compare_array (workspace->t, dev_workspace->t, system->n, "t");
 
         matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
-                           mpi_data, fout );
+                mpi_data, fout );
+
         //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
         //for( j = 0; j < n; ++j )
         //  workspace->x[j][1] = workspace->t[j];
-        Cuda_RvecCopy_To (dev_workspace->x, dev_workspace->t, 1, system->n);
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
     }
     else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
     {
         //for( j = 0; j < n; ++j )
         //  workspace->s[j] = workspace->x[j][0];
-        Cuda_RvecCopy_From (dev_workspace->s, dev_workspace->x, 0, system->n);
+
+        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
 
         //compare_array (workspace->s, dev_workspace->s, system->n, "s");
         //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
 
         //fprintf (stderr, "Getting started with Cuda_CG2 \n");
+
         matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
-                           mpi_data, fout );
+                mpi_data, fout );
+
         //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
         //for( j = 0; j < system->n; ++j )
         //  workspace->x[j][0] = workspace->s[j];
-        Cuda_RvecCopy_To (dev_workspace->x, dev_workspace->s, 0, system->n);
-    }
 
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
+    }
 
     if ( i >= 300 )
+    {
         fprintf( stderr, "Dual CG convergence failed! -> %d\n", i );
+    }
 
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
-                 i + 1, matvecs, matvec_time, dot_time );
+                i + 1, matvecs, matvec_time, dot_time );
+    }
 #endif
 
     return (i + 1) + matvecs;
@@ -554,7 +613,9 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
     real H;
 
     for ( i = 0; i < N; ++i )
+    {
         b[i] = 0;
+    }
 
     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -583,34 +644,47 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
     scale = sizeof(real) / sizeof(void);
     Dist( system, mpi_data, x, MPI_DOUBLE, scale, real_packer );
     Sparse_MatVec( H, x, workspace->q, system->N );
+
     // tryQEq
     Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
 #endif
 
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
     for ( j = 0; j < system->n; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    }
 
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
     sig0 = sig_new;
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
 #endif
 
     for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
     {
         Dist( system, mpi_data, workspace->d, MPI_DOUBLE, scale, real_packer );
         Sparse_MatVec( H, workspace->d, workspace->q, system->N );
+
         //tryQEq
         Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif
 
         tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
@@ -619,16 +693,21 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
         /* pre-conditioning */
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
         //fprintf (stderr, "Host : sig_new: %f \n", sig_new );
         beta = sig_new / sig_old;
         Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
     }
 
@@ -662,12 +741,15 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     //MVAPICH2
     put_on_device (spad, x, sizeof (real) * system->total_cap , "cuda_cg:x:put");
     Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
+
     // tryQEq
     // MVAPICH2
     get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" );
     Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+
     //MVAPICH2
     put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:put" );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
         Update_Timing_Info( &t_start, &matvec_time );
@@ -688,27 +770,33 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     sig_new = Parallel_Dot(spad, spad + system->total_cap, system->n, mpi_data->world);
 
     sig0 = sig_new;
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
 #endif
 
     for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
     {
-
         //MVAPICH2
         get_from_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:get");
         Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
         put_on_device (spad, dev_workspace->d, sizeof (real) * system->total_cap, "cuda_cg:d:put");
 
         Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
+
         //tryQEq
         get_from_device (spad, dev_workspace->q, sizeof (real) * system->total_cap, "cuda_cg:q:get" );
         Coll(system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker);
         put_on_device (spad, dev_workspace->q, sizeof (real) * system->total_cap , "cuda_cg:q:get");
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif
 
         //TODO do the parallel dot on the device for the local sum
@@ -737,9 +825,12 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
         beta = sig_new / sig_old;
         Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, dev_workspace->d, system->n );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
     }
 
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 40cc9c3b..5020d59b 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -52,7 +52,6 @@ LR_lookup_table *d_LR;
 ////////////////////////////
 //CUDA SPECIFIC DECLARATIONS
 ////////////////////////////
-
 reax_list   **dev_lists;
 storage         *dev_workspace;
 void            *scratch;
@@ -64,9 +63,8 @@ int         MATVEC_BLOCKS;
 
 
 void Read_System( char *geo_file, char *ffield_file, char *control_file,
-                  reax_system *system, control_params *control,
-                  simulation_data *data, storage *workspace,
-                  output_controls *out_control, mpi_datatypes *mpi_data )
+        reax_system *system, control_params *control, simulation_data *data,
+        storage *workspace, output_controls *out_control, mpi_datatypes *mpi_data )
 {
     /* ffield file */
     Read_Force_Field( ffield_file, &(system->reax_param), control );
@@ -76,9 +74,13 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file,
 
     /* geo file */
     if ( control->geo_format == CUSTOM )
+    {
         Read_Geo( geo_file, system, control, data, workspace, mpi_data );
+    }
     else if ( control->geo_format == PDB )
+    {
         Read_PDB( geo_file, system, control, data, workspace, mpi_data );
+    }
     else if ( control->geo_format == ASCII_RESTART )
     {
         Read_Restart( geo_file, system, control, data, workspace, mpi_data );
@@ -98,9 +100,8 @@ void Read_System( char *geo_file, char *ffield_file, char *control_file,
 
 
 void Post_Evolve( reax_system* system, control_params* control,
-                  simulation_data* data, storage* workspace,
-                  reax_list** lists, output_controls *out_control,
-                  mpi_datatypes *mpi_data )
+        simulation_data* data, storage* workspace, reax_list** lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     int i;
     rvec diff, cross;
@@ -131,9 +132,8 @@ void Post_Evolve( reax_system* system, control_params* control,
 
 #ifdef HAVE_CUDA
 void Cuda_Post_Evolve( reax_system* system, control_params* control,
-                       simulation_data* data, storage* workspace,
-                       reax_list** lists, output_controls *out_control,
-                       mpi_datatypes *mpi_data )
+        simulation_data* data, storage* workspace, reax_list** lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
 {
     /* remove trans & rot velocity of the center of mass from system */
     if ( control->ensemble != NVE && control->remove_CoM_vel &&
@@ -152,20 +152,20 @@ void Cuda_Post_Evolve( reax_system* system, control_params* control,
 
 
 #ifdef HAVE_CUDA
-void init_blocks (reax_system *system)
+void init_blocks(reax_system *system)
 {
-    compute_blocks (&BLOCKS, &BLOCK_SIZE, system->n);
-    compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2);
+    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
+    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
 
-    compute_blocks (&BLOCKS_N, &BLOCK_SIZE, system->N);
-    compute_nearest_pow_2 (BLOCKS_N, &BLOCKS_POW_2_N);
+    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
+    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
 
-    compute_matvec_blocks (&MATVEC_BLOCKS, system->N);
-    //#if defined(__CUDA_DEBUG_LOG__)
-    //fprintf (stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
-    //                      MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
-    //#endif
+    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
 
+#if defined(__CUDA_DEBUG_LOG__)
+    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
+            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
+#endif
 }
 #endif
 
@@ -218,7 +218,7 @@ int main( int argc, char* argv[] )
     fprintf (stderr, " General block size: %d \n",  DEF_BLOCK_SIZE);
 #endif
 
-    /* allocated main datastructures */
+    /* allocate main data structures */
     system = (reax_system *) smalloc( sizeof(reax_system), "system" );
     control = (control_params *) smalloc( sizeof(control_params), "control" );
     data = (simulation_data *) smalloc( sizeof(simulation_data), "data" );
@@ -236,13 +236,11 @@ int main( int argc, char* argv[] )
         lists[i]->end_index = NULL;
         lists[i]->select.v = NULL;
     }
-    out_control = (output_controls *)
-                  smalloc( sizeof(output_controls), "out_control" );
+    out_control = (output_controls *) smalloc( sizeof(output_controls), "out_control" );
     mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" );
 
     /* allocate the cuda auxiliary data structures */
     dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" );
-
     dev_lists = (reax_list **) smalloc ( LIST_N * sizeof (reax_list *), "dev_lists" );
     for ( i = 0; i < LIST_N; ++i )
     {
@@ -253,8 +251,7 @@ int main( int argc, char* argv[] )
     /* Initialize member variables */
     system->init_thblist = FALSE;
 
-
-    /* setup the parallel environment */
+    /* setup MPI environment */
     MPI_Init( &argc, &argv );
     MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) );
     MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) );
@@ -263,38 +260,46 @@ int main( int argc, char* argv[] )
 
     /* setup the CUDA Device for this process can be on the same machine
     * or on a different machine, for now use the rank to compute the device
-    * This will only work on a single machine with 2 GPUs*/
-
-    Setup_Cuda_Environment (system->my_rank, control->nprocs, control->gpus_per_node);
+    * This will only work on a single machine with 2 GPUs */
+    Setup_Cuda_Environment( system->my_rank, control->nprocs, control->gpus_per_node );
     //Cleanup_Cuda_Environment ();
+    //
+#if defined(DEBUG)
     print_device_mem_usage ();
-    //fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id);
+    fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id);
+#endif
 
-    /* read system description files */
+    /* read system config files */
     Read_System( argv[1], argv[2], argv[3], system, control,
-                 data, workspace, out_control, mpi_data );
+            data, workspace, out_control, mpi_data );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
     /* init the blocks sizes for cuda kernels */
-    init_blocks (system);
+    init_blocks( system );
 
     /* measure total simulation time after input is read */
     if ( system->my_rank == MASTER_NODE )
+    {
         t_start = Get_Time( );
+    }
 
-    /* initialize datastructures */
+    /* initialize data structures */
     Cuda_Initialize( system, control, data, workspace, lists, out_control, mpi_data );
+
 #if defined(__CUDA_DEBUG__)
     Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
 
+#if defined(DEBUG)
     print_device_mem_usage ();
+#endif
 
     /* init the blocks sizes for cuda kernels */
-    init_blocks (system);
+    init_blocks( system );
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
@@ -304,49 +309,60 @@ int main( int argc, char* argv[] )
 
     // compute f_0
     Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
-    Sync_Atoms ( system );
-    Sync_Grid (&system->my_grid, &system->d_my_grid);
+    Sync_Atoms( system );
+    Sync_Grid( &system->my_grid, &system->d_my_grid );
     init_blocks (system);
+
 #if defined(__CUDA_DENUG_LOG__)
     fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank );
 #endif
 
     //Second step
     Cuda_Reset ( system, control, data, workspace, lists );
+
 #if defined(__CUDA_DEBUG__)
     Reset( system, control, data, workspace, lists );
 #endif
-    //fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
-
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
+#endif
 
     //Third Step
     Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+
 #if defined(__CUDA_DEBUG__)
     Generate_Neighbor_Lists( system, data, workspace, lists );
 #endif
-#if defined(__CUDA_DENUG_LOG__)
+
+#if defined(DEBUG)
     fprintf (stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank );
 #endif
 
 
     //Fourth Step
-#if defined(__CUDA_DEBUG__)
+#if defined(DEBUG)
     fprintf (stderr, " Host Compute Forces begin.... \n");
+#endif
+
+#if defined(__CUDA_DEBUG__)
     Compute_Forces( system, control, data, workspace,
                     lists, out_control, mpi_data );
 #endif
-    Cuda_Compute_Forces( system, control, data, workspace,
-                         lists, out_control, mpi_data );
-#if defined(__CUDA_DENUG_LOG__)
+
+    Cuda_Compute_Forces( system, control, data, workspace, lists,
+            out_control, mpi_data );
+
+#if defined(DEBUG)
     fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank );
 #endif
 
-
 #if defined (__CUDA_DEBUG__)
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 #endif
+
     Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-#if defined(__CUDA_DENUG_LOG__)
+
+#if defined(DEBUG)
     fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank);
 #endif
 
@@ -356,7 +372,9 @@ int main( int argc, char* argv[] )
 
 #if !defined(__CUDA_DEBUG__)
     Output_Results( system, control, data, lists, out_control, mpi_data );
-    //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
+#endif
+#if defined(DEBUG)
+    fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
 #endif
 
 #if defined(DEBUG)
@@ -368,26 +386,39 @@ int main( int argc, char* argv[] )
     for ( ++data->step; data->step <= control->nsteps; data->step++ )
     {
         if ( control->T_mode )
+        {
             Temperature_Control( control, data );
+        }
 
-        //t_begin = Get_Time ();
+#if defined(DEBUG)
+        t_begin = Get_Time();
+#endif
 
 #if defined(__CUDA_DEBUG__)
         Evolve( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
+
         Cuda_Evolve( system, control, data, workspace, lists, out_control, mpi_data );
 
-        //t_end = Get_Timing_Info (t_begin);
-        //fprintf (stderr, " Evolve time: %f \n", t_end);
+#if defined(DEBUG)
+        t_end = Get_Timing_Info( t_begin );
+        fprintf( stderr, " Evolve time: %f \n", t_end );
+#endif
+
+#if defined(DEBUG)
+        t_begin = Get_Time();
+#endif
 
-        //t_begin = Get_Time ();
         Cuda_Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
+
 #if defined(__CUDA_DEBUG__)
         Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
 #endif
 
-        //t_end = Get_Timing_Info (t_begin);
-        //fprintf (stderr, " Post Evolve time: %f \n", t_end);
+#if defined(DEBUG)
+        t_end = Get_Timing_Info( t_begin );
+        fprintf( stderr, " Post Evolve time: %f \n", t_end );
+#endif
 
 #if !defined(__CUDA_DEBUG__)
         Output_Results( system, control, data, lists, out_control, mpi_data );
@@ -403,6 +434,7 @@ int main( int argc, char* argv[] )
 //      else if( out_control->restart_format == WRITE_BINARY )
 //  Write_Binary_Restart( system, control, data, out_control, mpi_data );
 //    }
+
 #if defined(DEBUG)
         fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
         MPI_Barrier( MPI_COMM_WORLD );
@@ -410,32 +442,22 @@ int main( int argc, char* argv[] )
 
     }
 
-    //vaildate the results in debug mode
 #if defined(__CUDA_DEBUG__)
+    // vaildate the results in debug mode
     validate_device (system, data, workspace, lists);
 #endif
 
 #else 
 
-    /* allocated main datastructures */
-    system = (reax_system *)
-             smalloc( sizeof(reax_system), "system" );
-    control = (control_params *)
-              smalloc( sizeof(control_params), "control" );
-    data = (simulation_data *)
-           smalloc( sizeof(simulation_data), "data" );
-    workspace = (storage *)
-                smalloc( sizeof(storage), "workspace" );
-
-    lists = (reax_list **)
-            smalloc( LIST_N * sizeof(reax_list*), "lists" );
+    /* allocate main data structures */
+    system = (reax_system *) smalloc( sizeof(reax_system), "system" );
+    control = (control_params *) smalloc( sizeof(control_params), "control" );
+    data = (simulation_data *) smalloc( sizeof(simulation_data), "data" );
+    workspace = (storage *) smalloc( sizeof(storage), "workspace" );
+
+    lists = (reax_list **) smalloc( LIST_N * sizeof(reax_list*), "lists" );
     for ( i = 0; i < LIST_N; ++i )
     {
-/*
-        lists[i] = (reax_list *)
-                   smalloc( sizeof(reax_list), "lists[i]" );
-        lists[i]->allocated = 0;*/
-
 	lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
         lists[i]->allocated = 0;
 
@@ -446,14 +468,11 @@ int main( int argc, char* argv[] )
         lists[i]->end_index = NULL;
         lists[i]->select.v = NULL;
     }
-    out_control = (output_controls *)
-                  smalloc( sizeof(output_controls), "out_control" );
-    mpi_data = (mpi_datatypes *)
-               smalloc( sizeof(mpi_datatypes), "mpi_data" );
+    out_control = (output_controls *) smalloc( sizeof(output_controls), "out_control" );
+    mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" );
 
     /* allocate the cuda auxiliary data structures */
     dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" );
-
     dev_lists = (reax_list **) smalloc ( LIST_N * sizeof (reax_list *), "dev_lists" );
     for ( i = 0; i < LIST_N; ++i )
     {
@@ -464,17 +483,18 @@ int main( int argc, char* argv[] )
     /* Initialize member variables */
     system->init_thblist = FALSE;
 
-    /* setup the parallel environment */
+    /* setup MPI environment */
     MPI_Init( &argc, &argv );
     MPI_Comm_size( MPI_COMM_WORLD, &(control->nprocs) );
     MPI_Comm_rank( MPI_COMM_WORLD, &(system->my_rank) );
     system->wsize = control->nprocs;
-    system->global_offset = (int*)
-                            scalloc( system->wsize + 1, sizeof(int), "global_offset" );
+    system->global_offset = (int*) scalloc( system->wsize + 1,
+            sizeof(int), "global_offset" );
 
-    /* read system description files */
+    /* read system config files */
     Read_System( argv[1], argv[2], argv[3], system, control,
-                 data, workspace, out_control, mpi_data );
+            data, workspace, out_control, mpi_data );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: read simulation info\n", system->my_rank );
     MPI_Barrier( MPI_COMM_WORLD );
@@ -482,7 +502,10 @@ int main( int argc, char* argv[] )
 
     /* measure total simulation time after input is read */
     if ( system->my_rank == MASTER_NODE )
+    {
         t_start = Get_Time( );
+    }
+
     /* initialize datastructures */
     Initialize( system, control, data, workspace, lists, out_control, mpi_data );
    
@@ -494,12 +517,16 @@ int main( int argc, char* argv[] )
     /* compute f_0 */
     Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
     Reset( system, control, data, workspace, lists );
-    //Print_List(*lists + BONDS);
+
+#if defined(DEBUG)
+    Print_List(*lists + BONDS);
+#endif
+
     Generate_Neighbor_Lists( system, data, workspace, lists );
-    Compute_Forces( system, control, data, workspace,
-                    lists, out_control, mpi_data );
+    Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data );
     Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
     Output_Results( system, control, data, lists, out_control, mpi_data );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
     MPI_Barrier( mpi_data->world );
@@ -509,7 +536,9 @@ int main( int argc, char* argv[] )
     for ( ++data->step; data->step <= control->nsteps; data->step++ )
     {
         if ( control->T_mode )
+        {
             Temperature_Control( control, data );
+        }
 
         Evolve( system, control, data, workspace, lists, out_control, mpi_data );
         Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
@@ -521,10 +550,15 @@ int main( int argc, char* argv[] )
                 (data->step - data->prev_steps) % out_control->restart_freq == 0 )
         {
             if ( out_control->restart_format == WRITE_ASCII )
+            {
                 Write_Restart( system, control, data, out_control, mpi_data );
+            }
             else if ( out_control->restart_format == WRITE_BINARY )
+            {
                 Write_Binary_Restart( system, control, data, out_control, mpi_data );
+            }
         }
+
 #if defined(DEBUG)
         fprintf( stderr, "p%d: step%d completed\n", system->my_rank, data->step );
         MPI_Barrier( mpi_data->world );
@@ -547,7 +581,6 @@ int main( int argc, char* argv[] )
 
     MPI_Finalize();
 
-
     /* de-allocate data structures */
     //for( i = 0; i < LIST_N; ++i ) {
     //if (lists[i]->index) free (lists[i]->index);
diff --git a/PG-PuReMD/src/qEq.c b/PG-PuReMD/src/qEq.c
index 76966044..b4451fc7 100644
--- a/PG-PuReMD/src/qEq.c
+++ b/PG-PuReMD/src/qEq.c
@@ -33,6 +33,7 @@
 #include "validation.h"
 #endif
 
+
 int compare_matrix_entry(const void *v1, const void *v2)
 {
     return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
@@ -48,7 +49,7 @@ void Sort_Matrix_Rows( sparse_matrix *A )
         si = A->start[i];
         ei = A->end[i];
         qsort( &(A->entries[si]), ei - si,
-               sizeof(sparse_matrix_entry), compare_matrix_entry );
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
     }
 }
 
@@ -60,13 +61,16 @@ void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
 
     /* init droptol to 0 - not necessary for an upper-triangular A */
     for ( i = 0; i < A->n; ++i )
+    {
         droptol[i] = 0;
+    }
 
     /* calculate sqaure of the norm of each row */
     for ( i = 0; i < A->n; ++i )
     {
         val = A->entries[A->start[i]].val; // diagonal entry
         droptol[i] += val * val;
+
         // only within my block
         for ( k = A->start[i] + 1; A->entries[k].j < A->n; ++k )
         {
@@ -97,6 +101,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
 
     fillin = 0;
     for ( i = 0; i < A->n; ++i )
+    {
         for ( pj = A->start[i] + 1; A->entries[pj].j < A->n; ++pj )
         {
             j = A->entries[pj].j;
@@ -104,13 +109,14 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
             if ( fabs(val) > droptol[i] )
                 ++fillin;
         }
+    }
 
     return fillin + A->n;
 }
 
 
 void ICHOLT( sparse_matrix *A, real *droptol,
-             sparse_matrix *L, sparse_matrix *U )
+        sparse_matrix *L, sparse_matrix *U )
 {
     sparse_matrix_entry tmp[1000];
     int i, j, pj, k1, k2, tmptop, Utop;
@@ -123,7 +129,9 @@ void ICHOLT( sparse_matrix *A, real *droptol,
     Utop = 0;
     tmptop = 0;
     for ( i = 0; i < A->n; ++i )
+    {
         U->start[i] = L->start[i] = L->end[i] = Ltop[i] = 0;
+    }
 
     for ( i = A->n - 1; i >= 0; --i )
     {
@@ -146,11 +154,17 @@ void ICHOLT( sparse_matrix *A, real *droptol,
                 while ( k1 >= 0 && k2 < U->end[j] )
                 {
                     if ( tmp[k1].j < U->entries[k2].j )
+                    {
                         k1--;
+                    }
                     else if ( tmp[k1].j > U->entries[k2].j )
+                    {
                         k2++;
+                    }
                     else
+                    {
                         val -= (tmp[k1--].val * U->entries[k2++].val);
+                    }
                 }
 
                 // U matrix is upper triangular
@@ -168,8 +182,10 @@ void ICHOLT( sparse_matrix *A, real *droptol,
         dval = A->entries[A->start[i]].val;
         //fprintf( stdout, "%d %d %24.16f\n", 6540-i, 6540-i, dval );
         for ( k1 = 0; k1 < tmptop; ++k1 )
+        {
             //if( fabs(tmp[k1].val) > droptol[i] )
             dval -= SQR(tmp[k1].val);
+        }
         dval = SQRT(dval);
         // keep the diagonal in any case
         U->entries[Utop].j = i;
@@ -193,14 +209,17 @@ void ICHOLT( sparse_matrix *A, real *droptol,
         U->end[i] = Utop;
         //fprintf( stderr, "i = %d - Utop = %d\n", i, Utop );
     }
-    // print matrix U
+
 #if defined(DEBUG)
+    // print matrix U
     fprintf( stderr, "nnz(U): %d\n", Utop );
     for ( i = 0; i < U->n; ++i )
     {
         fprintf( stderr, "row%d: ", i );
         for ( pj = U->start[i]; pj < U->end[i]; ++pj )
+        {
             fprintf( stderr, "%d ", U->entries[pj].j );
+        }
         fprintf( stderr, "\n" );
     }
 #endif
@@ -214,6 +233,7 @@ void ICHOLT( sparse_matrix *A, real *droptol,
     }
 
     for ( i = 0; i < U->n; ++i )
+    {
         for ( pj = U->start[i]; pj < U->end[i]; ++pj )
         {
             j = U->entries[pj].j;
@@ -221,25 +241,28 @@ void ICHOLT( sparse_matrix *A, real *droptol,
             L->entries[L->end[j]].val = U->entries[pj].val;
             L->end[j]++;
         }
+    }
 
-    // print matrix L
 #if defined(DEBUG)
+    // print matrix L
     fprintf( stderr, "nnz(L): %d\n", L->end[L->n - 1] );
     for ( i = 0; i < L->n; ++i )
     {
         fprintf( stderr, "row%d: ", i );
         for ( pj = L->start[i]; pj < L->end[i]; ++pj )
+        {
             fprintf( stderr, "%d ", L->entries[pj].j );
+        }
         fprintf( stderr, "\n" );
     }
 #endif
+
     sfree( Ltop, "Ltop" );
 }
 
 
 void Init_MatVec( reax_system *system, simulation_data *data,
-                  control_params *control,  storage *workspace,
-                  mpi_datatypes *mpi_data )
+        control_params *control,  storage *workspace, mpi_datatypes *mpi_data )
 {
     int i; //, fillin;
     reax_atom *atom;
@@ -312,9 +335,8 @@ void Init_MatVec( reax_system *system, simulation_data *data,
 }
 
 
-
 void Calculate_Charges( reax_system *system, storage *workspace,
-                        mpi_datatypes *mpi_data )
+        mpi_datatypes *mpi_data )
 {
     int        i, scale;
     real       u;//, s_sum, t_sum;
@@ -362,7 +384,9 @@ void Calculate_Charges( reax_system *system, storage *workspace,
 
     Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
     for ( i = system->n; i < system->N; ++i )
+    {
         system->my_atoms[i].q = q[i];
+    }
 
     free(q);
 }
@@ -381,9 +405,9 @@ void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
 
     scale = sizeof(real) / sizeof(void);
     q =  (real *) host_scratch;
-    memset ( q, 0, system->N * sizeof (real));
+    memset( q, 0, system->N * sizeof (real));
 
-    cuda_charges_x ( system, my_sum );
+    cuda_charges_x( system, my_sum );
     //fprintf (stderr, "Device: my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]);
 
     MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
@@ -391,11 +415,11 @@ void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
     u = all_sum[0] / all_sum[1];
     //fprintf (stderr, "Device: u: %f \n", u);
 
-    cuda_charges_st ( system, workspace, q, u );
+    cuda_charges_st( system, workspace, q, u );
 
     Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
 
-    cuda_charges_updateq ( system, q );
+    cuda_charges_updateq( system, q );
 }
 #endif
 
@@ -412,6 +436,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     //if( data->step == 50010 ) {
     //  Print_Linear_System( system, control, workspace, data->step );
     // }
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
     //Print_Linear_System( system, control, workspace, data->step );
@@ -428,6 +453,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s,
     //   control->q_err, workspace->L, workspace->U, workspace->s,
     //   mpi_data, out_control->log );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
 #endif
@@ -493,6 +519,7 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
 #endif
 
     Cuda_Calculate_Charges( system, workspace, mpi_data );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: computed charges\n", system->my_rank );
     //Print_Charges( system );
-- 
GitLab