From ce6b6624af77182154f3297e6428026b95ae1152 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu>
Date: Tue, 22 Nov 2016 15:30:26 -0500
Subject: [PATCH] PuReMD-GPU: code clean-up.

Whitespace and formatting clean-up with no functional changes intended:
normalize brace placement and add braces around single-statement blocks,
wrap long kernel launch and copy_host_device() calls, separate function
definitions with blank lines, and re-indent the PGMRES/PCG/CG/SDM solver
routines in GMRES.cu whose indentation had drifted. Also rename the
misspelled "diagnol" identifiers in QEq.cu to "diagonal" and apply the
same whitespace clean-up to the CG solvers in PuReMD/src/linear_solvers.c.
---
 PuReMD-GPU/src/GMRES.cu     | 485 +++++++++++++++++++-----------------
 PuReMD-GPU/src/QEq.cu       | 121 ++++++---
 PuReMD/src/linear_solvers.c |  71 +++++-
 3 files changed, 403 insertions(+), 274 deletions(-)
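
Note (ignored by git am): the kernel launches reformatted below keep the
repository's existing launch / synchronize / error-check sequence
(cudaThreadSynchronize() followed by cudaCheckError()). The stand-alone
sketch below is illustrative only and is not part of this patch: it mirrors
the GMRES_Diagonal_Preconditioner kernel, uses double in place of the real
typedef, uses a local CUDA_CHECK macro rather than the repository's
cudaCheckError(), and calls cudaDeviceSynchronize(), the current name for
the long-deprecated cudaThreadSynchronize() still used here.

/* Illustrative sketch only: minimal launch / sync / check pattern around a
 * diagonal (Jacobi) preconditioner, b_prc[i] = b[i] * Hdia_inv[i].
 * CUDA_CHECK is a local helper, not the repository's cudaCheckError(). */
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* scale each entry of b by the inverse diagonal of H */
__global__ void diag_precond(double *b_prc, const double *b,
        const double *Hdia_inv, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        b_prc[i] = b[i] * Hdia_inv[i];
    }
}

int main(void)
{
    const int n = 1024;
    const int block_size = 256;
    double *b, *b_prc, *Hdia_inv;

    CUDA_CHECK( cudaMalloc(&b, n * sizeof(double)) );
    CUDA_CHECK( cudaMalloc(&b_prc, n * sizeof(double)) );
    CUDA_CHECK( cudaMalloc(&Hdia_inv, n * sizeof(double)) );
    CUDA_CHECK( cudaMemset(b, 0, n * sizeof(double)) );
    CUDA_CHECK( cudaMemset(Hdia_inv, 0, n * sizeof(double)) );

    /* launch, then synchronize and check, as done throughout GMRES.cu */
    diag_precond<<< (n + block_size - 1) / block_size, block_size >>>
        (b_prc, b, Hdia_inv, n);
    CUDA_CHECK( cudaGetLastError() );
    CUDA_CHECK( cudaDeviceSynchronize() );

    CUDA_CHECK( cudaFree(b) );
    CUDA_CHECK( cudaFree(b_prc) );
    CUDA_CHECK( cudaFree(Hdia_inv) );

    return 0;
}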

diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu
index d00100e9..2c1b0cf6 100644
--- a/PuReMD-GPU/src/GMRES.cu
+++ b/PuReMD-GPU/src/GMRES.cu
@@ -220,7 +220,6 @@ int GMRES( static_storage *workspace, sparse_matrix *H,
 /////////////////////////////////////////////////////////////////
 //Cuda Functions for GMRES implementation
 /////////////////////////////////////////////////////////////////
-
 GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -230,6 +229,7 @@ GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv
     b_proc [i] = b[i] * Hdia_inv[i];
 }
 
+
 GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output)
 {
     real tmp1, tmp2, cc;
@@ -256,6 +256,7 @@ GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j,
     output[1] = tmp2;
 }
 
+
 GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y)
 {
     real temp;
@@ -275,25 +276,26 @@ int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x )
     real cc, tmp1, tmp2, temp, bnorm;
     real v_add_tmp;
     sparse_matrix *H = &workspace->H;
-
     real t_start, t_elapsed;
-
     real *spad = (real *)scratch;
     real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
 
     N = H->n;
 
-    cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+    cuda_memset(spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
 
-    Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+        (b, spad, H->n, INITIAL);
+    cudaThreadSynchronize();
+    cudaCheckError();
 
-    Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+        (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
+    cudaThreadSynchronize();
+    cudaCheckError();
 
-    copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE,
+            cudaMemcpyDeviceToHost, __LINE__);
 
 #ifdef __DEBUG_CUDA__
     fprintf (stderr, "Norm of the array is %e \n", bnorm );
@@ -302,138 +304,158 @@ int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x )
     /* apply the diagonal pre-conditioner to rhs */
     GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
         (workspace->b_prc, b, workspace->Hdia_inv, N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     /* GMRES outer-loop */
     for( itr = 0; itr < MAX_ITR; ++itr ) {
         /* calculate r0 */
         //Sparse_MatVec( H, x, workspace->b_prm );      
-        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>>
+            ( *H, x, workspace->b_prm, N );
+        cudaThreadSynchronize();
+        cudaCheckError();
 
         GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
             (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        cudaThreadSynchronize();
+        cudaCheckError();
 
         Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
-            (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-        cudaThreadSynchronize ();
+            (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,
+             workspace->b_prc, -1., workspace->b_prm, N);
+        cudaThreadSynchronize();
         cudaCheckError ();
 
         //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
         {
-            cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+            cuda_memset( spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
 
             Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
                 (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            cudaThreadSynchronize();
+            cudaCheckError();
 
-            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+                (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
 
-            copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
+            copy_host_device( g, workspace->g, REAL_SIZE,
+                    cudaMemcpyDeviceToHost, RES_STORAGE_G);
         }
 
-        Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-            ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>>
+            ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0],
+              &workspace->v[index_wkspace_sys(0,0,N)], N );
+        cudaThreadSynchronize();
+        cudaCheckError();
 
         /* GMRES inner-loop */
 #ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
+        fprintf( stderr,
+                " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm,
+                tol, g[0] );
 #endif
+
         for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
             /* matvec */
             //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-            Cuda_Matvec_csr 
-                <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> 
-                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            Cuda_Matvec_csr<<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> 
+                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)],
+                  &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
+            cudaThreadSynchronize();
+            cudaCheckError();
 
-            GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-                (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            GMRES_Diagonal_Preconditioner<<<BLOCKS, BLOCK_SIZE>>>
+                (&workspace->v[ index_wkspace_sys (j+1,0,N) ],
+                 &workspace->v[ index_wkspace_sys (j+1,0,N) ],
+                 workspace->Hdia_inv, N );
+            cudaThreadSynchronize();
+            cudaCheckError();
 
 
             /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i <= j; i++ ) {
+            for( i = 0; i <= j; i++ )
+            {
                 Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-                    (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
-                cudaThreadSynchronize ();
-                cudaCheckError ();
+                    (&workspace->v[index_wkspace_sys(i,0,N)],
+                     &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
+                cudaThreadSynchronize();
+                cudaCheckError();
 
-                Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
-                cudaThreadSynchronize ();
-                cudaCheckError ();
+                Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+                    (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
+                cudaThreadSynchronize();
+                cudaCheckError();
 
                 copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
 
-                Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
+                Cuda_Vector_Add<<< BLOCKS, BLOCK_SIZE >>>
                     ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
                       -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-                cudaThreadSynchronize ();
-                cudaCheckError ();
+                cudaThreadSynchronize();
+                cudaCheckError();
             }
 
 
             //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-            cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
+            cuda_memset(spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
 
-            Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            Cuda_Norm<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+                (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
 
-            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            Cuda_Norm<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+                (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
 
-            copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            copy_host_device(&v_add_tmp,
+                    &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
 
-            Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+            Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>>
                 ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
                   1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            cudaThreadSynchronize();
+            cudaCheckError();
 
             /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            GMRES_Givens_Rotation <<<1, 1>>>
+            GMRES_Givens_Rotation<<<1, 1>>>
                 (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            cudaThreadSynchronize();
+            cudaCheckError();
+            copy_host_device(&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
         }
 
-        copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+        copy_host_device(g, workspace->g, (RESTART+1)*REAL_SIZE,
+                cudaMemcpyHostToDevice, __LINE__);
 
         /* solve Hy = g.
            H is now upper-triangular, do back-substitution */
-        copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-        GMRES_BackSubstitution <<<1, 1>>>
+        copy_host_device(g, spad, (RESTART+1) * REAL_SIZE,
+                cudaMemcpyHostToDevice, RES_STORAGE_G);
+        GMRES_BackSubstitution<<<1, 1>>>
             (j, spad, workspace->h, workspace->y);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        cudaThreadSynchronize();
+        cudaCheckError();
 
         /* update x = x_0 + Vy */
         for( i = 0; i < j; i++ )
         {
-            copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            copy_host_device(&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
             Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
                 ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
             cudaThreadSynchronize ();
-            cudaCheckError ();
+            cudaCheckError();
         }
 
         /* stopping condition */
         if( fabs(g[j]) / bnorm <= tol )
+        {
             break;
+        }
     }
 
     if( itr >= MAX_ITR ) {
@@ -444,6 +466,7 @@ int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x )
 #ifdef __DEBUG_CUDA__
     fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
 #endif
+
     return itr * (RESTART+1) + j + 1;
 }
 
@@ -679,9 +702,11 @@ int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real t
 #ifdef __DEBUG_CUDA__
     fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
 #endif
+
     return itr * (RESTART+1) + j + 1;
 }
 
+
 int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, 
         real *b, real tol, real *x, FILE *fout, reax_system *system)
 {
@@ -874,7 +899,8 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol,
     bnorm = Norm( b, N );
 
     /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
+    for( itr = 0; itr < MAX_ITR; ++itr )
+    {
         /* calculate r0 */
         Sparse_MatVec( H, x, workspace->b_prm );      
         Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N );
@@ -885,14 +911,18 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol,
         //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
 
         /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
+        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
+        {
             /* matvec */
             Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] );
             Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
             Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
 
             /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0;
+            for( i = 0; i < j-1; i++ )
+            {
+                workspace->h[ index_wkspace_res (i,j)] = 0;
+            }
 
             //for( i = 0; i <= j; i++ ) {
             for( i = MAX(j-1,0); i <= j; i++ ) {
@@ -906,8 +936,10 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol,
             // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
 
             /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = MAX(j-1,0); i <= j; i++ )    {
-                if( i == j ) {
+            for( i = MAX(j-1,0); i <= j; i++ )
+            {
+                if( i == j )
+                {
                     cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
                     workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
                     workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
@@ -937,10 +969,13 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol,
 
 
         /* solve Hy = g: H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
+        for( i = j-1; i >= 0; i-- )
+        {
             temp = workspace->g[i];      
             for( k = j-1; k > i; k-- )
+            {
                 temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+            }
 
             workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
         }
@@ -955,184 +990,186 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol,
 
         /* stopping condition */
         if( fabs(workspace->g[j]) / bnorm <= tol )
+        {
             break;
         }
+    }
 
-        // Sparse_MatVec( H, x, workspace->b_prm );
-        // for( i = 0; i < N; ++i )
-        // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-        // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-        // for( i = 0; i < N; ++i )
-        // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-        // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-        // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-        //          itr, j, fabs( workspace->g[j] ) / bnorm );
-        // data->timing.matvec += itr * RESTART + j;
+    // Sparse_MatVec( H, x, workspace->b_prm );
+    // for( i = 0; i < N; ++i )
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+    // for( i = 0; i < N; ++i )
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
 
-        if( itr >= MAX_ITR ) {
-            fprintf( stderr, "GMRES convergence failed\n" );
-            // return -1;
-            return itr * (RESTART+1) + j + 1;
-        }
+    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+    //          itr, j, fabs( workspace->g[j] ) / bnorm );
+    // data->timing.matvec += itr * RESTART + j;
 
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        // return -1;
         return itr * (RESTART+1) + j + 1;
     }
 
+    return itr * (RESTART+1) + j + 1;
+}
 
 
-    int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
-            sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
-    {
-        int  i, N;
-        real tmp, alpha, beta, b_norm, r_norm;
-        real sig0, sig_old, sig_new;
-
-        N = A->n;
-        b_norm = Norm( b, N );
-        //fprintf( stderr, "b_norm: %.15e\n", b_norm );
 
-        Sparse_MatVec( A, x, workspace->q );
-        Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
+        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
+{
+    int  i, N;
+    real tmp, alpha, beta, b_norm, r_norm;
+    real sig0, sig_old, sig_new;
+
+    N = A->n;
+    b_norm = Norm( b, N );
+    //fprintf( stderr, "b_norm: %.15e\n", b_norm );
+
+    Sparse_MatVec( A, x, workspace->q );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+    r_norm = Norm(workspace->r, N);
+    //Print_Soln( workspace, x, q, b, N );
+    //fprintf( stderr, "res: %.15e\n", r_norm );
+
+    Forward_Subs( L, workspace->r, workspace->d );
+    Backward_Subs( U, workspace->d, workspace->p );
+    sig_new = Dot( workspace->r, workspace->p, N );
+    sig0 = sig_new;
+
+    for( i = 0; i < 200 && r_norm/b_norm > tol; ++i )
+    {
+        //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
+        Sparse_MatVec( A, workspace->p, workspace->q );
+        tmp = Dot( workspace->q, workspace->p, N );
+        alpha = sig_new / tmp;
+        Vector_Add( x, alpha, workspace->p, N );
+        //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
+        //     i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp );
+
+        Vector_Add( workspace->r, -alpha, workspace->q, N );
         r_norm = Norm(workspace->r, N);
-        //Print_Soln( workspace, x, q, b, N );
         //fprintf( stderr, "res: %.15e\n", r_norm );
 
         Forward_Subs( L, workspace->r, workspace->d );
-        Backward_Subs( U, workspace->d, workspace->p );
-        sig_new = Dot( workspace->r, workspace->p, N );
-        sig0 = sig_new;
-
-        for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) {
-            //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
-            Sparse_MatVec( A, workspace->p, workspace->q );
-            tmp = Dot( workspace->q, workspace->p, N );
-            alpha = sig_new / tmp;
-            Vector_Add( x, alpha, workspace->p, N );
-            //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
-            //     i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp );
-
-            Vector_Add( workspace->r, -alpha, workspace->q, N );
-            r_norm = Norm(workspace->r, N);
-            //fprintf( stderr, "res: %.15e\n", r_norm );
-
-            Forward_Subs( L, workspace->r, workspace->d );
-            Backward_Subs( U, workspace->d, workspace->d );
-            sig_old = sig_new;
-            sig_new = Dot( workspace->r, workspace->d, N );
-            beta = sig_new / sig_old;
-            Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N );
-        }
-
-        //fprintf( fout, "CG took %d iterations\n", i );
-        if( i >= 200 ) {
-            fprintf( stderr, "CG convergence failed!\n" );
-            return i;
-        }
+        Backward_Subs( U, workspace->d, workspace->d );
+        sig_old = sig_new;
+        sig_new = Dot( workspace->r, workspace->d, N );
+        beta = sig_new / sig_old;
+        Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N );
+    }
 
+    //fprintf( fout, "CG took %d iterations\n", i );
+    if( i >= 200 ) {
+        fprintf( stderr, "CG convergence failed!\n" );
         return i;
-        }
+    }
 
+    return i;
+}
 
-        int CG( static_storage *workspace, sparse_matrix *H, 
-                real *b, real tol, real *x, FILE *fout, reax_system *system)
-        {
-            int  i, j, N;
-            real tmp, alpha, beta, b_norm;
-            real sig_old, sig_new, sig0;
-
-            N = H->n;
-            b_norm = Norm( b, N );
-            //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
-
-            Sparse_MatVec( H, x, workspace->q );
-            Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-            for( j = 0; j < N; ++j )
-                workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-            sig_new = Dot( workspace->r, workspace->d, N );
-            sig0 = sig_new;
-            //Print_Soln( workspace, x, q, b, N );
-            //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
-            // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
-            //fprintf( stderr, "sig_new: %f\n", sig_new );
-
-            for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
-                //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
-                Sparse_MatVec( H, workspace->d, workspace->q );
-                tmp = Dot( workspace->d, workspace->q, N );
-                //fprintf( stderr, "tmp: %f\n", tmp );
-                alpha = sig_new / tmp;    
-                Vector_Add( x, alpha, workspace->d, N );
-                //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-                //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-
-                Vector_Add( workspace->r, -alpha, workspace->q, N );    
-                for( j = 0; j < N; ++j )
-                    workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-                sig_old = sig_new;
-                sig_new = Dot( workspace->r, workspace->p, N );
-                beta = sig_new / sig_old;
-                Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N );
-                //fprintf( stderr, "sig_new: %f\n", sig_new );
-            }
 
-            fprintf( stderr, "CG took %d iterations\n", i );
+int CG( static_storage *workspace, sparse_matrix *H, 
+        real *b, real tol, real *x, FILE *fout, reax_system *system)
+{
+    int  i, j, N;
+    real tmp, alpha, beta, b_norm;
+    real sig_old, sig_new, sig0;
 
-            if( i >= 300 ) {
-                fprintf( stderr, "CG convergence failed!\n" );
-                return i;
-            }
+    N = H->n;
+    b_norm = Norm( b, N );
+    //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
+
+    Sparse_MatVec( H, x, workspace->q );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+    for( j = 0; j < N; ++j )
+        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
+    sig_new = Dot( workspace->r, workspace->d, N );
+    sig0 = sig_new;
+    //Print_Soln( workspace, x, q, b, N );
+    //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
+    // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
+    //fprintf( stderr, "sig_new: %f\n", sig_new );
+
+    for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
+        //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
+        Sparse_MatVec( H, workspace->d, workspace->q );
+        tmp = Dot( workspace->d, workspace->q, N );
+        //fprintf( stderr, "tmp: %f\n", tmp );
+        alpha = sig_new / tmp;    
+        Vector_Add( x, alpha, workspace->d, N );
+        //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
+        //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
+
+        Vector_Add( workspace->r, -alpha, workspace->q, N );    
+        for( j = 0; j < N; ++j )
+            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
+        sig_old = sig_new;
+        sig_new = Dot( workspace->r, workspace->p, N );
+        beta = sig_new / sig_old;
+        Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N );
+        //fprintf( stderr, "sig_new: %f\n", sig_new );
+    }
 
-            return i;
-            }
+    fprintf( stderr, "CG took %d iterations\n", i );
+
+    if( i >= 300 ) {
+        fprintf( stderr, "CG convergence failed!\n" );
+        return i;
+    }
 
+    return i;
+}
 
 
-            /* Steepest Descent */
-            int SDM( static_storage *workspace, sparse_matrix *H, 
-                    real *b, real tol, real *x, FILE *fout )
-            {
-                int  i, j, N;
-                real tmp, alpha, beta, b_norm;
-                real sig0, sig;
 
-                N = H->n;
-                b_norm = Norm( b, N );
-                //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
+/* Steepest Descent */
+int SDM( static_storage *workspace, sparse_matrix *H, 
+        real *b, real tol, real *x, FILE *fout )
+{
+    int  i, j, N;
+    real tmp, alpha, beta, b_norm;
+    real sig0, sig;
 
-                Sparse_MatVec( H, x, workspace->q );
-                Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-                for( j = 0; j < N; ++j )
-                    workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    N = H->n;
+    b_norm = Norm( b, N );
+    //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
 
-                sig = Dot( workspace->r, workspace->d, N );
-                sig0 = sig;
+    Sparse_MatVec( H, x, workspace->q );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+    for( j = 0; j < N; ++j )
+        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
 
-                for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
-                    Sparse_MatVec( H, workspace->d, workspace->q );
+    sig = Dot( workspace->r, workspace->d, N );
+    sig0 = sig;
 
-                    sig = Dot( workspace->r, workspace->d, N );
-                    tmp = Dot( workspace->d, workspace->q, N );
-                    alpha = sig / tmp;    
+    for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
+        Sparse_MatVec( H, workspace->d, workspace->q );
 
-                    Vector_Add( x, alpha, workspace->d, N );
-                    Vector_Add( workspace->r, -alpha, workspace->q, N );
-                    for( j = 0; j < N; ++j )
-                        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        sig = Dot( workspace->r, workspace->d, N );
+        tmp = Dot( workspace->d, workspace->q, N );
+        alpha = sig / tmp;    
 
-                    //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-                    //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-                }
+        Vector_Add( x, alpha, workspace->d, N );
+        Vector_Add( workspace->r, -alpha, workspace->q, N );
+        for( j = 0; j < N; ++j )
+            workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
 
-                fprintf( stderr, "SDM took %d iterations\n", i );
+        //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
+        //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
+    }
 
-                if( i >= 300 ) {
-                    fprintf( stderr, "SDM convergence failed!\n" );
-                    return i;
-                }
+    fprintf( stderr, "SDM took %d iterations\n", i );
 
-                return i;
-            }
+    if( i >= 300 ) {
+        fprintf( stderr, "SDM convergence failed!\n" );
+        return i;
+    }
 
+    return i;
+}
diff --git a/PuReMD-GPU/src/QEq.cu b/PuReMD-GPU/src/QEq.cu
index 5d849b26..06f708cc 100644
--- a/PuReMD-GPU/src/QEq.cu
+++ b/PuReMD-GPU/src/QEq.cu
@@ -34,6 +34,7 @@
 
 #include "system_props.h"
 
+
 HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) 
 {
     sparse_matrix_entry temp = array[index1];
@@ -41,6 +42,7 @@ HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2)
     array[index2] = temp;
 }
 
+
 HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
 {
     int i = start;
@@ -62,6 +64,7 @@ HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
     }  
 }
 
+
 int compare_matrix_entry(const void *v1, const void *v2)
 {
     return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
@@ -80,6 +83,7 @@ void Sort_Matrix_Rows( sparse_matrix *A )
     }
 }
 
+
 GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
 {
     int i;
@@ -129,10 +133,11 @@ void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
     //fprintf( stderr, "\n" );
 }
 
+
 GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int k, j, offset, x, diagnol;
+    int k, j, offset, x, diagonal;
     real val;
     sparse_matrix *A = &p_A;
 
@@ -152,10 +157,11 @@ GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol
 
 }
 
+
 GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int k, j, offset, x, diagnol;
+    int k, j, offset, x, diagonal;
     real val;
     sparse_matrix *A = &p_A;
 
@@ -171,17 +177,18 @@ GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real d
     }
 }
 
-GLOBAL void Cuda_Calculate_Droptol_diagnol ( sparse_matrix p_A, real *droptol, real dtol )
+
+GLOBAL void Cuda_Calculate_Droptol_diagonal ( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int k, j, offset, x, diagnol;
+    int k, j, offset, x, diagonal;
     real val;
     sparse_matrix *A = &p_A;
 
     if ( i < A->n ) {
-        //diagnol element
-        diagnol = A->end[i]-1;
-        val = A->entries [diagnol].val;
+        //diagonal element
+        diagonal = A->end[i]-1;
+        val = A->entries [diagonal].val;
         droptol [i] += val*val;
     }
 
@@ -213,6 +220,7 @@ int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
     return fillin + A->n;
 }
 
+
 GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin)
 {
     int i, j, pj;
@@ -233,6 +241,7 @@ GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *filli
     }
 }
 
+
 void ICHOLT( sparse_matrix *A, real *droptol, 
         sparse_matrix *L, sparse_matrix *U )
 {
@@ -338,7 +347,6 @@ void ICHOLT( sparse_matrix *A, real *droptol,
 }
 
 
-
 void Cuda_ICHOLT( sparse_matrix *A, real *droptol, 
         sparse_matrix *L, sparse_matrix *U )
 {
@@ -489,7 +497,7 @@ end = A->end[i]-1; //inclusive
 count = end - start; //inclusive
 tmp_val [kid] = 0;
 
-if (kid < count) //diagnol not included
+if (kid < count) //diagonal not included
 {
 pj = start + kid;
 
@@ -540,7 +548,7 @@ if (kid == 0)
     }
 }
 
-//diagnol element
+//diagonal element
 //val = A->entries[pj].val;
 //for( k1 = 0; k1 < tmptop; ++k1 )
 if (kid < count) 
@@ -612,7 +620,8 @@ void Init_MatVec( reax_system *system, control_params *control,
     //char fname[100];
 
     if(control->refactor > 0 && 
-            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){
+            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL))
+    {
         //Print_Linear_System( system, control, workspace, data->step );
         Sort_Matrix_Rows( &workspace->H );
 
@@ -621,17 +630,21 @@ void Init_MatVec( reax_system *system, control_params *control,
         Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
         //fprintf( stderr, "drop tolerances calculated\n" );
 
-
-        if( workspace->L.entries == NULL ) {
+        if( workspace->L.entries == NULL )
+        {
             fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
+
 #ifdef __DEBUG_CUDA__
             fprintf( stderr, "fillin = %d\n", fillin );
 #endif
+
             if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
-                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){
+                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 )
+            {
                 fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
                 exit(INSUFFICIENT_SPACE);
             }
+
 #if defined(DEBUG_FOCUS)
             fprintf( stderr, "fillin = %d\n", fillin );
             fprintf( stderr, "allocated memory: L = U = %ldMB\n",
@@ -689,43 +702,44 @@ void Init_MatVec( reax_system *system, control_params *control,
     }
 }
 
-void Cuda_Init_MatVec(     reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list *far_nbrs )
+
+void Cuda_Init_MatVec(reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list *far_nbrs )
 {
     int i, fillin;
     real s_tmp, t_tmp;
     int *spad = (int *)scratch;
     real start = 0, end = 0;
 
-    if(control->refactor > 0 && 
-            ((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){
-
-        Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
+    if( control->refactor > 0 && 
+            ((data->step-data->prev_steps)%control->refactor==0 ||
+             dev_workspace->L.entries==NULL) )
+    {
+        Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        cudaThreadSynchronize();
+        cudaCheckError();
 
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, "Sorting done... \n");
 #endif
 
-        Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>>
+        Cuda_Calculate_Droptol<<<BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H, dev_workspace->droptol, control->droptol );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        cudaThreadSynchronize();
+        cudaCheckError();
 
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, "Droptol done... \n");
 #endif
 
-        if( dev_workspace->L.entries == NULL ) {
-
+        if( dev_workspace->L.entries == NULL )
+        {
             cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
             Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
                 ( dev_workspace->H, dev_workspace->droptol, spad );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
+            cudaThreadSynchronize();
+            cudaCheckError();
 
             //Reduction for fill in 
             Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
@@ -789,16 +803,36 @@ void Cuda_Init_MatVec(     reax_system *system, control_params *control,
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
 #endif
+
         start = Get_Time ();
-        if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);}
-        if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); }
-        if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); }
+        if(!Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m))
+        {
+            fprintf(stderr, "No space for H matrix \n");
+            exit(0);
+        }
+        if(!Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m))
+        {
+            fprintf(stderr, "No space for L matrix \n");
+            exit(0);
+        }
+        if(!Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m))
+        {
+            fprintf(stderr, "No space for U matrix \n");
+            exit(0);
+        }
 
-        copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
+        copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE *
+                (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost,
+                RES_SPARSE_MATRIX_INDEX );
+        copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE *
+                (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost,
+                RES_SPARSE_MATRIX_INDEX );
+        copy_host_device ( t_H.entries, dev_workspace->H.entries,
+                SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m,
+                cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
 
-        copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
+        copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE *
+                system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
 
         //fprintf (stderr, " Done copying LUH .. \n");
         Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U);
@@ -828,6 +862,7 @@ void Cuda_Init_MatVec(     reax_system *system, control_params *control,
     }
 }
 
+
 GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
 {
 
@@ -873,6 +908,7 @@ GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
     workspace->t[index_wkspace_sys(0,i,N)] = t_tmp;
 }
 
+
 void Calculate_Charges( reax_system *system, static_storage *workspace )
 {
     int i;
@@ -891,17 +927,25 @@ void Calculate_Charges( reax_system *system, static_storage *workspace )
 #endif
 
     for( i = 0; i < system->N; ++i )
+    {
         system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)];
+    }
 }
 
+
 GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N)
 {
     int i = blockIdx.x*blockDim.x + threadIdx.x;
-    if (i >= N) return;
+
+    if (i >= N)
+    {
+        return;
+    }
 
     atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)];
 }
 
+
 void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
 {
     real *spad = (real *) scratch;
@@ -1018,6 +1062,7 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     //Print_Charges( system, control, workspace, data->step );
 }
 
+
 void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, 
         static_storage *workspace, list *far_nbrs, 
         output_controls *out_control )
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 835ffa35..d08dfe1e 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -36,7 +36,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
     real H;
 
     for ( i = 0; i < N; ++i )
+    {
         b[i][0] = b[i][1] = 0;
+    }
 
     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -64,7 +66,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 
 
 int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-             rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
+        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
@@ -86,13 +88,17 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         t_start = Get_Time( );
     }
 #endif
+
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
     dual_Sparse_MatVec( H, x, workspace->q2, N );
     // tryQEq
     Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
 #endif
 
     for ( j = 0; j < system->n; ++j )
@@ -126,6 +132,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
     //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
         Update_Timing_Info( &t_start, &dot_time );
@@ -137,9 +144,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
         // tryQEq
         Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif
 
         /* dot product: d.q */
@@ -174,12 +184,18 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
         //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
+
         if ( sqrt(sig_new[0]) / b_norm[0] <= tol || sqrt(sig_new[1]) / b_norm[1] <= tol )
+        {
             break;
+        }
 
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
@@ -194,30 +210,41 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     if ( sqrt(sig_new[0]) / b_norm[0] <= tol )
     {
         for ( j = 0; j < n; ++j )
+        {
             workspace->t[j] = workspace->x[j][1];
-        matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
-                      mpi_data, fout );
+        }
+        matvecs = CG( system, workspace, H, workspace->b_t, tol,
+                workspace->t, mpi_data, fout );
         for ( j = 0; j < n; ++j )
+        {
             workspace->x[j][1] = workspace->t[j];
+        }
     }
     else if ( sqrt(sig_new[1]) / b_norm[1] <= tol )
     {
         for ( j = 0; j < n; ++j )
+        {
             workspace->s[j] = workspace->x[j][0];
+        }
         matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-                      mpi_data, fout );
+                mpi_data, fout );
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->x[j][0] = workspace->s[j];
+        }
     }
 
-
     if ( i >= 300 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
+    }
 
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
-                 i + 1, matvecs, matvec_time, dot_time );
+    {
+        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n", i + 1,
+                matvecs, matvec_time, dot_time );
+    }
 #endif
 
     return (i + 1) + matvecs;
@@ -230,7 +257,9 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
     real H;
 
     for ( i = 0; i < N; ++i )
+    {
         b[i] = 0;
+    }
 
     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -249,8 +278,8 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 }
 
 
-int CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;
@@ -269,21 +298,29 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
     Sparse_MatVec( H, x, workspace->q, system->N );
     // tryQEq
     Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
 #endif
 
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
     for ( j = 0; j < system->n; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    }
 
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
     sig0 = sig_new;
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
 #endif
 
     for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i )
@@ -292,9 +329,12 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Sparse_MatVec( H, workspace->d, workspace->q, system->N );
         //tryQEq
         Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif
 
         tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
@@ -303,15 +343,20 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
         /* pre-conditioning */
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
 
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
         beta = sig_new / sig_old;
         Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
     }
 
@@ -323,8 +368,10 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
-        fprintf( fout, "QEq %d iters. matvecs: %f  dot: %f\n",
-                 i, matvec_time, dot_time );
+    {
+        fprintf( fout, "QEq %d iters. matvecs: %f  dot: %f\n", i, matvec_time,
+                dot_time );
+    }
 #endif
 
     return i;
@@ -332,7 +379,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
-             real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;
-- 
GitLab