From 21aca7ac2c19d68959748bac2e6343ef62dfaef2 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu>
Date: Sat, 25 Feb 2017 18:34:26 -0500
Subject: [PATCH] Complete ACKS2 implementation. Fix ILU_PAR/ILUT_PAR diagonal
 scalings bugs (absolute value).

---
 sPuReMD/src/allocate.c    |   4 +-
 sPuReMD/src/charges.c     | 426 +++++++++++++++++++++++++++++++++-----
 sPuReMD/src/control.c     |   5 +-
 sPuReMD/src/ffield.c      |   1 +
 sPuReMD/src/forces.c      | 158 +++++++++++++-
 sPuReMD/src/init_md.c     |  41 +++-
 sPuReMD/src/mytypes.h     |   3 +-
 sPuReMD/src/print_utils.c |   2 +-
 sPuReMD/src/testmd.c      |   7 +-
 9 files changed, 581 insertions(+), 66 deletions(-)

diff --git a/sPuReMD/src/allocate.c b/sPuReMD/src/allocate.c
index ed2c07a4..ab075f94 100644
--- a/sPuReMD/src/allocate.c
+++ b/sPuReMD/src/allocate.c
@@ -87,8 +87,8 @@ int Allocate_Matrix( sparse_matrix **pH, int n, int m )
     H->n = n;
     H->m = m;
 
-    if ( (H->start = (unsigned int*) malloc(sizeof(int) * (n + 1))) == NULL
-            || (H->j = (unsigned int*) malloc(sizeof(int) * m)) == NULL
+    if ( (H->start = (unsigned int*) malloc(sizeof(unsigned int) * (n + 1))) == NULL
+            || (H->j = (unsigned int*) malloc(sizeof(unsigned int) * m)) == NULL
             || (H->val = (real*) malloc(sizeof(real) * m)) == NULL )
     {
         return FAILURE;
diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 2f6c5710..ecd2827e 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -582,12 +582,18 @@ static real diag_pre_comp( const reax_system * const system, real * const Hdia_i
     start = Get_Time( );
 
     #pragma omp parallel for schedule(static) \
-    default(none) private(i)
+        default(none) private(i)
     for ( i = 0; i < system->N; ++i )
     {
         Hdia_inv[i] = 1.0 / system->reaxprm.sbp[system->atoms[i].type].eta;
     }
-    Hdia_inv[system->N_cm - 1] = 1.0;
+
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i)
+    for ( i = system->N; i < system->N_cm; ++i )
+    {
+        Hdia_inv[i] = 1.0;
+    }
 
     return Get_Timing_Info( start );
 }
@@ -595,21 +601,21 @@ static real diag_pre_comp( const reax_system * const system, real * const Hdia_i
 
 /* Incomplete Cholesky factorization with dual thresholding */
 static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
-                    sparse_matrix * const L, sparse_matrix * const U )
+        sparse_matrix * const L, sparse_matrix * const U )
 {
     int *tmp_j;
     real *tmp_val;
     int i, j, pj, k1, k2, tmptop, Ltop;
     real val, start;
-    int *Utop;
+    unsigned int *Utop;
 
     start = Get_Time( );
 
-    if ( ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ||
+    if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
             ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
             ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
     {
-        fprintf( stderr, "not enough memory for ICHOLT preconditioning matrices. terminating.\n" );
+        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
         exit( INSUFFICIENT_MEMORY );
     }
 
@@ -620,9 +626,6 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
     memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
     memset( Utop, 0, A->n * sizeof(unsigned int) );
 
-//    fprintf( stderr, "n: %d\n", A->n );
-//    fflush( stderr );
-
     for ( i = 0; i < A->n; ++i )
     {
         L->start[i] = Ltop;
@@ -633,9 +636,6 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
             j = A->j[pj];
             val = A->val[pj];
 
-//            fprintf( stderr, "  i: %d, j: %d\n", i, j );
-//            fflush( stderr );
-
             if ( FABS(val) > droptol[i] )
             {
                 k1 = 0;
@@ -664,15 +664,12 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
                 tmp_val[tmptop] = val;
                 ++tmptop;
             }
-
-//            fprintf( stderr, " -- done\n" );
-//            fflush( stderr );
         }
 
         // sanity check
         if ( A->j[pj] != i )
         {
-            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
+            fprintf( stderr, "[ICHOLT] badly built A matrix!\n (i = %d) ", i );
             exit( NUMERIC_BREAKDOWN );
         }
 
@@ -683,8 +680,17 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
             val -= (tmp_val[k1] * tmp_val[k1]);
         }
 
+#if defined(DEBUG)
+        if ( val < 0.0 )
+        {
+            fprintf( stderr, "[ICHOLT] Numeric breakdown (SQRT of negative on diagonal i = %d). Terminating.\n", i );
+            exit( NUMERIC_BREAKDOWN );
+
+        }
+#endif
+
         tmp_j[tmptop] = i;
-        tmp_val[tmptop] = SQRT(val);
+        tmp_val[tmptop] = SQRT( val );
 
         // apply the dropping rule once again
         //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
@@ -854,7 +860,7 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
                 /* sanity check */
                 if ( sum < ZERO )
                 {
-                    fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n");
+                    fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n");
 #if defined(DEBUG_FOCUS)
                     fprintf( stderr, "A(%5d,%5d) = %10.3f\n",
                              k - 1, A->entries[j].j, A->entries[j].val );
@@ -929,7 +935,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
             ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
             ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
     {
-        fprintf( stderr, "not enough memory for ILU_PAR preconditioning matrices. terminating.\n" );
+        fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
         exit( INSUFFICIENT_MEMORY );
     }
 
@@ -937,12 +943,13 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     default(none) shared(D, D_inv) private(i)
     for ( i = 0; i < A->n; ++i )
     {
-        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
         D[i] = 1.0 / D_inv[i];
+//        printf( "A->val[%8d] = %f, D[%4d] = %f, D_inv[%4d] = %f\n", A->start[i + 1] - 1, A->val[A->start[i + 1] - 1], i, D[i], i, D_inv[i] );
     }
 
     /* to get convergence, A must have unit diagonal, so apply
-     * transformation DAD, where D = D(1./sqrt(D(A))) */
+     * transformation DAD, where D = D(1./sqrt(abs(D(A)))) */
     memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
     #pragma omp parallel for schedule(static) \
     default(none) shared(DAD, D) private(i, pj)
@@ -1146,7 +1153,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
     default(none) shared(D, D_inv) private(i)
     for ( i = 0; i < A->n; ++i )
     {
-        D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] );
+        D_inv[i] = SQRT( FABS( A->val[A->start[i + 1] - 1] ) );
         D[i] = 1.0 / D_inv[i];
     }
 
@@ -1461,7 +1468,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 {
     sparse_matrix *Hptr;
 
-    if ( control->cm_domain_sparsify_enabled )
+    if ( control->cm_domain_sparsify_enabled == TRUE )
     {
         Hptr = workspace->H_sp;
     }
@@ -1611,9 +1618,9 @@ static void Compute_Preconditioner_EEM( const reax_system * const system,
         switch ( control->cm_solver_pre_app_type )
         {
             case TRI_SOLVE_PA:
-                tri_solve( workspace->L_EEM, ones, x, system->N, LOWER );
+                tri_solve( workspace->L_EEM, ones, x, workspace->L_EEM->n, LOWER );
                 Transpose_I( workspace->U_EEM );
-                tri_solve( workspace->U_EEM, ones, y, system->N, LOWER );
+                tri_solve( workspace->U_EEM, ones, y, workspace->U_EEM->n, LOWER );
                 Transpose_I( workspace->U_EEM );
 
                 memcpy( workspace->L->start, workspace->L_EEM->start, sizeof(unsigned int) * (system->N + 1) );
@@ -1659,9 +1666,9 @@ static void Compute_Preconditioner_EEM( const reax_system * const system,
                 break;
 
             case TRI_SOLVE_LEVEL_SCHED_PA:
-                tri_solve_level_sched( workspace->L_EEM, ones, x, system->N, LOWER, TRUE );
+                tri_solve_level_sched( workspace->L_EEM, ones, x, workspace->L_EEM->n, LOWER, TRUE );
                 Transpose_I( workspace->U_EEM );
-                tri_solve_level_sched( workspace->U_EEM, ones, y, system->N, LOWER, TRUE );
+                tri_solve_level_sched( workspace->U_EEM, ones, y, workspace->U_EEM->n, LOWER, TRUE );
                 Transpose_I( workspace->U_EEM );
 
                 memcpy( workspace->L->start, workspace->L_EEM->start, sizeof(unsigned int) * (system->N + 1) );
@@ -1708,9 +1715,9 @@ static void Compute_Preconditioner_EEM( const reax_system * const system,
 
             //TODO: add Jacobi iter, etc.?
             default:
-                tri_solve( workspace->L_EEM, ones, x, system->N, LOWER );
+                tri_solve( workspace->L_EEM, ones, x, workspace->L_EEM->n, LOWER );
                 Transpose_I( workspace->U_EEM );
-                tri_solve( workspace->U_EEM, ones, y, system->N, LOWER );
+                tri_solve( workspace->U_EEM, ones, y, workspace->U_EEM->n, LOWER );
                 Transpose_I( workspace->U_EEM );
 
                 memcpy( workspace->L->start, workspace->L_EEM->start, sizeof(unsigned int) * (system->N + 1) );
@@ -1778,6 +1785,99 @@ static void Compute_Preconditioner_EEM( const reax_system * const system,
 }
 
 
+/* Compute preconditioner for ACKS2
+ */
+static void Compute_Preconditioner_ACKS2( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+//    fprintf( stderr, "(%4d, %4d): %f\n", system->N, Hptr->j[Hptr->start[system->N + 1] - 1], Hptr->val[Hptr->start[system->N + 1] - 1] );
+//    fprintf( stderr, "(%4d, %4d): %f\n", system->N_cm - 1, Hptr->j[Hptr->start[system->N_cm] - 1], Hptr->val[Hptr->start[system->N_cm] - 1] );
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+//    fprintf( stderr, "(%4d, %4d): %f\n", system->N, Hptr->j[Hptr->start[system->N + 1] - 1], Hptr->val[Hptr->start[system->N + 1] - 1] );
+//    fprintf( stderr, "(%4d, %4d): %f\n", system->N_cm - 1, Hptr->j[Hptr->start[system->N_cm] - 1], Hptr->val[Hptr->start[system->N_cm] - 1] );
+//    fflush( stderr );
+    
+    switch ( control->cm_solver_pre_comp_type )
+    {
+    case DIAG_PC:
+        data->timing.cm_solver_pre_comp +=
+            diag_pre_comp( system, workspace->Hdia_inv );
+        break;
+
+    case ICHOLT_PC:
+        data->timing.cm_solver_pre_comp +=
+            ICHOLT( Hptr, workspace->droptol, workspace->L, workspace->U );
+        break;
+
+    case ILU_PAR_PC:
+        data->timing.cm_solver_pre_comp +=
+            ILU_PAR( Hptr, control->cm_solver_pre_comp_sweeps, workspace->L, workspace->U );
+        break;
+
+    case ILUT_PAR_PC:
+        data->timing.cm_solver_pre_comp +=
+            ILUT_PAR( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
+                    workspace->L, workspace->U );
+        break;
+
+    case ILU_SUPERLU_MT_PC:
+#if defined(HAVE_SUPERLU_MT)
+        data->timing.cm_solver_pre_comp +=
+            SuperLU_Factorize( Hptr, workspace->L, workspace->U );
+#else
+        fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" );
+        exit( INVALID_INPUT );
+#endif
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+
+#if defined(DEBUG)
+    if ( control->cm_solver_pre_comp_type != DIAG_PC )
+    {
+        fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) );
+
+#if defined(DEBUG_FOCUS)
+        sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        Print_Sparse_Matrix2( workspace->L, fname );
+        sprintf( fname, "%s.U%d.out", control->sim_name, data->step );
+        Print_Sparse_Matrix2( workspace->U, fname );
+
+        fprintf( stderr, "icholt-" );
+        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        //Print_Sparse_Matrix2( workspace->L, fname );
+        //Print_Sparse_Matrix( U );
+#endif
+    }
+#endif
+}
+
+
 /* Setup routines before computing the preconditioner for QEq
  */
 static void Setup_Preconditioner_QEq( const reax_system * const system,
@@ -1786,10 +1886,10 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
         const list * const far_nbrs )
 {
     int i, fillin;
-    real s_tmp, t_tmp, time;
+    real time;
     sparse_matrix *Hptr;
 
-    if (control->cm_domain_sparsify_enabled)
+    if ( control->cm_domain_sparsify_enabled == TRUE )
     {
         Hptr = workspace->H_sp;
     }
@@ -1830,7 +1930,7 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
     case DIAG_PC:
         if ( workspace->Hdia_inv == NULL )
         {
-            if ( ( workspace->Hdia_inv = (real *) calloc( system->N_cm, sizeof( real ) ) ) == NULL )
+            if ( ( workspace->Hdia_inv = (real *) calloc( Hptr->n, sizeof( real ) ) ) == NULL )
             {
                 fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
                 exit( INSUFFICIENT_MEMORY );
@@ -1850,13 +1950,13 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
 #if defined(DEBUG)
         fprintf( stderr, "fillin = %d\n", fillin );
         fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                 fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) );
+                 fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) );
 #endif
 
         if ( workspace->L == NULL )
         {
-            if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == FAILURE ||
-                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == FAILURE )
+            if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE ||
+                    Allocate_Matrix( &(workspace->U), Hptr->n, fillin ) == FAILURE )
             {
                 fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
                 exit( INSUFFICIENT_MEMORY );
@@ -1942,10 +2042,10 @@ static void Setup_Preconditioner_EEM( const reax_system * const system,
         const list * const far_nbrs )
 {
     int i, fillin;
-    real s_tmp, t_tmp, time;
+    real time;
     sparse_matrix *Hptr;
 
-    if (control->cm_domain_sparsify_enabled)
+    if ( control->cm_domain_sparsify_enabled == TRUE )
     {
         Hptr = workspace->H_sp;
     }
@@ -2025,7 +2125,7 @@ static void Setup_Preconditioner_EEM( const reax_system * const system,
 #if defined(DEBUG)
         fprintf( stderr, "fillin = %d\n", fillin );
         fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                 fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) );
+                 fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) );
 #endif
 
         if ( workspace->L == NULL )
@@ -2117,6 +2217,167 @@ static void Setup_Preconditioner_EEM( const reax_system * const system,
 }
 
 
+/* Setup routines before computing the preconditioner for ACKS2
+ */
+static void Setup_Preconditioner_ACKS2( const reax_system * const system,
+        const control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs )
+{
+    int i, fillin;
+    real time;
+    sparse_matrix *Hptr;
+
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Hptr = workspace->H_sp;
+    }
+    else
+    {
+        Hptr = workspace->H;
+    }
+
+    /* sort H needed for SpMV's in linear solver, H or H_sp needed for preconditioning */
+    time = Get_Time( );
+    Sort_Matrix_Rows( workspace->H );
+    if ( control->cm_domain_sparsify_enabled == TRUE )
+    {
+        Sort_Matrix_Rows( workspace->H_sp );
+    }
+
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = setup_graph_coloring( workspace->H_sp );
+        }
+        else
+        {
+            Hptr = setup_graph_coloring( workspace->H );
+        }
+
+        Sort_Matrix_Rows( Hptr );
+    }
+    data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
+
+#if defined(DEBUG)
+    fprintf( stderr, "H matrix sorted\n" );
+#endif
+
+    switch ( control->cm_solver_pre_comp_type )
+    {
+    case DIAG_PC:
+        if ( workspace->Hdia_inv == NULL )
+        {
+            if ( ( workspace->Hdia_inv = (real *) calloc( Hptr->n, sizeof( real ) ) ) == NULL )
+            {
+                fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        break;
+
+    case ICHOLT_PC:
+        Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+        fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
+
+#if defined(DEBUG)
+        fprintf( stderr, "fillin = %d\n", fillin );
+        fprintf( stderr, "allocated memory: L = U = %ldMB\n",
+                 fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) );
+#endif
+
+        if ( workspace->L == NULL )
+        {
+            if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE ||
+                    Allocate_Matrix( &(workspace->U), Hptr->n, fillin ) == FAILURE )
+            {
+                fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        else
+        {
+            //TODO: reallocate
+        }
+        break;
+
+    case ILU_PAR_PC:
+        if ( workspace->L == NULL )
+        {
+            /* factors have sparsity pattern as H */
+            if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                    Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+            {
+                fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        else
+        {
+            //TODO: reallocate
+        }
+        break;
+
+    case ILUT_PAR_PC:
+        Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "drop tolerances calculated\n" );
+#endif
+
+        if ( workspace->L == NULL )
+        {
+            /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
+            if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                    Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+            {
+                fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        else
+        {
+            //TODO: reallocate
+        }
+        break;
+
+    case ILU_SUPERLU_MT_PC:
+        if ( workspace->L == NULL )
+        {
+            /* factors have sparsity pattern as H */
+            if ( Allocate_Matrix( &(workspace->L), Hptr->n, Hptr->m ) == FAILURE ||
+                    Allocate_Matrix( &(workspace->U), Hptr->n, Hptr->m ) == FAILURE )
+            {
+                fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+        }
+        else
+        {
+            //TODO: reallocate
+        }
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+}
+
+
 /* Combine ficticious charges s and t to get atomic charge q for QEq method
  */
 static void Calculate_Charges_QEq( const reax_system * const system,
@@ -2147,7 +2408,7 @@ static void Calculate_Charges_EEM( const reax_system * const system,
 {
     int i;
 
-    for ( i = 0; i < system->N_cm; ++i )
+    for ( i = 0; i < system->N; ++i )
     {
         system->atoms[i].q = workspace->s[0][i];
     }
@@ -2280,13 +2541,6 @@ static void EEM( reax_system * const system, control_params * const control,
     }
 
 //   Print_Linear_System( system, control, workspace, data->step );
-//    Print_Sparse_Matrix2( workspace->H, "H_0.out" );
-//    Print_Sparse_Matrix2( workspace->L, "L_0.out" );
-//    Print_Sparse_Matrix2( workspace->U, "U_0.out" );
-//    Print_Sparse_Matrix2( workspace->H_EEM, "H_EEM_0.out" );
-//    Print_Sparse_Matrix2( workspace->L_EEM, "L_EEM_0.out" );
-//    Print_Sparse_Matrix2( workspace->U_EEM, "U_EEM_0.out" );
-//    exit(0);
 
     Extrapolate_Charges_EEM( system, control, data, workspace );
 
@@ -2335,10 +2589,88 @@ static void EEM( reax_system * const system, control_params * const control,
 }
 
 
+/* Main driver method for ACKS2 kernel
+ *
+ * Rough outline:
+ *  1) init / setup routines for preconditioning of linear solver
+ *  2) compute preconditioner
+ *  3) extrapolate charges
+ *  4) perform 1 linear solve
+ *  5) compute atomic charges based on output of (4)
+ */
+static void ACKS2( reax_system * const system, control_params * const control,
+        simulation_data * const data, static_storage * const workspace,
+        const list * const far_nbrs, const output_controls * const out_control )
+{
+    int iters;
+
+    if ( control->cm_solver_pre_comp_refactor > 0 &&
+            ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
+    {
+        Setup_Preconditioner_ACKS2( system, control, data, workspace, far_nbrs );
+
+        Compute_Preconditioner_ACKS2( system, control, data, workspace, far_nbrs );
+    }
+
+//   Print_Linear_System( system, control, workspace, data->step );
+//    Print_Sparse_Matrix2( workspace->H, "H_0.out" );
+//    Print_Sparse_Matrix2( workspace->L, "L_0.out" );
+//    Print_Sparse_Matrix2( workspace->U, "U_0.out" );
+//    FILE * fp = fopen( "b.out", "w" );
+//    Vector_Print( fp, "b.out", workspace->b_s, system->N_cm );
+//    fclose( fp );
+//    exit(0);
+
+    Extrapolate_Charges_EEM( system, control, data, workspace );
+
+    switch ( control->cm_solver_type )
+    {
+    case GMRES_S:
+        iters = GMRES( workspace, control, data, workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0],
+                out_control->log,
+                ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE );
+        break;
+
+    case GMRES_H_S:
+        iters = GMRES_HouseHolder( workspace, control, data,workspace->H,
+                workspace->b_s, control->cm_solver_q_err, workspace->s[0],
+                out_control->log,
+                (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0 );
+        break;
+
+    case CG_S:
+        iters = CG( workspace, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], out_control->log ) + 1;
+        break;
+
+    case SDM_S:
+        iters = SDM( workspace, workspace->H, workspace->b_s, control->cm_solver_q_err,
+                workspace->s[0], out_control->log ) + 1;
+        break;
+
+    default:
+        fprintf( stderr, "Unrecognized ACKS2 solver selection. Terminating...\n" );
+        exit( INVALID_INPUT );
+        break;
+    }
+
+    data->timing.cm_solver_iters += iters;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "linsolve-" );
+#endif
+
+    Calculate_Charges_EEM( system, workspace );
+}
+
+
 void Compute_Charges( reax_system * const system, control_params * const control,
                       simulation_data * const data, static_storage * const workspace,
                       const list * const far_nbrs, const output_controls * const out_control )
 {
+    int i;
+
     switch ( control->charge_method )
     {
     case QEQ_CM:
@@ -2350,7 +2682,7 @@ void Compute_Charges( reax_system * const system, control_params * const control
         break;
 
     case ACKS2_CM:
-        //TODO
+        ACKS2( system, control, data, workspace, far_nbrs, out_control );
         break;
 
     default:
diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c
index e9470449..7f0134f3 100644
--- a/sPuReMD/src/control.c
+++ b/sPuReMD/src/control.c
@@ -266,7 +266,10 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control,
         {
             val = atof( tmp[1] );
             control->cm_domain_sparsity = val;
-            control->cm_domain_sparsify_enabled = TRUE;
+            if ( val < 1.0 )
+            {
+                control->cm_domain_sparsify_enabled = TRUE;
+            }
         }
         else if ( strcmp(tmp[0], "cm_solver_pre_comp_type") == 0 )
         {
diff --git a/sPuReMD/src/ffield.c b/sPuReMD/src/ffield.c
index 088e1acb..0fa59d4e 100644
--- a/sPuReMD/src/ffield.c
+++ b/sPuReMD/src/ffield.c
@@ -206,6 +206,7 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
         val = atof(tmp[5]);
         reax->sbp[i].b_o_133    = val;
         val = atof(tmp[6]);
+        reax->sbp[i].b_s_acks2  = val;
         val = atof(tmp[7]);
 
         /* line 4  */
diff --git a/sPuReMD/src/forces.c b/sPuReMD/src/forces.c
index 5e602253..e6c82684 100644
--- a/sPuReMD/src/forces.c
+++ b/sPuReMD/src/forces.c
@@ -424,13 +424,33 @@ static inline real Init_Charge_Matrix_Entry( reax_system *system,
         break;
 
     case ACKS2_CM:
-        //TODO
         switch ( pos )
         {
             case OFF_DIAGONAL:
+                if ( r_ij < control->r_cut && r_ij > 0.001 )
+                {
+                    Tap = control->Tap7 * r_ij + control->Tap6;
+                    Tap = Tap * r_ij + control->Tap5;
+                    Tap = Tap * r_ij + control->Tap4;
+                    Tap = Tap * r_ij + control->Tap3;
+                    Tap = Tap * r_ij + control->Tap2;
+                    Tap = Tap * r_ij + control->Tap1;
+                    Tap = Tap * r_ij + control->Tap0;
+
+                    gamij = SQRT( system->reaxprm.sbp[system->atoms[i].type].gamma
+                            * system->reaxprm.sbp[system->atoms[j].type].gamma );
+                    /* shielding */
+                    dr3gamij_1 = POW( r_ij, 3.0 ) + 1.0 / POW( gamij, 3.0 );
+                    dr3gamij_3 = POW( dr3gamij_1 , 1.0 / 3.0 );
+
+                    ret = Tap * EV_to_KCALpMOL / dr3gamij_3;
+                }
             break;
+
             case DIAGONAL:
+                ret = system->reaxprm.sbp[system->atoms[i].type].eta;
             break;
+
             default:
                 fprintf( stderr, "[Init_forces] Invalid matrix position. Terminating...\n" );
                 exit( INVALID_INPUT );
@@ -449,10 +469,12 @@ static inline real Init_Charge_Matrix_Entry( reax_system *system,
 
 
 static void Init_Charge_Matrix_Remaining_Entries( reax_system *system,
-        control_params *control, sparse_matrix * H, sparse_matrix * H_sp,
+        control_params *control, list *far_nbrs,
+        sparse_matrix * H, sparse_matrix * H_sp,
         int * Htop, int * H_sp_top )
 {
-    int i;
+    int i, j, pj;
+    real d, xcut, bond_softness, * X_diag;
 
     switch ( control->charge_method )
     {
@@ -484,6 +506,130 @@ static void Init_Charge_Matrix_Remaining_Entries( reax_system *system,
             break;
 
         case ACKS2_CM:
+            if ( (X_diag = (real*) calloc(system->N, sizeof(real))) == NULL )
+            {
+                fprintf( stderr, "not enough memory for charge matrix. terminating.\n" );
+                exit( INSUFFICIENT_MEMORY );
+            }
+
+            H->start[system->N] = *Htop;
+            H_sp->start[system->N] = *H_sp_top;
+
+            for ( i = 0; i < system->N; ++i )
+            {
+                H->j[*Htop] = i;
+                H->val[*Htop] = 1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = 1.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+
+            H->j[*Htop] = system->N;
+            H->val[*Htop] = 0.0;
+            *Htop = *Htop + 1;
+
+            H_sp->j[*H_sp_top] = system->N;
+            H_sp->val[*H_sp_top] = 0.0;
+            *H_sp_top = *H_sp_top + 1;
+
+            for ( i = 0; i < system->N; ++i )
+            {
+                H->start[system->N + i + 1] = *Htop;
+                H_sp->start[system->N + i + 1] = *H_sp_top;
+
+                H->j[*Htop] = i;
+                H->val[*Htop] = 1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = 1.0;
+                *H_sp_top = *H_sp_top + 1;
+
+                for ( pj = Start_Index(i, far_nbrs); pj < End_Index(i, far_nbrs); ++pj )
+                {
+                    if ( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+                    {
+                        j = far_nbrs->select.far_nbr_list[pj].nbr;
+
+                        xcut = ( system->reaxprm.sbp[ system->atoms[i].type ].b_s_acks2
+                                + system->reaxprm.sbp[ system->atoms[j].type ].b_s_acks2 )
+                            / 2.0;
+
+                        if ( far_nbrs->select.far_nbr_list[pj].d < xcut &&
+                                far_nbrs->select.far_nbr_list[pj].d > 0.001 )
+                        {
+                            d = far_nbrs->select.far_nbr_list[pj].d / xcut;
+                            bond_softness = system->reaxprm.gp.l[34] * POW( d, 3.0 ) * POW( 1.0 - d, 6.0 );
+
+                            H->j[*Htop] = system->N + j + 1;
+                            H->val[*Htop] = MAX( 0.0, bond_softness );
+                            *Htop = *Htop + 1;
+
+                            H_sp->j[*H_sp_top] = system->N + j + 1;
+                            H_sp->val[*H_sp_top] = MAX( 0.0, bond_softness );
+                            *H_sp_top = *H_sp_top + 1;
+
+                            X_diag[i] -= bond_softness;
+                            X_diag[j] -= bond_softness;
+                        }
+                    }
+                }
+
+                H->j[*Htop] = system->N + i + 1;
+                H->val[*Htop] = 0.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = system->N + i + 1;
+                H_sp->val[*H_sp_top] = 0.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+
+            H->start[system->N_cm - 1] = *Htop;
+            H_sp->start[system->N_cm - 1] = *H_sp_top;
+
+            for ( i = system->N + 1; i < system->N_cm - 1; ++i )
+            {
+                for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
+                {
+                    if ( H->j[pj] == i )
+                    {
+                        H->val[pj] = X_diag[i - system->N - 1];
+                        break;
+                    }
+                }
+
+                for ( pj = H_sp->start[i]; pj < H_sp->start[i + 1]; ++pj )
+                {
+                    if ( H_sp->j[pj] == i )
+                    {
+                        H_sp->val[pj] = X_diag[i - system->N - 1];
+                        break;
+                    }
+                }
+            }
+
+            for ( i = system->N + 1; i < system->N_cm - 1; ++i )
+            {
+                H->j[*Htop] = i;
+                H->val[*Htop] = 1.0;
+                *Htop = *Htop + 1;
+
+                H_sp->j[*H_sp_top] = i;
+                H_sp->val[*H_sp_top] = 1.0;
+                *H_sp_top = *H_sp_top + 1;
+            }
+
+            H->j[*Htop] = system->N_cm - 1;
+            H->val[*Htop] = 0.0;
+            *Htop = *Htop + 1;
+
+            H_sp->j[*H_sp_top] = system->N_cm - 1;
+            H_sp->val[*H_sp_top] = 0.0;
+            *H_sp_top = *H_sp_top + 1;
+
+            free( X_diag );
             break;
 
         default:
@@ -775,7 +921,8 @@ void Init_Forces( reax_system *system, control_params *control,
         //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
     }
 
-    Init_Charge_Matrix_Remaining_Entries( system, control, H, H_sp, &Htop, &H_sp_top );
+    Init_Charge_Matrix_Remaining_Entries( system, control, far_nbrs,
+            H, H_sp, &Htop, &H_sp_top );
 
     // mark the end of j list
     H->start[system->N_cm] = Htop;
@@ -1062,8 +1209,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
 
 void Estimate_Storage_Sizes( reax_system *system, control_params *control,
-                             list **lists, int *Htop, int *hb_top,
-                             int *bond_top, int *num_3body )
+        list **lists, int *Htop, int *hb_top, int *bond_top, int *num_3body )
 {
     int i, j, pj;
     int start_i, end_i;
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index 58c5c98c..2ddcbeb6 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -375,7 +375,24 @@ void Init_Workspace( reax_system *system, control_params *control,
             break;
 
         case ACKS2_CM:
-            //TODO
+            for ( i = 0; i < system->N; ++i )
+            {
+                workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+            }
+
+            workspace->b_s[system->N] = control->cm_q_net;
+            workspace->b[system->N] = control->cm_q_net;
+
+            for ( i = system->N + 1; i < system->N_cm; ++i )
+            {
+                workspace->b_s[i] = 0.0;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i] = 0.0;
+            }
             break;
 
         default:
@@ -489,7 +506,7 @@ void Init_Lists( reax_system *system, control_params *control,
                  simulation_data *data, static_storage *workspace,
                  list **lists, output_controls *out_control )
 {
-    int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
+    int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop, max_nnz;
     int *hb_top, *bond_top;
 
     num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
@@ -513,7 +530,23 @@ void Init_Lists( reax_system *system, control_params *control,
     Estimate_Storage_Sizes( system, control, lists, &Htop,
             hb_top, bond_top, &num_3body );
 
-    if ( Allocate_Matrix( &(workspace->H), system->N_cm, Htop ) == FAILURE )
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            max_nnz = Htop;
+            break;
+        case EEM_CM:
+            max_nnz = Htop + system->N_cm;
+            break;
+        case ACKS2_CM:
+            max_nnz = 2 * Htop + 3 * system->N + 2;
+            break;
+        default:
+            max_nnz = Htop;
+            break;
+    }
+
+    if ( Allocate_Matrix( &(workspace->H), system->N_cm, max_nnz ) == FAILURE )
     {
         fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
         exit( INSUFFICIENT_MEMORY );
@@ -522,7 +555,7 @@ void Init_Lists( reax_system *system, control_params *control,
      *   If so, need to refactor Estimate_Storage_Sizes
      *   to use various cut-off distances as parameters
      *   (non-bonded, hydrogen, 3body, etc.) */
-    if ( Allocate_Matrix( &(workspace->H_sp), system->N_cm, Htop ) == FAILURE )
+    if ( Allocate_Matrix( &(workspace->H_sp), system->N_cm, max_nnz ) == FAILURE )
     {
         fprintf( stderr, "Not enough space for init matrices. Terminating...\n" );
         exit( INSUFFICIENT_MEMORY );
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index 94aeeaf0..826dee38 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -251,7 +251,7 @@ l[30] = p_coa4
 l[31] = p_ovun4
 l[32] = p_ovun3
 l[33] = p_val8
-l[34] = N/A
+l[34] = ACKS2 bond softness
 l[35] = N/A
 l[36] = N/A
 l[37] = version number
@@ -298,6 +298,7 @@ typedef struct
     real b_o_131;
     real b_o_132;
     real b_o_133;
+    real b_s_acks2; /* bond softness for ACKS2 */
 
     /* Line four in the field file */
     real p_ovun2;
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 015e2573..8a595276 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -838,7 +838,7 @@ void Print_Sparse_Matrix2( sparse_matrix *A, char *fname )
         {
             //fprintf( f, "%d%d %.15e\n", A->entries[j].j, i, A->entries[j].val );
             //Convert 0-based to 1-based (for Matlab)
-            fprintf( f, "%6d%6d %24.15e\n", i+1, A->j[j]+1, A->val[j] );
+            fprintf( f, "%6d %6d %24.15e\n", i+1, A->j[j]+1, A->val[j] );
         }
     }
 
diff --git a/sPuReMD/src/testmd.c b/sPuReMD/src/testmd.c
index 075a49b1..9ff007a9 100644
--- a/sPuReMD/src/testmd.c
+++ b/sPuReMD/src/testmd.c
@@ -176,8 +176,8 @@ int main(int argc, char* argv[])
     /* compute f_0 */
     //if( control.restart == 0 ) {
     Reset( &system, &control, &data, &workspace, &lists );
-    Generate_Neighbor_Lists( &system, &control, &data, &workspace,
-                             &lists, &out_control );
+    Generate_Neighbor_Lists( &system, &control, &data, &workspace, 
+            &lists, &out_control );
 
     //fprintf( stderr, "total: %.2f secs\n", data.timing.nbrs);
     Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
@@ -186,8 +186,7 @@ int main(int argc, char* argv[])
     ++data.step;
     //}
     //
-
-
+    
     for ( ; data.step <= control.nsteps; data.step++ )
     {
         if ( control.T_mode )
-- 
GitLab