From c04ce605363314ade649f55f0cf05ae3bd466dbc Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Thu, 30 Aug 2018 17:01:14 -0400
Subject: [PATCH] sPuReMD: add ILUT and work-in-progress ILUTP.

---
 sPuReMD/src/charges.c    |  47 +++--
 sPuReMD/src/init_md.c    |   3 +
 sPuReMD/src/lin_alg.c    | 373 +++++++++++++++++++++++++++++----------
 sPuReMD/src/lin_alg.h    |   3 +
 sPuReMD/src/reax_types.h |   1 +
 5 files changed, 321 insertions(+), 106 deletions(-)

diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 7daeb3d4..7528977d 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -216,8 +216,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
             break;
 
         case JACOBI_PC:
-            data->timing.cm_solver_pre_comp +=
-                jacobi( Hptr, workspace->Hdia_inv );
+            data->timing.cm_solver_pre_comp += jacobi( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -230,6 +229,11 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
                 ILUT( Hptr, workspace->droptol, workspace->L, workspace->U );
             break;
 
+        case ILUTP_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILUTP( Hptr, workspace->droptol, workspace->L, workspace->U );
+            break;
+
         case FG_ILUT_PC:
             data->timing.cm_solver_pre_comp +=
                 FG_ILUT( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
@@ -242,7 +246,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
                 sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
                         &workspace->H_app_inv );
 #else
-            fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+            fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile to enable. Terminating...\n" );
             exit( INVALID_INPUT );
 #endif
             break;
@@ -306,8 +310,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 //    switch ( control->cm_solver_pre_comp_type )
 //    {
 //    case JACOBI_PC:
-//        data->timing.cm_solver_pre_comp +=
-//            jacobi( Hptr, workspace->Hdia_inv );
+//        data->timing.cm_solver_pre_comp += jacobi( Hptr, workspace->Hdia_inv );
 //        break;
 //
 //    case ICHOLT_PC:
@@ -320,10 +323,9 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 //                ILUT( Hptr, workspace->droptol, workspace->L, workspace->U );
 //            break;
 //
-//        case ILUT_PC:
+//        case ILUTP_PC:
 //            data->timing.cm_solver_pre_comp +=
-//                ILUT( Hptr, workspace->droptol, workspace->L, workspace->U );
-//            break;
+//                ILUTP( Hptr, workspace->droptol, workspace->L, workspace->U );
 //
 //    case FG_ILUT_PC:
 //        data->timing.cm_solver_pre_comp +=
@@ -538,6 +540,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
@@ -558,8 +561,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
             break;
 
         case JACOBI_PC:
-            data->timing.cm_solver_pre_comp +=
-                jacobi( Hptr, workspace->Hdia_inv );
+            data->timing.cm_solver_pre_comp += jacobi( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -572,6 +574,10 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
                 ILUT( Hptr, workspace->droptol, workspace->L, workspace->U );
             break;
 
+        case ILUTP_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILUTP( Hptr, workspace->droptol, workspace->L, workspace->U );
+
         case FG_ILUT_PC:
             data->timing.cm_solver_pre_comp +=
                 FG_ILUT( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
@@ -584,7 +590,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
                 sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
                         &workspace->H_app_inv );
 #else
-            fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+            fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile to enable. Terminating...\n" );
             exit( INVALID_INPUT );
 #endif
             break;
@@ -610,6 +616,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
@@ -660,6 +667,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
@@ -681,8 +689,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
             break;
 
         case JACOBI_PC:
-            data->timing.cm_solver_pre_comp +=
-                jacobi( Hptr, workspace->Hdia_inv );
+            data->timing.cm_solver_pre_comp += jacobi( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -695,6 +702,10 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
                 ILUT( Hptr, workspace->droptol, workspace->L, workspace->U );
             break;
 
+        case ILUTP_PC:
+            data->timing.cm_solver_pre_comp +=
+                ILUTP( Hptr, workspace->droptol, workspace->L, workspace->U );
+
         case FG_ILUT_PC:
             data->timing.cm_solver_pre_comp +=
                 FG_ILUT( Hptr, workspace->droptol, control->cm_solver_pre_comp_sweeps,
@@ -707,7 +718,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
                 sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
                         &workspace->H_app_inv );
 #else
-            fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
+            fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile to enable. Terminating...\n" );
             exit( INVALID_INPUT );
 #endif
             break;
@@ -733,6 +744,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
@@ -820,6 +832,7 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
             break;
 
         case ILUT_PC:
+        case ILUTP_PC:
         case FG_ILUT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
@@ -887,6 +900,7 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
@@ -926,6 +940,7 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
             break;
 
         case ILUT_PC:
+        case ILUTP_PC:
         case FG_ILUT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
@@ -963,6 +978,7 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
@@ -1000,6 +1016,7 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
@@ -1041,6 +1058,7 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
             break;
 
         case ILUT_PC:
+        case ILUTP_PC:
         case FG_ILUT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
@@ -1077,6 +1095,7 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
 
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index d019bba8..4461a9cd 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -349,6 +349,7 @@ static void Init_Workspace( reax_system *system, control_params *control,
     workspace->Hdia_inv = NULL;
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         workspace->droptol = scalloc( system->N_cm, sizeof( real ),
@@ -1203,6 +1204,7 @@ static void Finalize_Workspace( reax_system *system, control_params *control,
     Deallocate_Matrix( workspace->H_sp );
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         Deallocate_Matrix( workspace->L );
@@ -1233,6 +1235,7 @@ static void Finalize_Workspace( reax_system *system, control_params *control,
     }
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC
             || control->cm_solver_pre_comp_type == ILUT_PC
+            || control->cm_solver_pre_comp_type == ILUTP_PC
             || control->cm_solver_pre_comp_type == FG_ILUT_PC )
     {
         sfree( workspace->droptol, "Finalize_Workspace::workspace->droptol" );
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 3fb32c98..4a51afef 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -358,8 +358,14 @@ void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix *
 }
 
 
+/* Computes the 2-norm of each row of A, multiplied by the
+ * dropping tolerance dtol 
+ * 
+ * A: symmetric (lower triangular portion only stored), square matrix, CSR format
+ * droptol (output): row-wise dropping tolereances
+ * dtol: user-specified dropping tolerance */
 void Calculate_Droptol( const sparse_matrix * const A,
-                        real * const droptol, const real dtol )
+        real * const droptol, const real dtol )
 {
     int i, j, k;
     real val;
@@ -387,7 +393,6 @@ void Calculate_Droptol( const sparse_matrix * const A,
         #pragma omp barrier
 #endif
 
-        /* init droptol to 0 */
         for ( i = 0; i < A->n; ++i )
         {
 #ifdef _OPENMP
@@ -447,17 +452,13 @@ void Calculate_Droptol( const sparse_matrix * const A,
 #endif
 
         /* calculate local droptol for each row */
-        //fprintf( stderr, "droptol: " );
 #ifdef _OPENMP
         #pragma omp for schedule(static)
 #endif
         for ( i = 0; i < A->n; ++i )
         {
-            //fprintf( stderr, "%f-->", droptol[i] );
             droptol[i] = SQRT( droptol[i] ) * dtol;
-            //fprintf( stderr, "%f  ", droptol[i] );
         }
-        //fprintf( stderr, "\n" );
     }
 }
 
@@ -668,122 +669,309 @@ real ICHOLT( const sparse_matrix * const A, const real * const droptol,
  * Iterative Methods for Sparse Linear System, Second Edition, 2003,
  * Yousef Saad
  *
- * A: symmetric, half-stored (lower triangular), CSR format
+ * A: symmetric (lower triangular portion only stored), square matrix, CSR format
  * droptol: row-wise tolerances used for dropping
  * L / U: (output) triangular matrices, A \approx LU, CSR format */
 real ILUT( const sparse_matrix * const A, const real * const droptol,
              sparse_matrix * const L, sparse_matrix * const U )
 {
-    int *tmp_j;
-    real *tmp_val;
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val, start;
-    unsigned int *Utop;
+    int i, k, pj, Ltop, Utop, *nz_mask;
+    real *w, start;
+    sparse_matrix *A_T;
 
     start = Get_Time( );
 
-    Utop = smalloc( (A->n + 1) * sizeof(unsigned int), "ILUT::Utop" );
-    tmp_j = smalloc( A->n * sizeof(int), "ILUT::Utop" );
-    tmp_val = smalloc( A->n * sizeof(real), "ILUT::Utop" );
+    /* use a dense vector with masking for the intermediate row w */
+    w = smalloc( sizeof(real) * A->n, "ILUT::w" );
+    nz_mask = smalloc( sizeof(int) * A->n, "ILUT::nz_mask" );
+
+    /* need upper triangular portion of A, so transpose to get in CSR format */
+    Allocate_Matrix( &A_T, A->n, A->m );
+
+    Transpose( A, A_T );
 
     Ltop = 0;
-    tmptop = 0;
-    memset( L->start, 0, (A->n + 1) * sizeof(unsigned int) );
-    memset( U->start, 0, (A->n + 1) * sizeof(unsigned int) );
-    memset( Utop, 0, A->n * sizeof(unsigned int) );
+    Utop = 0;
 
     for ( i = 0; i < A->n; ++i )
     {
+        /* mark the start of new row i in L and U */
         L->start[i] = Ltop;
-        tmptop = 0;
+        U->start[i] = Utop;
 
-        for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
+        for ( k = 0; k < A->n; ++k )
         {
-            j = A->j[pj];
-            val = A->val[pj];
+            w[k] = 0.0;
+            nz_mask[k] = 0;
+        }
 
-            if ( FABS(val) > droptol[i] )
+        /* copy i-th row of A into w */
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        {
+            k = A->j[pj];
+            w[k] = A->val[pj];
+            nz_mask[k] = 1;
+        }
+        for ( pj = A_T->start[i] + 1; pj < A_T->start[i + 1]; ++pj )
+        {
+            k = A_T->j[pj];
+            w[k] = A_T->val[pj];
+            nz_mask[k] = 1;
+        }
+
+        for ( k = 0; k < i; ++k )
+        {
+            if ( nz_mask[k] == 1 )
             {
-                k1 = 0;
-                k2 = L->start[j];
-                while ( k1 < tmptop && k2 < L->start[j + 1] )
+                w[k] /= A->val[A->start[k + 1] - 1];
+
+                /* apply dropping rule to w[k] */
+                if ( FABS( w[k] ) < droptol[k] )
                 {
-                    if ( tmp_j[k1] < L->j[k2] )
-                    {
-                        ++k1;
-                    }
-                    else if ( tmp_j[k1] > L->j[k2] )
-                    {
-                        ++k2;
-                    }
-                    else
+                    nz_mask[k] = 0;
+                }
+
+                if ( nz_mask[k] == 1 )
+                {
+                    for ( pj = U->start[k]; pj < U->start[k + 1]; ++pj )
                     {
-                        val -= (tmp_val[k1++] * L->val[k2++]);
+                        /* do not introduce new non-zeros (0-fillin) */
+                        if ( nz_mask[U->j[pj]] == 1 )
+                        {
+                            w[U->j[pj]] -= w[k] * U->val[pj];
+                        }
                     }
                 }
+            }
+        }
 
-                /* L matrix is lower triangular,
-                 * so right before the start of next row comes jth diagonal */
-                val /= L->val[L->start[j + 1] - 1];
+        /* apply dropping rule to w, but keep the diagonal regardless;
+         * note: this is different than Saad's suggested approach
+         * as we do not limit the NNZ per row */
+        for ( k = 0; k < A->n; ++k )
+        {
+            if ( nz_mask[k] == 1 && FABS( w[k] ) < droptol[i] )
+            {
+                nz_mask[k] = 0;
+            }
+        }
 
-                tmp_j[tmptop] = j;
-                tmp_val[tmptop] = val;
-                ++tmptop;
+        /* copy w[0:i] to row i of L */
+        for ( k = 0; k < i; ++k )
+        {
+            if ( nz_mask[k] == 1 )
+            {
+                L->j[Ltop] = k;
+                L->val[Ltop] = w[k];
+                ++Ltop;
             }
         }
 
-        /* sanity check */
-        if ( A->j[pj] != i )
+        /* unit diagonal for L */
+        L->j[Ltop] = i;
+        L->val[Ltop] = 1.0;
+        ++Ltop;
+
+        /* copy w[i:n] to row i of L */
+        for ( k = i; k < A->n; ++k )
         {
-            fprintf( stderr, "[ERROR] badly built input matrix in ILUT (i = %d)\n", i );
-            exit( NUMERIC_BREAKDOWN );
+            if ( nz_mask[k] == 1 )
+            {
+                U->j[Utop] = k;
+                U->val[Utop] = w[k];
+                ++Utop;
+            }
         }
+    }
 
-        /* compute the ith diagonal in L */
-        val = A->val[pj];
-        for ( k1 = 0; k1 < tmptop; ++k1 )
+    /* record the total NNZ as the last entry in row pointer (start) */
+    L->start[L->n] = Ltop;
+    U->start[U->n] = Utop;
+
+    Deallocate_Matrix( A_T );
+    sfree( nz_mask, "ILUT::nz_mask" );
+    sfree( w, "ILUT::w" );
+
+    return Get_Timing_Info( start );
+}
+
+
+/* Compute incomplete LU factorization with thresholding and partial pivoting
+ *
+ * Reference:
+ * Iterative Methods for Sparse Linear System, Second Edition, 2003,
+ * Yousef Saad
+ *
+ * A: symmetric (lower triangular portion only stored), square matrix, CSR format
+ * droptol: row-wise tolerances used for dropping
+ * L / U: (output) triangular matrices, A \approx LU, CSR format */
+real ILUTP( const sparse_matrix * const A, const real * const droptol,
+             sparse_matrix * const L, sparse_matrix * const U )
+{
+    int i, k, pj, Ltop, Utop, *nz_mask, *perm, *perm_inv, pivot_j;
+    real *w, start, pivot_val;
+    sparse_matrix *A_T;
+
+    start = Get_Time( );
+
+    /* use a dense vector with masking for the intermediate row w */
+    w = smalloc( sizeof(real) * A->n, "ILUTP::w" );
+    nz_mask = smalloc( sizeof(int) * A->n, "ILUTP::nz_mask" );
+    perm = smalloc( sizeof(int) * A->n, "ILUTP::perm" );
+    perm_inv = smalloc( sizeof(int) * A->n, "ILUTP::perm_inv" );
+
+    /* need upper triangular portion of A, so transpose to get in CSR format */
+    Allocate_Matrix( &A_T, A->n, A->m );
+
+    Transpose( A, A_T );
+
+    Ltop = 0;
+    Utop = 0;
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        perm[i] = i;
+        perm_inv[perm[i]] = i;
+    }
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        /* mark the start of new row i in L and U */
+        L->start[i] = Ltop;
+        U->start[i] = Utop;
+
+        for ( k = 0; k < A->n; ++k )
         {
-            val -= (tmp_val[k1] * tmp_val[k1]);
+            w[k] = 0.0;
+            nz_mask[k] = 0;
         }
 
-#if defined(DEBUG)
-        if ( val < 0.0 )
+        /* copy i-th row of A into w */
+        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
         {
-            fprintf( stderr, "[ERROR] Numeric breakdown in ILUT (SQRT of negative on diagonal i = %d). Terminating.\n", i );
-            exit( NUMERIC_BREAKDOWN );
+            k = A->j[pj];
+            w[k] = A->val[pj];
+            nz_mask[k] = 1;
+        }
+        for ( pj = A_T->start[i] + 1; pj < A_T->start[i + 1]; ++pj )
+        {
+            k = A_T->j[pj];
+            w[k] = A_T->val[pj];
+            nz_mask[k] = 1;
+        }
 
+        /* partial pivoting by columns:
+         * find largest element in w, and make it the pivot */
+        pivot_val = FABS( w[0] );
+        pivot_j = 0;
+        for ( k = 1; k < A->n; ++k )
+        {
+            if ( FABS( w[k] ) > pivot_val )
+            {
+                pivot_val = FABS( w[k] );
+                pivot_j = k;
+            }
         }
-#endif
+        perm[i] = pivot_j;
+        perm_inv[perm[i]] = i;
+        perm[pivot_j] = i;
+        perm_inv[perm[pivot_j]] = pivot_j;
 
-        tmp_j[tmptop] = i;
-        tmp_val[tmptop] = SQRT( val );
+        for ( k = 0; k < i; ++k )
+        {
+            if ( nz_mask[perm[k]] == 1 )
+            {
+                if ( pivot_j <= i )
+                {
+                    for ( pj = A->start[k]; pj < A->start[k + 1]; ++pj )
+                    {
+                        if ( A->j[pj] == perm_inv[k] )
+                        {
+                            w[perm[k]] /= A->val[pj];
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    for ( pj = A_T->start[k] + 1; pj < A_T->start[k + 1]; ++pj )
+                    {
+                        if ( A_T->j[pj] == perm_inv[k] )
+                        {
+                            w[perm[k]] /= A_T->val[pj];
+                            break;
+                        }
+                    }
+                }
 
-        /* apply the dropping rule once again */
-        for ( k1 = 0; k1 < tmptop; ++k1 )
+                /* apply dropping rule to w[perm[k]] */
+                if ( FABS( w[perm[k]] ) < droptol[perm[k]] )
+                {
+                    nz_mask[perm[k]] = 0;
+                }
+
+                if ( nz_mask[perm[k]] == 1 )
+                {
+                    for ( pj = U->start[k]; pj < U->start[k + 1]; ++pj )
+                    {
+                        /* do not introduce new non-zeros (0-fillin) */
+                        if ( nz_mask[perm[U->j[pj]]] == 1 )
+                        {
+                            w[perm[U->j[pj]]] -= w[perm[k]] * U->val[pj];
+                        }
+                    }
+                }
+            }
+        }
+
+        /* apply dropping rule to w, but keep the diagonal regardless;
+         * note: this is different than Saad's suggested approach
+         * as we do not limit the NNZ per row */
+        for ( k = 0; k < A->n; ++k )
         {
-            if ( FABS(tmp_val[k1]) > droptol[i] / tmp_val[tmptop] )
+            if ( perm_inv[k] != i && nz_mask[k] == 1 && FABS( w[k] ) < droptol[i] )
             {
-                L->j[Ltop] = tmp_j[k1];
-                L->val[Ltop] = tmp_val[k1];
-                U->start[tmp_j[k1] + 1]++;
+                nz_mask[k] = 0;
+            }
+        }
+
+        /* copy w[0:i] to row i of L */
+        for ( k = 0; k < i; ++k )
+        {
+            if ( nz_mask[perm_inv[k]] == 1 )
+            {
+                L->j[Ltop] = k;
+                L->val[Ltop] = w[perm_inv[k]];
                 ++Ltop;
             }
         }
 
-        /* keep the diagonal in any case */
-        L->j[Ltop] = tmp_j[k1];
-        L->val[Ltop] = tmp_val[k1];
+        /* unit diagonal for L */
+        L->j[Ltop] = i;
+        L->val[Ltop] = 1.0;
         ++Ltop;
-    }
 
-    L->start[i] = Ltop;
+        /* copy w[i:n] to row i of L */
+        for ( k = i; k < A->n; ++k )
+        {
+            if ( nz_mask[perm_inv[k]] == 1 )
+            {
+                U->j[Utop] = k;
+                U->val[Utop] = w[perm_inv[k]];
+                ++Utop;
+            }
+        }
+    }
 
-    /* U = L^T (Cholesky factorization) */
-    Transpose( L, U );
+    /* record the total NNZ as the last entry in row pointer (start) */
+    L->start[L->n] = Ltop;
+    U->start[U->n] = Utop;
 
-    sfree( tmp_val, "ILUT::tmp_val" );
-    sfree( tmp_j, "ILUT::tmp_j" );
-    sfree( Utop, "ILUT::Utop" );
+    Deallocate_Matrix( A_T );
+    sfree( perm_inv, "ILUTP::perm_inv" );
+    sfree( perm, "ILUTP::perm" );
+    sfree( nz_mask, "ILUTP::nz_mask" );
+    sfree( w, "ILUTP::w" );
 
     return Get_Timing_Info( start );
 }
@@ -1411,12 +1599,13 @@ real sparse_approx_inverse( const sparse_matrix * const A,
 #endif
 
 
-/* sparse matrix-vector product Ax = b
+/* sparse matrix, dense vector multiplication Ax = b
  *
  * workspace: storage container for workspace structures
- * A: lower triangular matrix, stored in CSR format
- * x: vector
- * b (output): vector */
+ * A: symmetric (lower triangular portion only stored), square matrix,
+ *    stored in CSR format
+ * x: dense vector, size equal to num. columns in A
+ * b (output): dense vector, size equal to num. columns in A */
 static void Sparse_MatVec( const static_storage * const workspace,
         const sparse_matrix * const A, const real * const x, real * const b )
 {
@@ -1475,13 +1664,13 @@ static void Sparse_MatVec( const static_storage * const workspace,
 }
 
 
-/* sparse matrix-vector product Ax = b
- * where:
- *   A: matrix, stored in CSR format
- *   x: vector
- *   b: vector (result) */
+/* sparse matrix, dense vector multiplication Ax = b
+ *
+ * A: square matrix, stored in CSR format
+ * x: dense vector, size equal to num. columns in A
+ * b (output): dense vector, size equal to num. columns in A */
 static void Sparse_MatVec_full( const sparse_matrix * const A,
-                                const real * const x, real * const b )
+        const real * const x, real * const b )
 {
     int i, pj;
 
@@ -1708,10 +1897,10 @@ void tri_solve_level_sched( static_storage * workspace,
 
                 workspace->levels_L = levels;
 
-                //#if defined(DEBUG)
-                fprintf(stderr, "levels(L): %d\n", levels);
-                fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
-                //#endif
+#if defined(DEBUG)
+                fprintf(stderr, "[INFO] levels(L): %d\n", levels);
+                fprintf(stderr, "[INFO] NNZ(L): %d\n", LU->start[N]);
+#endif
             }
             else
             {
@@ -1730,10 +1919,10 @@ void tri_solve_level_sched( static_storage * workspace,
 
                 workspace->levels_U = levels;
 
-                //#if defined(DEBUG)
-                fprintf(stderr, "levels(U): %d\n", levels);
-                fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
-                //#endif
+#if defined(DEBUG)
+                fprintf(stderr, "[INFO] levels(U): %d\n", levels);
+                fprintf(stderr, "[INFO] NNZ(U): %d\n", LU->start[N]);
+#endif
             }
 
             for ( i = 1; i < levels + 1; ++i )
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 5b9f0992..3a02d452 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -48,6 +48,9 @@ real ICHOLT( const sparse_matrix * const, const real * const,
 real ILUT( const sparse_matrix * const, const real * const,
         sparse_matrix * const, sparse_matrix * const );
 
+real ILUTP( const sparse_matrix * const, const real * const,
+        sparse_matrix * const, sparse_matrix * const );
+
 #if defined(TESTING)
 real FG_ICHOL( const sparse_matrix * const, const unsigned int,
         sparse_matrix * const, sparse_matrix * const );
diff --git a/sPuReMD/src/reax_types.h b/sPuReMD/src/reax_types.h
index 26cd3aca..e26363a4 100644
--- a/sPuReMD/src/reax_types.h
+++ b/sPuReMD/src/reax_types.h
@@ -245,6 +245,7 @@ enum pre_comp
     JACOBI_PC = 1,
     ICHOLT_PC = 2,
     ILUT_PC = 3,
+    ILUTP_PC = 4,
     FG_ILUT_PC = 5,
     SAI_PC = 6,
 };
-- 
GitLab