From 0cdc7afc9f52aecef0d1c14eb409e7adc235f31f Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnk@msu.edu>
Date: Mon, 19 Feb 2018 14:34:15 -0500
Subject: [PATCH] sPuReMD: fix issue with graph coloring causing sporadic
 failures in solver convergence for EE and ACKS2. Conditionally change the
 charge matrix for ICHOLT and ILU(T) preconditioning. Refactor graph coloring
 code.

---
 sPuReMD/src/charges.c | 123 +++++++-----
 sPuReMD/src/init_md.c |  68 ++++++-
 sPuReMD/src/lin_alg.c | 437 ++++++++++++++++--------------------------
 sPuReMD/src/lin_alg.h |   4 +-
 sPuReMD/src/mytypes.h |  19 +-
 5 files changed, 323 insertions(+), 328 deletions(-)

diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 61c5c8a7..72ad3331 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -146,26 +146,19 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         Hptr = workspace->H;
     }
 
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
     time = Get_Time( );
     if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
     {
-        if ( control->cm_domain_sparsify_enabled == TRUE )
-        {
-            Hptr = setup_graph_coloring( workspace->H_sp );
-        }
-        else
-        {
-            Hptr = setup_graph_coloring( workspace->H );
-        }
-
-        Sort_Matrix_Rows( Hptr );
+        setup_graph_coloring( control, workspace, Hptr, &workspace->H_full, &workspace->H_p );
+        Sort_Matrix_Rows( workspace->H_p );
+        Hptr = workspace->H_p;
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
 
-#if defined(TEST_MAT)
-    Hptr = create_test_mat( );
-#endif
-
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -503,27 +496,25 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         Hptr = workspace->H;
     }
 
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    }
+
     time = Get_Time( );
     if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
     {
-        if ( control->cm_domain_sparsify_enabled == TRUE )
-        {
-            Hptr = setup_graph_coloring( workspace->H_sp );
-        }
-        else
-        {
-            Hptr = setup_graph_coloring( workspace->H );
-        }
-
-        Sort_Matrix_Rows( Hptr );
+        setup_graph_coloring( control, workspace, Hptr, &workspace->H_full, &workspace->H_p );
+        Sort_Matrix_Rows( workspace->H_p );
+        Hptr = workspace->H_p;
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
-
-#if defined(TEST_MAT)
-    Hptr = create_test_mat( );
-#endif
-
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
     
     switch ( control->cm_solver_pre_comp_type )
     {
@@ -578,7 +569,24 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
             break;
     }
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = workspace->H_sp;
+        }
+        else
+        {
+            Hptr = workspace->H;
+        }
+    }
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    }
 
 #if defined(DEBUG)
 #define SIZE (1000)
@@ -620,28 +628,26 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         Hptr = workspace->H;
     }
 
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+        Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+    }
+
     time = Get_Time( );
     if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
     {
-        if ( control->cm_domain_sparsify_enabled == TRUE )
-        {
-            Hptr = setup_graph_coloring( workspace->H_sp );
-        }
-        else
-        {
-            Hptr = setup_graph_coloring( workspace->H );
-        }
-
-        Sort_Matrix_Rows( Hptr );
+        setup_graph_coloring( control, workspace, Hptr, &workspace->H_full, &workspace->H_p );
+        Sort_Matrix_Rows( workspace->H_p );
+        Hptr = workspace->H_p;
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
-
-#if defined(TEST_MAT)
-    Hptr = create_test_mat( );
-#endif
-
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
-    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
     
     switch ( control->cm_solver_pre_comp_type )
     {
@@ -696,8 +702,25 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
             break;
     }
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
-    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = workspace->H_sp;
+        }
+        else
+        {
+            Hptr = workspace->H;
+        }
+    }
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+        Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+    }
 
 #if defined(DEBUG)
 #define SIZE (1000)
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index 4e8ca7d8..7e8f0922 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -335,11 +335,14 @@ void Init_Workspace( reax_system *system, control_params *control,
             break;
     }
 
+    /* sparse matrices */
     workspace->H = NULL;
+    workspace->H_full = NULL;
     workspace->H_sp = NULL;
-    workspace->L = NULL;
+    workspace->H_p = NULL;
     workspace->H_spar_patt = NULL;
     workspace->H_app_inv = NULL;
+    workspace->L = NULL;
     workspace->U = NULL;
     workspace->Hdia_inv = NULL;
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
@@ -348,6 +351,7 @@ void Init_Workspace( reax_system *system, control_params *control,
         workspace->droptol = (real *) scalloc( system->N_cm, sizeof( real ),
                 "Init_Workspace::workspace->droptol" );
     }
+
     //TODO: check if unused
     //workspace->w        = (real *) scalloc( cm_lin_sys_size, sizeof( real ),
     //"Init_Workspace::workspace->droptol" );
@@ -497,6 +501,46 @@ void Init_Workspace( reax_system *system, control_params *control,
             break;
     }
 
+    /* graph coloring related */
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        workspace->color = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                                         "setup_graph_coloring::color" );
+        workspace->to_color = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                                            "setup_graph_coloring::to_color" );
+        workspace->conflict = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                                            "setup_graph_coloring::conflict" );
+        workspace->conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (control->num_threads + 1),
+                                                "setup_graph_coloring::conflict_cnt" );
+        workspace->recolor = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                                           "setup_graph_coloring::recolor" );
+        workspace->color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (system->N_cm + 1),
+                                             "setup_graph_coloring::color_top" );
+        workspace->permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                           "setup_graph_coloring::premuted_row_col" );
+        workspace->permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                               "setup_graph_coloring::premuted_row_col_inv" );
+        workspace->y_p = (real*) smalloc( sizeof(real) * system->N_cm,
+                               "setup_graph_coloring::y_p" );
+        workspace->x_p = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::x_p" );
+    }
+    else
+    {
+        workspace->color = NULL;
+        workspace->to_color = NULL;
+        workspace->conflict = NULL;
+        workspace->conflict_cnt = NULL;
+        workspace->temp_ptr = NULL;
+        workspace->recolor = NULL;
+        workspace->recolor_cnt = 0;
+        workspace->color_top = NULL;
+        workspace->permuted_row_col = NULL;
+        workspace->permuted_row_col_inv = NULL;
+        workspace->y_p = NULL;
+        workspace->x_p = NULL;
+    }
+
     /* integrator storage */
     workspace->a = (rvec *) smalloc( system->N * sizeof( rvec ),
            "Init_Workspace::workspace->a" );
@@ -954,6 +998,8 @@ void Initialize( reax_system *system, control_params *control,
         #pragma omp single
         control->num_threads = omp_get_num_threads( );
     }
+#else
+    control->num_threads = 1;
 #endif
 
     Randomize( );
@@ -1086,6 +1132,11 @@ void Finalize_Workspace( reax_system *system, control_params *control,
         Deallocate_Matrix( workspace->H_spar_patt_full );
         Deallocate_Matrix( workspace->H_app_inv );
     }
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        Deallocate_Matrix( workspace->H_full );
+        Deallocate_Matrix( workspace->H_p );
+    }
 
     for ( i = 0; i < 5; ++i )
     {
@@ -1160,6 +1211,21 @@ void Finalize_Workspace( reax_system *system, control_params *control,
             break;
     }
 
+    /* graph coloring related */
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        sfree( workspace->color, "Finalize_Workspace::workspace->color" );
+        sfree( workspace->to_color, "Finalize_Workspace::workspace->to_color" );
+        sfree( workspace->conflict, "Finalize_Workspace::workspace->conflict" );
+        sfree( workspace->conflict_cnt, "Finalize_Workspace::workspace->conflict_cnt" );
+        sfree( workspace->recolor, "Finalize_Workspace::workspace->recolor" );
+        sfree( workspace->color_top, "Finalize_Workspace::workspace->color_top" );
+        sfree( workspace->permuted_row_col, "Finalize_Workspace::workspace->permuted_row_col" );
+        sfree( workspace->permuted_row_col_inv, "Finalize_Workspace::workspace->permuted_row_col_inv" );
+        sfree( workspace->y_p, "Finalize_Workspace::workspace->y_p" );
+        sfree( workspace->x_p, "Finalize_Workspace::workspace->x_p" );
+    }
+
     /* integrator storage */
     sfree( workspace->a, "Finalize_Workspace::workspace->a" );
     sfree( workspace->f_old, "Finalize_Workspace::workspace->f_old" );
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 4f69152b..e1d02236 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -71,24 +71,6 @@ static unsigned int *row_levels;
 static unsigned int *level_rows;
 static unsigned int *level_rows_cnt;
 static unsigned int *top = NULL;
-/* global to make OpenMP shared (graph_coloring) */
-static unsigned int *color = NULL;
-static unsigned int *to_color = NULL;
-static unsigned int *conflict = NULL;
-static unsigned int *conflict_cnt = NULL;
-static unsigned int *temp_ptr;
-static unsigned int *recolor = NULL;
-static unsigned int recolor_cnt;
-static unsigned int *color_top = NULL;
-/* global to make OpenMP shared (sort_colors) */
-static unsigned int *permuted_row_col = NULL;
-static unsigned int *permuted_row_col_inv = NULL;
-static real *y_p = NULL;
-/* global to make OpenMP shared (permute_vector) */
-static real *x_p = NULL;
-static unsigned int *mapping = NULL;
-static sparse_matrix *H_full;
-static sparse_matrix *H_p;
 /* global to make OpenMP shared (jacobi_iter) */
 static real *Dinv_b = NULL;
 static real *rp = NULL;
@@ -225,7 +207,15 @@ static void compute_full_sparse_matrix( const sparse_matrix * const A,
             exit( INSUFFICIENT_MEMORY );
         }
     }
-
+    else if ( (*A_full)->m < 2 * A->m - A->n )
+    {
+        Deallocate_Matrix( *A_full );
+        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
+        {
+            fprintf( stderr, "not enough memory for full A. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
 
     if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
     {
@@ -1622,11 +1612,23 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
 
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
 /* Compute M^{1} \approx A which minimizes
+ *  \sum_{j=1}^{N} ||e_j - Am_j||_2^2
+ *  where: e_j is the j-th column of the NxN identify matrix,
+ *         m_j is the j-th column of the NxN approximate sparse matrix M
+ * 
+ * Internally, use LAPACKE to solve the least-squares problems
  *
  * A: symmetric, sparse matrix, stored in full CSR format
  * A_spar_patt: sparse matrix used as template sparsity pattern
  *   for approximating the inverse, stored in full CSR format
- * A_app_inv: approximate inverse to A, stored in full CSR format (result) */
+ * A_app_inv: approximate inverse to A, stored in full CSR format (result)
+ *
+ * Reference:
+ * Michele Benzi et al.
+ * A Comparative Study of Sparse Approximate Inverse
+ *  Preconditioners
+ * Applied Numerical Mathematics 30, 1999
+ * */
 real sparse_approx_inverse( const sparse_matrix * const A,
                             const sparse_matrix * const A_spar_patt,
                             sparse_matrix ** A_app_inv )
@@ -2171,7 +2173,6 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                 for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                 {
                     x[local_row] -= LU->val[pj] * x[LU->j[pj]];
-
                 }
                 x[local_row] /= LU->val[pj];
             }
@@ -2191,7 +2192,6 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                 for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                 {
                     x[local_row] -= LU->val[pj] * x[LU->j[pj]];
-
                 }
                 x[local_row] /= LU->val[LU->start[local_row]];
             }
@@ -2221,49 +2221,10 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
 }
 
 
-static void compute_H_full( const sparse_matrix * const H )
-{
-    int count, i, pj;
-    sparse_matrix *H_t;
-
-    if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for full H. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    /* Set up the sparse matrix data structure for A. */
-    Transpose( H, H_t );
-
-    count = 0;
-    for ( i = 0; i < H->n; ++i )
-    {
-        H_full->start[i] = count;
-
-        /* H: symmetric, lower triangular portion only stored */
-        for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
-        {
-            H_full->val[count] = H->val[pj];
-            H_full->j[count] = H->j[pj];
-            ++count;
-        }
-        /* H^T: symmetric, upper triangular portion only stored;
-         * skip diagonal from H^T, as included from H above */
-        for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
-        {
-            H_full->val[count] = H_t->val[pj];
-            H_full->j[count] = H_t->j[pj];
-            ++count;
-        }
-    }
-    H_full->start[i] = count;
-
-    Deallocate_Matrix( H_t );
-}
-
-
 /* Iterative greedy shared-memory parallel graph coloring
  *
+ * control: container for control info
+ * workspace: storage container for workspace structures
  * A: matrix to use for coloring, stored in CSR format;
  *   rows represent vertices, columns of entries within a row represent adjacent vertices
  *   (i.e., dependent rows for elimination during LU factorization)
@@ -2276,31 +2237,29 @@ static void compute_H_full( const sparse_matrix * const H )
  *  and Massively Threaded Architectures
  * Parallel Computing, 2012
  */
-void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
+void graph_coloring( const control_params * const control, static_storage * const workspace,
+        const sparse_matrix * const A, const TRIANGULARITY tri )
 {
 #ifdef _OPENMP
     #pragma omp parallel
 #endif
     {
-#define MAX_COLOR (500)
         int i, pj, v;
         unsigned int temp, recolor_cnt_local, *conflict_local;
-        int tid, num_thread, *fb_color;
+        int tid, *fb_color;
 
 #ifdef _OPENMP
         tid = omp_get_thread_num();
-        num_thread = omp_get_num_threads();
 #else
         tid = 0;
-        num_thread = 1;
 #endif
 
 #ifdef _OPENMP
         #pragma omp single
 #endif
         {
-            memset( color, 0, sizeof(unsigned int) * A->n );
-            recolor_cnt = A->n;
+            memset( workspace->color, 0, sizeof(unsigned int) * A->n );
+            workspace->recolor_cnt = A->n;
         }
 
         /* ordering of vertices to color depends on triangularity of factor
@@ -2312,7 +2271,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 #endif
             for ( i = 0; i < A->n; ++i )
             {
-                to_color[i] = i;
+                workspace->to_color[i] = i;
             }
         }
         else
@@ -2322,57 +2281,54 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 #endif
             for ( i = 0; i < A->n; ++i )
             {
-                to_color[i] = A->n - 1 - i;
+                workspace->to_color[i] = A->n - 1 - i;
             }
         }
 
-        fb_color = (int*) smalloc( sizeof(int) * MAX_COLOR,
+        fb_color = (int*) smalloc( sizeof(int) * A->n,
                                    "graph_coloring::fb_color" );
         conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n,
                          "graph_coloring::fb_color" );
 
-#ifdef _OPENMP
-        #pragma omp barrier
-#endif
-
-        while ( recolor_cnt > 0 )
+        while ( workspace->recolor_cnt > 0 )
         {
-            memset( fb_color, -1, sizeof(int) * MAX_COLOR );
+            memset( fb_color, -1, sizeof(int) * A->n );
 
             /* color vertices */
 #ifdef _OPENMP
             #pragma omp for schedule(static)
 #endif
-            for ( i = 0; i < recolor_cnt; ++i )
+            for ( i = 0; i < workspace->recolor_cnt; ++i )
             {
-                v = to_color[i];
+                v = workspace->to_color[i];
 
                 /* colors of adjacent vertices are forbidden */
                 for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                 {
                     if ( v != A->j[pj] )
                     {
-                        fb_color[color[A->j[pj]]] = v;
+                        fb_color[workspace->color[A->j[pj]]] = v;
                     }
                 }
 
                 /* search for min. color which is not in conflict with adjacent vertices;
                  * start at 1 since 0 is default (invalid) color for all vertices */
-                for ( pj = 1; fb_color[pj] == v; ++pj );
+                for ( pj = 1; fb_color[pj] == v; ++pj )
+                    ;
 
                 /* assign discovered color (no conflict in neighborhood of adjacent vertices) */
-                color[v] = pj;
+                workspace->color[v] = pj;
             }
 
             /* determine if recoloring required */
-            temp = recolor_cnt;
+            temp = workspace->recolor_cnt;
             recolor_cnt_local = 0;
 
 #ifdef _OPENMP
             #pragma omp single
 #endif
             {
-                recolor_cnt = 0;
+                workspace->recolor_cnt = 0;
             }
 
 #ifdef _OPENMP
@@ -2380,12 +2336,12 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 #endif
             for ( i = 0; i < temp; ++i )
             {
-                v = to_color[i];
+                v = workspace->to_color[i];
 
                 /* search for color conflicts with adjacent vertices */
                 for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                 {
-                    if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
+                    if ( workspace->color[v] == workspace->color[A->j[pj]] && v > A->j[pj] )
                     {
                         conflict_local[recolor_cnt_local] = v;
                         ++recolor_cnt_local;
@@ -2395,31 +2351,27 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             }
 
             /* count thread-local conflicts and compute offsets for copying into shared buffer */
-            conflict_cnt[tid + 1] = recolor_cnt_local;
+            workspace->conflict_cnt[tid + 1] = recolor_cnt_local;
 
 #ifdef _OPENMP
             #pragma omp barrier
 
-            #pragma omp master
+            #pragma omp single
 #endif
             {
-                conflict_cnt[0] = 0;
-                for ( i = 1; i < num_thread + 1; ++i )
+                workspace->conflict_cnt[0] = 0;
+                for ( i = 1; i < control->num_threads + 1; ++i )
                 {
-                    conflict_cnt[i] += conflict_cnt[i - 1];
+                    workspace->conflict_cnt[i] += workspace->conflict_cnt[i - 1];
                 }
-                recolor_cnt = conflict_cnt[num_thread];
+                workspace->recolor_cnt = workspace->conflict_cnt[control->num_threads];
             }
 
-#ifdef _OPENMP
-            #pragma omp barrier
-#endif
-
             /* copy thread-local conflicts into shared buffer */
             for ( i = 0; i < recolor_cnt_local; ++i )
             {
-                conflict[conflict_cnt[tid] + i] = conflict_local[i];
-                color[conflict_local[i]] = 0;
+                workspace->conflict[workspace->conflict_cnt[tid] + i] = conflict_local[i];
+                workspace->color[conflict_local[i]] = 0;
             }
 
 #ifdef _OPENMP
@@ -2428,101 +2380,82 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             #pragma omp single
 #endif
             {
-                temp_ptr = to_color;
-                to_color = conflict;
-                conflict = temp_ptr;
+                workspace->temp_ptr = workspace->to_color;
+                workspace->to_color = workspace->conflict;
+                workspace->conflict = workspace->temp_ptr;
             }
         }
 
         sfree( conflict_local, "graph_coloring::conflict_local" );
         sfree( fb_color, "graph_coloring::fb_color" );
-
-        //#if defined(DEBUG)
-        //#ifdef _OPENMP
-        //    #pragma omp master
-        //#endif
-        //    {
-        //        for ( i = 0; i < A->n; ++i )
-        //            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
-        //    }
-        //#endif
-
-#ifdef _OPENMP
-        #pragma omp barrier
-#endif
     }
 }
 
 
-/* Sort coloring
+/* Sort rows by coloring
  *
+ * workspace: storage container for workspace structures
  * n: number of entries in coloring
  * tri: coloring to triangular factor to use (lower/upper)
  */
-void sort_colors( const unsigned int n, const TRIANGULARITY tri )
+void sort_rows_by_colors( static_storage * const workspace,
+        const unsigned int n, const TRIANGULARITY tri )
 {
     unsigned int i;
 
-    memset( color_top, 0, sizeof(unsigned int) * (n + 1) );
+    memset( workspace->color_top, 0, sizeof(unsigned int) * (n + 1) );
 
     /* sort vertices by color (ascending within a color)
      *  1) count colors
      *  2) determine offsets of color ranges
-     *  3) sort by color
+     *  3) sort rows by color
      *
      *  note: color is 1-based */
     for ( i = 0; i < n; ++i )
     {
-        ++color_top[color[i]];
+        ++workspace->color_top[workspace->color[i]];
     }
     for ( i = 1; i < n + 1; ++i )
     {
-        color_top[i] += color_top[i - 1];
+        workspace->color_top[i] += workspace->color_top[i - 1];
     }
     for ( i = 0; i < n; ++i )
     {
-        permuted_row_col[color_top[color[i] - 1]] = i;
-        ++color_top[color[i] - 1];
+        workspace->permuted_row_col[workspace->color_top[workspace->color[i] - 1]] = i;
+        ++workspace->color_top[workspace->color[i] - 1];
     }
 
     /* invert mapping to get map from current row/column to permuted (new) row/column */
     for ( i = 0; i < n; ++i )
     {
-        permuted_row_col_inv[permuted_row_col[i]] = i;
+        workspace->permuted_row_col_inv[workspace->permuted_row_col[i]] = i;
     }
 }
 
 
 /* Apply permutation Q^T*x or Q*x based on graph coloring
  *
+ * workspace: storage container for workspace structures
  * color: vertex color (1-based); vertices represent matrix rows/columns
  * x: vector to permute (in-place)
  * n: number of entries in x
  * invert_map: if TRUE, use Q^T, otherwise use Q
  * tri: coloring to triangular factor to use (lower/upper)
  */
-static void permute_vector( real * const x, const unsigned int n, const int invert_map,
-                            const TRIANGULARITY tri )
+static void permute_vector( static_storage * workspace,
+        real * const x, const unsigned int n, const int invert_map,
+        const TRIANGULARITY tri )
 {
     unsigned int i;
+    unsigned int *mapping;
 
-#ifdef _OPENMP
-    #pragma omp single
-#endif
+    if ( invert_map == TRUE )
     {
-        if ( x_p == NULL )
-        {
-            x_p = (real*) smalloc( sizeof(real) * n, "permute_vector::x_p" );
-        }
-
-        if ( invert_map == TRUE )
-        {
-            mapping = permuted_row_col_inv;
-        }
-        else
-        {
-            mapping = permuted_row_col;
-        }
+        mapping = workspace->permuted_row_col_inv;
+    }
+    else
+    {
+        mapping = workspace->permuted_row_col;
     }
 
 #ifdef _OPENMP
@@ -2530,25 +2463,28 @@ static void permute_vector( real * const x, const unsigned int n, const int inve
 #endif
     for ( i = 0; i < n; ++i )
     {
-        x_p[i] = x[mapping[i]];
+        workspace->x_p[i] = x[mapping[i]];
     }
 
 #ifdef _OPENMP
-    #pragma omp single
+    #pragma omp for schedule(static)
 #endif
+    for ( i = 0; i < n; ++i )
     {
-        memcpy( x, x_p, sizeof(real) * n );
+        x[i] = workspace->x_p[i];
     }
 }
 
 
 /* Apply permutation Q^T*(LU)*Q based on graph coloring
  *
+ * workspace: storage container for workspace structures
  * color: vertex color (1-based); vertices represent matrix rows/columns
  * LU: matrix to permute, stored in CSR format
  * tri: triangularity of LU (lower/upper)
  */
-void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
+void permute_matrix( static_storage * const workspace,
+        sparse_matrix * const LU, const TRIANGULARITY tri )
 {
     int i, pj, nr, nc;
     sparse_matrix *LUtemp;
@@ -2560,26 +2496,26 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
     }
 
     /* count nonzeros in each row of permuted factor (re-use color_top for counting) */
-    memset( color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
+    memset( workspace->color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
 
     if ( tri == LOWER )
     {
         for ( i = 0; i < LU->n; ++i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc <= nr )
                 {
-                    ++color_top[nr + 1];
+                    ++workspace->color_top[nr + 1];
                 }
                 /* correct entries to maintain triangularity (lower) */
                 else
                 {
-                    ++color_top[nc + 1];
+                    ++workspace->color_top[nc + 1];
                 }
             }
         }
@@ -2588,20 +2524,20 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
     {
         for ( i = LU->n - 1; i >= 0; --i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc >= nr )
                 {
-                    ++color_top[nr + 1];
+                    ++workspace->color_top[nr + 1];
                 }
                 /* correct entries to maintain triangularity (upper) */
                 else
                 {
-                    ++color_top[nc + 1];
+                    ++workspace->color_top[nc + 1];
                 }
             }
         }
@@ -2609,34 +2545,34 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
 
     for ( i = 1; i < LU->n + 1; ++i )
     {
-        color_top[i] += color_top[i - 1];
+        workspace->color_top[i] += workspace->color_top[i - 1];
     }
 
-    memcpy( LUtemp->start, color_top, sizeof(unsigned int) * (LU->n + 1) );
+    memcpy( LUtemp->start, workspace->color_top, sizeof(unsigned int) * (LU->n + 1) );
 
     /* permute factor */
     if ( tri == LOWER )
     {
         for ( i = 0; i < LU->n; ++i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc <= nr )
                 {
-                    LUtemp->j[color_top[nr]] = nc;
-                    LUtemp->val[color_top[nr]] = LU->val[pj];
-                    ++color_top[nr];
+                    LUtemp->j[workspace->color_top[nr]] = nc;
+                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
+                    ++workspace->color_top[nr];
                 }
                 /* correct entries to maintain triangularity (lower) */
                 else
                 {
-                    LUtemp->j[color_top[nc]] = nr;
-                    LUtemp->val[color_top[nc]] = LU->val[pj];
-                    ++color_top[nc];
+                    LUtemp->j[workspace->color_top[nc]] = nr;
+                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
+                    ++workspace->color_top[nc];
                 }
             }
         }
@@ -2645,24 +2581,24 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
     {
         for ( i = LU->n - 1; i >= 0; --i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc >= nr )
                 {
-                    LUtemp->j[color_top[nr]] = nc;
-                    LUtemp->val[color_top[nr]] = LU->val[pj];
-                    ++color_top[nr];
+                    LUtemp->j[workspace->color_top[nr]] = nc;
+                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
+                    ++workspace->color_top[nr];
                 }
                 /* correct entries to maintain triangularity (upper) */
                 else
                 {
-                    LUtemp->j[color_top[nc]] = nr;
-                    LUtemp->val[color_top[nc]] = LU->val[pj];
-                    ++color_top[nc];
+                    LUtemp->j[workspace->color_top[nc]] = nr;
+                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
+                    ++workspace->color_top[nc];
                 }
             }
         }
@@ -2680,62 +2616,43 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
  *  used for preconditioning (incomplete factorizations computed based on
  *  permuted H)
  *
+ * control: container for control info
+ * workspace: storage container for workspace structures
  * H: symmetric, lower triangular portion only, stored in CSR format;
- *  H is permuted in-place
+ * H_full: symmetric, stored in CSR format;
+ * H_p (output): permuted copy of H based on coloring, lower half stored, CSR format
  */
-sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
+void setup_graph_coloring( const control_params * const control,
+        static_storage * const workspace, const sparse_matrix * const H,
+        sparse_matrix ** H_full, sparse_matrix ** H_p )
 {
-    int num_thread;
-
-    if ( color == NULL )
+    if ( *H_p == NULL )
     {
-#ifdef _OPENMP
-        #pragma omp parallel
+        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
         {
-            num_thread = omp_get_num_threads();
+            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
         }
-#else
-        num_thread = 1;
-#endif
-
-        /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */
-        color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                         "setup_graph_coloring::color" );
-        to_color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                            "setup_graph_coloring::to_color" );
-        conflict = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                            "setup_graph_coloring::conflict" );
-        conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (num_thread + 1),
-                                                "setup_graph_coloring::conflict_cnt" );
-        recolor = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                           "setup_graph_coloring::recolor" );
-        color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (H->n + 1),
-                                             "setup_graph_coloring::color_top" );
-        permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                           "setup_graph_coloring::premuted_row_col" );
-        permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                               "setup_graph_coloring::premuted_row_col_inv" );
-        y_p = (real*) smalloc( sizeof(real) * H->n,
-                               "setup_graph_coloring::y_p" );
-        if ( (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) ||
-                (Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) )
-        {
-            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
+    }
+    else if ( (*H_p)->m < H->m )
+    {
+        Deallocate_Matrix( *H_p );
+        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
+        {
+            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
             exit( INSUFFICIENT_MEMORY );
         }
     }
 
-    compute_H_full( H );
+    compute_full_sparse_matrix( H, H_full );
 
-    graph_coloring( H_full, LOWER );
-    sort_colors( H_full->n, LOWER );
+    graph_coloring( control, workspace, *H_full, LOWER );
+    sort_rows_by_colors( workspace, (*H_full)->n, LOWER );
 
-    memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
-    memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) );
-    memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) );
-    permute_matrix( H_p, LOWER );
-
-    return H_p;
+    memcpy( (*H_p)->start, H->start, sizeof(int) * (H->n + 1) );
+    memcpy( (*H_p)->j, H->j, sizeof(int) * (H->start[H->n]) );
+    memcpy( (*H_p)->val, H->val, sizeof(real) * (H->start[H->n]) );
+    permute_matrix( workspace, (*H_p), LOWER );
 }
 
 
@@ -2926,13 +2843,13 @@ static void apply_preconditioner( const static_storage * const workspace, const
 #ifdef _OPENMP
                     #pragma omp single
 #endif
-                {
-                    memcpy( y_p, y, sizeof(real) * workspace->H->n );
-                }
+                    {
+                        memcpy( workspace->y_p, y, sizeof(real) * workspace->H->n );
+                    }
 
-                permute_vector( y_p, workspace->H->n, FALSE, LOWER );
-                tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre );
-                break;
+                    permute_vector( (static_storage *) workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+                    tri_solve_level_sched( workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -2975,7 +2892,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
                     }
 
                     jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
-                break;
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -3046,9 +2963,9 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ICHOLT_PC:
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
-                tri_solve_level_sched( workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
-                permute_vector( x, workspace->H->n, TRUE, UPPER );
-                break;
+                    tri_solve_level_sched( workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                    permute_vector( (static_storage *) workspace, x, workspace->H->n, TRUE, UPPER );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -3067,31 +2984,31 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
 #ifdef _OPENMP
-                #pragma omp single
+                    #pragma omp single
 #endif
-                {
-                    if ( Dinv_U == NULL )
                     {
-                        Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n,
-                                                  "apply_preconditioner::Dinv_U" );
+                        if ( Dinv_U == NULL )
+                        {
+                            Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n,
+                                                      "apply_preconditioner::Dinv_U" );
+                        }
                     }
-                }
 
-                    /* construct D^{-1}_U */
-                if ( fresh_pre == TRUE )
-                {
+                        /* construct D^{-1}_U */
+                    if ( fresh_pre == TRUE )
+                    {
 #ifdef _OPENMP
-                    #pragma omp for schedule(static)
+                        #pragma omp for schedule(static)
 #endif
-                    for ( i = 0; i < workspace->U->n; ++i )
-                    {
-                        si = workspace->U->start[i];
-                        Dinv_U[i] = 1. / workspace->U->val[si];
+                        for ( i = 0; i < workspace->U->n; ++i )
+                        {
+                            si = workspace->U->start[i];
+                            Dinv_U[i] = 1. / workspace->U->val[si];
+                        }
                     }
-                }
 
-                jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
-                break;
+                    jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -3300,19 +3217,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
         }
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_orthog += t_ortho / control->num_threads;
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_orthog += t_ortho;
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_tri_solve += t_ts;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters )
     {
@@ -3546,19 +3455,11 @@ int GMRES_HouseHolder( const static_storage * const workspace,
         }
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_orthog += t_ortho / control->num_threads;
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_orthog += t_ortho;
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_tri_solve += t_ts;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters )
     {
@@ -3657,15 +3558,9 @@ int CG( const static_storage * const workspace, const control_params * const con
         g_itr = i;
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters )
     {
@@ -3760,15 +3655,9 @@ int SDM( const static_storage * const workspace, const control_params * const co
         g_itr = i;
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters  )
     {
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 814701fe..164aa05a 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -81,7 +81,9 @@ void jacobi_iter( const sparse_matrix * const, const real * const,
         const real * const, real * const, const TRIANGULARITY,
         const unsigned int );
 
-sparse_matrix * setup_graph_coloring( sparse_matrix * const );
+void setup_graph_coloring( const control_params * const,
+        static_storage * const, const sparse_matrix * const,
+        sparse_matrix **, sparse_matrix ** );
 
 int GMRES( const static_storage * const, const control_params * const,
         simulation_data * const, const sparse_matrix * const,
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index e46d217c..24ed7fd3 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -617,9 +617,7 @@ typedef struct
     int num_ignored;
     int ignore[MAX_ATOM_TYPES];
 
-#ifdef _OPENMP
     int num_threads;
-#endif
 } control_params;
 
 
@@ -947,6 +945,7 @@ typedef struct
     sparse_matrix *H;
     sparse_matrix *H_full;
     sparse_matrix *H_sp;
+    sparse_matrix *H_p;
     sparse_matrix *H_spar_patt;
     sparse_matrix *H_spar_patt_full;
     sparse_matrix *H_app_inv;
@@ -971,12 +970,28 @@ typedef struct
     real **h;
     real **rn;
     real **v;
+
     /* CG related storage */
     real *r;
     real *d;
     real *q;
     real *p;
 
+    /* Graph coloring related storage for applying, e.g. ICHOLT and ILU(T),
+     * preconditioners */
+    unsigned int *color;
+    unsigned int *to_color;
+    unsigned int *conflict;
+    unsigned int *conflict_cnt;
+    unsigned int *temp_ptr;
+    unsigned int *recolor;
+    unsigned int recolor_cnt;
+    unsigned int *color_top;
+    unsigned int *permuted_row_col;
+    unsigned int *permuted_row_col_inv;
+    real *y_p;
+    real *x_p;
+
     int num_H;
     int *hbond_index; // for hydrogen bonds
 
-- 
GitLab