diff --git a/sPuReMD/Makefile.am b/sPuReMD/Makefile.am
index 2531185b87dbb0b8cd6b4ba80864e75265be43fd..92d61d1fd85e7a54ec316aa0b90e4731c93a720b 100644
--- a/sPuReMD/Makefile.am
+++ b/sPuReMD/Makefile.am
@@ -20,9 +20,23 @@ check_PROGRAMS =
 TESTS =
 
 if BUILD_TEST
+check_PROGRAMS += tests/test_spuremd
+check_PROGRAMS += tests/test_lin_alg
 check_PROGRAMS += tests/test_vector
 TESTS += $(check_PROGRAMS)
 
+tests_test_spuremd_SOURCES = tests/test_spuremd.cpp
+tests_test_spuremd_CPPFLAGS = -Isrc $(GTEST_CPPFLAGS)
+tests_test_spuremd_CXXFLAGS = $(GTEST_CXXFLAGS)
+tests_test_spuremd_LDFLAGS = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+tests_test_spuremd_LDADD = lib/libspuremd.la -lgtest
+
+tests_test_lin_alg_SOURCES = tests/test_lin_alg.cpp
+tests_test_lin_alg_CPPFLAGS = -Isrc $(GTEST_CPPFLAGS)
+tests_test_lin_alg_CXXFLAGS = $(GTEST_CXXFLAGS)
+tests_test_lin_alg_LDFLAGS = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+tests_test_lin_alg_LDADD = -lgtest
+
 tests_test_vector_SOURCES = tests/test_vector.cpp
 tests_test_vector_CPPFLAGS = -Isrc $(GTEST_CPPFLAGS)
 tests_test_vector_CXXFLAGS = $(GTEST_CXXFLAGS)
diff --git a/sPuReMD/src/allocate.c b/sPuReMD/src/allocate.c
index 15f425652cc274249f53e3ef6b5dbd8270d74fa7..6c07fe4f2e8ae5a25f4fc0108057d74a7b00b98e 100644
--- a/sPuReMD/src/allocate.c
+++ b/sPuReMD/src/allocate.c
@@ -269,7 +269,7 @@ void Reallocate( reax_system *system, control_params *control, static_storage *w
 
     if ( realloc->num_far > 0 && nbr_flag )
     {
-        Reallocate_Neighbor_List( (*lists) + FAR_NBRS,
+        Reallocate_Neighbor_List( &(*lists)[FAR_NBRS],
                 system->N, realloc->num_far * SAFE_ZONE );
         realloc->num_far = -1;
     }
@@ -287,29 +287,31 @@ void Reallocate( reax_system *system, control_params *control, static_storage *w
 
     if ( control->hb_cut > 0.0 && realloc->hbonds > 0 )
     {
-        Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-                               (*lists) + HBONDS );
+        Reallocate_HBonds_List( system->N, workspace->num_H, workspace->hbond_index,
+                &(*lists)[HBONDS] );
         realloc->hbonds = -1;
     }
 
     num_bonds = est_3body = -1;
     if ( realloc->bonds > 0 )
     {
-        Reallocate_Bonds_List( system->N, (*lists) + BONDS, &num_bonds, &est_3body );
+        Reallocate_Bonds_List( system->N, &(*lists)[BONDS], &num_bonds, &est_3body );
         realloc->bonds = -1;
         realloc->num_3body = MAX( realloc->num_3body, est_3body );
     }
 
     if ( realloc->num_3body > 0 )
     {
-        Delete_List( TYP_THREE_BODY, (*lists) + THREE_BODIES );
+        Delete_List( TYP_THREE_BODY, &(*lists)[THREE_BODIES] );
 
         if ( num_bonds == -1 )
-            num_bonds = ((*lists) + BONDS)->total_intrs;
+        {
+            num_bonds = (*lists)[BONDS].total_intrs;
+        }
         realloc->num_3body *= SAFE_ZONE;
 
         Make_List( num_bonds, realloc->num_3body,
-                TYP_THREE_BODY, (*lists) + THREE_BODIES );
+                TYP_THREE_BODY, &(*lists)[THREE_BODIES] );
         realloc->num_3body = -1;
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "reallocating 3 bodies\n" );
diff --git a/sPuReMD/src/analyze.c b/sPuReMD/src/analyze.c
index 675e900718cdf06c193cc1e7a7bfd6f1aad90ff1..1b260eea95d5e16ab443a8ba3d5ccae8a749e22a 100644
--- a/sPuReMD/src/analyze.c
+++ b/sPuReMD/src/analyze.c
@@ -64,8 +64,8 @@ void Copy_Bond_List( reax_system *system, control_params *control,
                      reax_list **lists )
 {
     int i, j, top_old;
-    reax_list *new_bonds = (*lists) + BONDS;
-    reax_list *old_bonds = (*lists) + OLD_BONDS;
+    reax_list *new_bonds = &(*lists)[BONDS];
+    reax_list *old_bonds = &(*lists)[OLD_BONDS];
 
     for ( top_old = 0, i = 0; i < system->N; ++i )
     {
@@ -94,8 +94,8 @@ void Copy_Bond_List( reax_system *system, control_params *control,
 int Compare_Bond_Lists( int atom, control_params *control, reax_list **lists )
 {
     int oldp, newp;
-    reax_list *new_bonds = (*lists) + BONDS;
-    reax_list *old_bonds = (*lists) + OLD_BONDS;
+    reax_list *new_bonds = &(*lists)[BONDS];
+    reax_list *old_bonds = &(*lists)[OLD_BONDS];
 
     /*fprintf( stdout, "\n%d\nold_bonds:", atom );
       for( oldp = Start_Index( atom, old_bonds );
@@ -229,8 +229,8 @@ void Analyze_Molecules( reax_system *system, control_params *control,
     int *old_mark = workspace->old_mark;
     int num_old, num_new;
     char s[MAX_MOLECULE_SIZE * 10];
-    reax_list *new_bonds = (*lists) + BONDS;
-    reax_list *old_bonds = (*lists) + OLD_BONDS;
+    reax_list *new_bonds = &(*lists)[BONDS];
+    reax_list *old_bonds = &(*lists)[OLD_BONDS];
     molecule old_molecules[20], new_molecules[20];
 
     fprintf( fout, "molecular analysis @ %d\n", data->step );
@@ -543,8 +543,8 @@ void Analyze_Fragments( reax_system *system, control_params *control,
     char fragments[MAX_FRAGMENT_TYPES][MAX_ATOM_TYPES];
     int fragment_count[MAX_FRAGMENT_TYPES];
     molecule m;
-    reax_list *new_bonds = (*lists) + BONDS;
-    //reax_list *old_bonds = (*lists) + OLD_BONDS;
+    reax_list *new_bonds = &(*lists)[BONDS];
+//    reax_list *old_bonds = &(*lists)[OLD_BONDS];
 
     /* fragment analysis */
     fprintf( fout, "step%d fragments\n", data->step );
@@ -609,14 +609,16 @@ void Analyze_Silica( reax_system *system, control_params *control,
     int O_SI_O_count, SI_O_SI_count;
     int si_coord[10], ox_coord[10];
     real O_SI_O, SI_O_SI;
-    reax_list *new_bonds = (*lists) + BONDS;
-    reax_list *thb_intrs =  (*lists) + THREE_BODIES;
+    reax_list *new_bonds = &(*lists)[BONDS];
+    reax_list *thb_intrs = &(*lists)[THREE_BODIES];
 
     Analyze_Fragments( system, control, data, workspace, lists, fout, 0 );
 
     /* analyze atom coordinations */
     for ( i = 0; i < 10; ++i )
+    {
         si_coord[i] = ox_coord[i] = 0;
+    }
 
     for ( atom = 0; atom < system->N; ++atom )
     {
@@ -624,8 +626,12 @@ void Analyze_Silica( reax_system *system, control_params *control,
 
         for ( newp = Start_Index( atom, new_bonds );
                 newp < End_Index( atom, new_bonds ); ++newp )
+        {
             if ( new_bonds->select.bond_list[newp].bo_data.BO >= control->bg_cut )
+            {
                 ++coord;
+            }
+        }
 
         if ( system->atoms[ atom ].type == SI_ATOM )
         {
@@ -718,11 +724,12 @@ void Analyze_Silica( reax_system *system, control_params *control,
 }
 
 
-
 int Get_Type_of_Molecule( molecule *m )
 {
     if ( m->atom_count == 3 && m->mtypes[1] == 2 && m->mtypes[2] == 1 )
+    {
         return WATER;
+    }
 
     return UNKNOWN;
 }
@@ -982,7 +989,7 @@ void Analysis( reax_system *system, control_params *control,
     /****** Electric Dipole Moment ******/
     if ( control->dipole_anal && steps % control->freq_dipole_anal == 0 )
         Calculate_Dipole_Moment( system, control, data, workspace,
-                                 (*lists) + BONDS, out_control->dpl );
+                &(*lists)[BONDS], out_control->dpl );
 
     /****** Drift ******/
     if ( control->diffusion_coef && steps % control->freq_diffusion_coef == 0 )
diff --git a/sPuReMD/src/bond_orders.c b/sPuReMD/src/bond_orders.c
index 52d5ad6162adbbdd2f5bb7e09f6259c5a1a57d51..d8f71ee014c4d6b072476a4e5fe1138fec31e425 100644
--- a/sPuReMD/src/bond_orders.c
+++ b/sPuReMD/src/bond_orders.c
@@ -37,10 +37,12 @@ static inline real Cf45( real p1, real p2 )
 void Get_dBO( reax_system *system, reax_list **lists,
         int i, int pj, real C, rvec *v )
 {
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs = (*lists) + DBO;
+    reax_list *bonds;
+    reax_list *dBOs;
     int start_pj, end_pj, k;
 
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     pj = bonds->select.bond_list[pj].dbond_index;
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
@@ -56,11 +58,13 @@ void Get_dBO( reax_system *system, reax_list **lists,
 void Get_dBOpinpi2( reax_system *system, reax_list **lists,
         int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
 {
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs = (*lists) + DBO;
+    reax_list *bonds;
+    reax_list *dBOs;
     dbond_data *dbo_k;
     int start_pj, end_pj, k;
 
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     pj = bonds->select.bond_list[pj].dbond_index;
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
@@ -77,10 +81,12 @@ void Get_dBOpinpi2( reax_system *system, reax_list **lists,
 void Add_dBO( reax_system *system, reax_list **lists,
         int i, int pj, real C, rvec *v )
 {
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs = (*lists) + DBO;
+    reax_list *bonds;
+    reax_list *dBOs;
     int start_pj, end_pj, k;
 
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     pj = bonds->select.bond_list[pj].dbond_index;
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
@@ -98,11 +104,13 @@ void Add_dBO( reax_system *system, reax_list **lists,
 void Add_dBOpinpi2( reax_system *system, reax_list **lists,
         int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
 {
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs = (*lists) + DBO;
+    reax_list *bonds;
+    reax_list *dBOs;
     dbond_data *dbo_k;
     int start_pj, end_pj, k;
 
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     pj = bonds->select.bond_list[pj].dbond_index;
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
@@ -119,10 +127,12 @@ void Add_dBOpinpi2( reax_system *system, reax_list **lists,
 void Add_dBO_to_Forces( reax_system *system, reax_list **lists,
                         int i, int pj, real C )
 {
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs = (*lists) + DBO;
+    reax_list *bonds;
+    reax_list *dBOs;
     int start_pj, end_pj, k;
 
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     pj = bonds->select.bond_list[pj].dbond_index;
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
@@ -138,11 +148,13 @@ void Add_dBO_to_Forces( reax_system *system, reax_list **lists,
 void Add_dBOpinpi2_to_Forces( reax_system *system, reax_list **lists,
         int i, int pj, real Cpi, real Cpi2 )
 {
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs = (*lists) + DBO;
+    reax_list *bonds;
+    reax_list *dBOs;
     dbond_data *dbo_k;
     int start_pj, end_pj, k;
 
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     pj = bonds->select.bond_list[pj].dbond_index;
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
@@ -201,8 +213,8 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, reax_list **lists,
     dbond_data *top_dbo;
 
     /* Initializations */
-    bonds = (*lists) + BONDS;
-    dBOs = (*lists) + DBO;
+    bonds = &(*lists)[BONDS];
+    dBOs = &(*lists)[DBO];
     j = bonds->select.bond_list[pj].nbr;
     bo_ij = &(bonds->select.bond_list[pj].bo_data);
     start_i = Start_Index( i, bonds );
@@ -333,7 +345,7 @@ void Calculate_dBO( int i, int pj, static_storage *workspace, reax_list **lists,
 void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
         simulation_data *data, static_storage *workspace, reax_list **lists )
 {
-    reax_list *bonds = (*lists) + BONDS;
+    reax_list *bonds;
     bond_data *nbr_j, *nbr_k;
     bond_order_data *bo_ij, *bo_ji;
     dbond_coefficients coef;
@@ -346,6 +358,7 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
 #endif
 
     /* Initializations */
+    bonds = &(*lists)[BONDS];
     nbr_j = &(bonds->select.bond_list[pj]);
     j = nbr_j->nbr;
     bo_ij = &(nbr_j->bo_data);
@@ -523,7 +536,7 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
 void Add_dBond_to_Forces( int i, int pj, reax_system *system,
         simulation_data *data, static_storage *workspace, reax_list **lists )
 {
-    reax_list *bonds = (*lists) + BONDS;
+    reax_list *bonds;
     bond_data *nbr_j, *nbr_k;
     bond_order_data *bo_ij, *bo_ji;
     dbond_coefficients coef;
@@ -534,6 +547,7 @@ void Add_dBond_to_Forces( int i, int pj, reax_system *system,
 #endif
 
     /* Initializations */
+    bonds = &(*lists)[BONDS];
     nbr_j = &(bonds->select.bond_list[pj]);
     j = nbr_j->nbr;
     bo_ij = &(nbr_j->bo_data);
@@ -728,7 +742,7 @@ void Calculate_Bond_Orders( reax_system *system, control_params *control,
     p_lp1 = system->reaxprm.gp.l[15];
     p_boc1 = system->reaxprm.gp.l[0];
     p_boc2 = system->reaxprm.gp.l[1];
-    bonds = (*lists) + BONDS;
+    bonds = &(*lists)[BONDS];
 
 #ifdef _OPENMP
     #pragma omp parallel default(shared)
@@ -758,8 +772,8 @@ void Calculate_Bond_Orders( reax_system *system, control_params *control,
 
         top_dbo = 0;
         top_dDelta = 0;
-        dDeltas = (*lists) + DDELTA;
-        dBOs = (*lists) + DBO;
+        dDeltas = &(*lists)[DDELTA];
+        dBOs = &(*lists)[DBO];
 #endif
 
         /* Calculate Deltaprime, Deltaprime_boc values */
diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 61c5c8a702cacfc14123761756e81ab8717227aa..5c2d3bed963b38ea951da040cefe96b8dc9b14de 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -146,26 +146,19 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         Hptr = workspace->H;
     }
 
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
     time = Get_Time( );
     if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
     {
-        if ( control->cm_domain_sparsify_enabled == TRUE )
-        {
-            Hptr = setup_graph_coloring( workspace->H_sp );
-        }
-        else
-        {
-            Hptr = setup_graph_coloring( workspace->H );
-        }
-
-        Sort_Matrix_Rows( Hptr );
+        setup_graph_coloring( control, workspace, Hptr, &workspace->H_full, &workspace->H_p );
+        Sort_Matrix_Rows( workspace->H_p );
+        Hptr = workspace->H_p;
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
 
-#if defined(TEST_MAT)
-    Hptr = create_test_mat( );
-#endif
-
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -503,27 +496,25 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         Hptr = workspace->H;
     }
 
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    }
+
     time = Get_Time( );
     if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
     {
-        if ( control->cm_domain_sparsify_enabled == TRUE )
-        {
-            Hptr = setup_graph_coloring( workspace->H_sp );
-        }
-        else
-        {
-            Hptr = setup_graph_coloring( workspace->H );
-        }
-
-        Sort_Matrix_Rows( Hptr );
+        setup_graph_coloring( control, workspace, Hptr, &workspace->H_full, &workspace->H_p );
+        Sort_Matrix_Rows( workspace->H_p );
+        Hptr = workspace->H_p;
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
-
-#if defined(TEST_MAT)
-    Hptr = create_test_mat( );
-#endif
-
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
     
     switch ( control->cm_solver_pre_comp_type )
     {
@@ -578,7 +569,24 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
             break;
     }
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = workspace->H_sp;
+        }
+        else
+        {
+            Hptr = workspace->H;
+        }
+    }
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    }
 
 #if defined(DEBUG)
 #define SIZE (1000)
@@ -620,28 +628,26 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         Hptr = workspace->H;
     }
 
+#if defined(TEST_MAT)
+    Hptr = create_test_mat( );
+#endif
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+        Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+    }
+
     time = Get_Time( );
     if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
     {
-        if ( control->cm_domain_sparsify_enabled == TRUE )
-        {
-            Hptr = setup_graph_coloring( workspace->H_sp );
-        }
-        else
-        {
-            Hptr = setup_graph_coloring( workspace->H );
-        }
-
-        Sort_Matrix_Rows( Hptr );
+        setup_graph_coloring( control, workspace, Hptr, &workspace->H_full, &workspace->H_p );
+        Sort_Matrix_Rows( workspace->H_p );
+        Hptr = workspace->H_p;
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
-
-#if defined(TEST_MAT)
-    Hptr = create_test_mat( );
-#endif
-
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
-    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
     
     switch ( control->cm_solver_pre_comp_type )
     {
@@ -696,8 +702,25 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
             break;
     }
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
-    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        if ( control->cm_domain_sparsify_enabled == TRUE )
+        {
+            Hptr = workspace->H_sp;
+        }
+        else
+        {
+            Hptr = workspace->H;
+        }
+    }
+
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+        Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+    }
 
 #if defined(DEBUG)
 #define SIZE (1000)
@@ -748,10 +771,6 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
 
-#if defined(DEBUG)
-    fprintf( stderr, "H matrix sorted\n" );
-#endif
-
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -768,18 +787,8 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
         case ICHOLT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "drop tolerances calculated\n" );
-#endif
-
             fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
 
-#if defined(DEBUG)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                     fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) );
-#endif
-
             if ( workspace->L == NULL )
             {
                 if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE ||
@@ -816,10 +825,6 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
         case ILUT_PAR_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "drop tolerances calculated\n" );
-#endif
-
             if ( workspace->L == NULL )
             {
                 /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
@@ -896,11 +901,12 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
-
-#if defined(DEBUG)
-    fprintf( stderr, "H matrix sorted\n" );
-#endif
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+    }
 
     switch ( control->cm_solver_pre_comp_type )
     {
@@ -918,18 +924,8 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
         case ICHOLT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "drop tolerances calculated\n" );
-#endif
-
             fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
 
-#if defined(DEBUG)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                     fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) );
-#endif
-
             if ( workspace->L == NULL )
             {
                 if ( Allocate_Matrix( &(workspace->L), system->N_cm, fillin + system->N_cm ) == FAILURE ||
@@ -966,10 +962,6 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
         case ILUT_PAR_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "drop tolerances calculated\n" );
-#endif
-
             if ( workspace->L == NULL )
             {
                 /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
@@ -1015,7 +1007,12 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
             break;
     }
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+    }
 }
 
 
@@ -1048,12 +1045,13 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
     }
     data->timing.cm_sort_mat_rows += Get_Timing_Info( time );
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
-    Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
-
-#if defined(DEBUG)
-    fprintf( stderr, "H matrix sorted\n" );
-#endif
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0;
+        Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0;
+    }
 
     switch ( control->cm_solver_pre_comp_type )
     {
@@ -1071,18 +1069,8 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         case ICHOLT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "drop tolerances calculated\n" );
-#endif
-
             fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
 
-#if defined(DEBUG)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                     fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) );
-#endif
-
             if ( workspace->L == NULL )
             {
                 if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE ||
@@ -1118,10 +1106,6 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         case ILUT_PAR_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol );
 
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "drop tolerances calculated\n" );
-#endif
-
             if ( workspace->L == NULL )
             {
                 /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */
@@ -1167,8 +1151,13 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
             break;
     }
 
-    Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
-    Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+    if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
+            control->cm_solver_pre_comp_type == ILU_PAR_PC ||
+            control->cm_solver_pre_comp_type == ILUT_PAR_PC )
+    {
+        Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0;
+        Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0;
+    }
 }
 
 
@@ -1292,10 +1281,6 @@ static void QEq( reax_system * const system, control_params * const control,
 
     data->timing.cm_solver_iters += iters;
 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "linsolve-" );
-#endif
-
     Calculate_Charges_QEq( system, workspace );
 
 #if defined(DEBUG_FOCUS)
@@ -1372,10 +1357,6 @@ static void EE( reax_system * const system, control_params * const control,
 
     data->timing.cm_solver_iters += iters;
 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "linsolve-" );
-#endif
-
     Calculate_Charges_EE( system, workspace );
 
     // if( data->step == control->nsteps )
@@ -1446,10 +1427,6 @@ static void ACKS2( reax_system * const system, control_params * const control,
 
     data->timing.cm_solver_iters += iters;
 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "linsolve-" );
-#endif
-
     Calculate_Charges_EE( system, workspace );
 }
 
diff --git a/sPuReMD/src/forces.c b/sPuReMD/src/forces.c
index 86b6ebce8310e4a8c044bae6aed709c863e565f8..24e5c2914a3e76b365a7dbf10acab4ee677f05b3 100644
--- a/sPuReMD/src/forces.c
+++ b/sPuReMD/src/forces.c
@@ -150,7 +150,7 @@ void Compute_NonBonded_Forces( reax_system *system, control_params *control,
     fprintf( stderr, "qeq - " );
 #endif
 
-    if ( control->tabulate == 0)
+    if ( control->tabulate <= 0 )
     {
         vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
     }
@@ -179,7 +179,7 @@ void Compute_Total_Force( reax_system *system, control_params *control,
     int i;
     reax_list *bonds;
 
-    bonds = (*lists) + BONDS;
+    bonds = &(*lists)[BONDS];
 
 #ifdef _OPENMP
     #pragma omp parallel default(shared)
@@ -233,8 +233,8 @@ void Validate_Lists( static_storage *workspace, reax_list **lists, int step, int
     int i, flag;
     reax_list *bonds, *hbonds;
 
-    bonds = *lists + BONDS;
-    hbonds = *lists + HBONDS;
+    bonds = &(*lists)[BONDS];
+    hbonds = &(*lists)[HBONDS];
 
     /* far neighbors */
     if ( Htop > Hmax * DANGER_ZONE )
@@ -625,23 +625,24 @@ void Init_Forces( reax_system *system, control_params *control,
     bond_data *ibond, *jbond;
     bond_order_data *bo_ij, *bo_ji;
 
-    far_nbrs = *lists + FAR_NBRS;
-    bonds = *lists + BONDS;
-    hbonds = *lists + HBONDS;
+    far_nbrs = &(*lists)[FAR_NBRS];
+    bonds = &(*lists)[BONDS];
+    hbonds = &(*lists)[HBONDS];
     H = workspace->H;
     H_sp = workspace->H_sp;
     Htop = 0;
     H_sp_top = 0;
     num_bonds = 0;
     num_hbonds = 0;
-    btop_i = btop_j = 0;
+    btop_i = 0;
+    btop_j = 0;
 
     for ( i = 0; i < system->N; ++i )
     {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
         H->start[i] = Htop;
         H_sp->start[i] = H_sp_top;
         btop_i = End_Index( i, bonds );
@@ -661,7 +662,7 @@ void Init_Forces( reax_system *system, control_params *control,
 
             flag = 0;
             flag_sp = 0;
-            if ((data->step - data->prev_steps) % control->reneighbor == 0)
+            if ( (data->step - data->prev_steps) % control->reneighbor == 0 )
             {
                 if ( nbr_pj->d <= control->r_cut )
                 {
@@ -678,7 +679,7 @@ void Init_Forces( reax_system *system, control_params *control,
                 }
             }
             else if ((nbr_pj->d = Sq_Distance_on_T3(atom_i->x, atom_j->x, &(system->box),
-                                                    nbr_pj->dvec)) <= SQR(control->r_cut))
+                            nbr_pj->dvec)) <= SQR(control->r_cut))
             {
                 if ( nbr_pj->d <= SQR(control->r_sp_cut))
                 {
@@ -738,7 +739,7 @@ void Init_Forces( reax_system *system, control_params *control,
                 {
                     r2 = SQR( r_ij );
 
-                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
                     {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
@@ -749,7 +750,7 @@ void Init_Forces( reax_system *system, control_params *control,
                         C12 = 0.0;
                     }
 
-                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
                     {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
@@ -760,7 +761,7 @@ void Init_Forces( reax_system *system, control_params *control,
                         C34 = 0.0;
                     }
 
-                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
                     {
                         C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
                         BO_pi2 = EXP( C56 );
@@ -937,31 +938,34 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
     bond_data *ibond, *jbond;
     bond_order_data *bo_ij, *bo_ji;
 
-    far_nbrs = *lists + FAR_NBRS;
-    bonds = *lists + BONDS;
-    hbonds = *lists + HBONDS;
-
+    far_nbrs = &(*lists)[FAR_NBRS];
+    bonds = &(*lists)[BONDS];
+    hbonds = &(*lists)[HBONDS];
     H = workspace->H;
     H_sp = workspace->H_sp;
     Htop = 0;
     H_sp_top = 0;
     num_bonds = 0;
     num_hbonds = 0;
-    btop_i = btop_j = 0;
+    btop_i = 0;
+    btop_j = 0;
 
     for ( i = 0; i < system->N; ++i )
     {
         atom_i = &(system->atoms[i]);
         type_i  = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
+        start_i = Start_Index( i, far_nbrs );
+        end_i = End_Index( i, far_nbrs );
         H->start[i] = Htop;
         H_sp->start[i] = H_sp_top;
         btop_i = End_Index( i, bonds );
         sbp_i = &(system->reaxprm.sbp[type_i]);
         ihb = ihb_top = -1;
+
         if ( control->hb_cut > 0 && (ihb = sbp_i->p_hbond) == 1 )
+        {
             ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+        }
 
         for ( pj = start_i; pj < end_i; ++pj )
         {
@@ -971,7 +975,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
             flag = 0;
             flag_sp = 0;
-            if ((data->step - data->prev_steps) % control->reneighbor == 0)
+            if ( (data->step - data->prev_steps) % control->reneighbor == 0 )
             {
                 if (nbr_pj->d <= control->r_cut)
                 {
@@ -988,7 +992,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                 }
             }
             else if ((nbr_pj->d = Sq_Distance_on_T3(atom_i->x, atom_j->x, &(system->box),
-                                                    nbr_pj->dvec)) <= SQR(control->r_cut))
+                            nbr_pj->dvec)) <= SQR(control->r_cut))
             {
                 if ( nbr_pj->d <= SQR(control->r_sp_cut))
                 {
@@ -1006,8 +1010,8 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                 twbp = &(system->reaxprm.tbp[type_i][type_j]);
 
                 H->j[Htop] = j;
-                H->val[Htop] = Init_Charge_Matrix_Entry_Tab( system, control,
-                        workspace->LR, i, j, r_ij, OFF_DIAGONAL );
+                H->val[Htop] = Init_Charge_Matrix_Entry( system, control, i, j,
+                        r_ij, OFF_DIAGONAL );
                 ++Htop;
 
                 /* H_sp matrix entry */
@@ -1048,7 +1052,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                 {
                     r2 = SQR( r_ij );
 
-                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0)
+                    if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 )
                     {
                         C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
                         BO_s = (1.0 + control->bo_cut) * EXP( C12 );
@@ -1059,7 +1063,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         C12 = 0.0;
                     }
 
-                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0)
+                    if ( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0 )
                     {
                         C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
                         BO_pi = EXP( C34 );
@@ -1070,7 +1074,7 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         C34 = 0.0;
                     }
 
-                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0)
+                    if ( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0 )
                     {
                         C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
                         BO_pi2 = EXP( C56 );
@@ -1097,7 +1101,6 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
                         ibond->d = r_ij;
                         jbond->d = r_ij;
                         rvec_Copy( ibond->dvec, nbr_pj->dvec );
-                        //fprintf (stderr, " %f - %f - %f \n", nbr_pj->dvec[0], nbr_pj->dvec[1], nbr_pj->dvec[2]);
                         rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
                         ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
                         ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
@@ -1162,8 +1165,8 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
 
         /* diagonal entry */
         H->j[Htop] = i;
-        H->val[Htop] = Init_Charge_Matrix_Entry_Tab( system, control, workspace->LR,
-                i, j, r_ij, DIAGONAL );
+        H->val[Htop] = Init_Charge_Matrix_Entry( system, control, i, i,
+                r_ij, DIAGONAL );
         ++Htop;
 
         H_sp->j[H_sp_top] = i;
@@ -1177,12 +1180,16 @@ void Init_Forces_Tab( reax_system *system, control_params *control,
         }
     }
 
+    Init_Charge_Matrix_Remaining_Entries( system, control, far_nbrs,
+            H, H_sp, &Htop, &H_sp_top );
+
     // mark the end of j list
-    H->start[i] = Htop;
-    H_sp->start[i] = H_sp_top;
+    H->start[system->N_cm] = Htop;
+    H_sp->start[system->N_cm] = H_sp_top;
+
     /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists, data->step, system->N,
-            H->m, Htop, num_bonds, num_hbonds );
+    Validate_Lists( workspace, lists,
+            data->step, system->N, H->m, Htop, num_bonds, num_hbonds );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n",
@@ -1304,7 +1311,7 @@ void Compute_Forces( reax_system *system, control_params *control,
     real t_start, t_elapsed;
 
     t_start = Get_Time( );
-    if ( !control->tabulate )
+    if ( control->tabulate <= 0 )
     {
         Init_Forces( system, control, data, workspace, lists, out_control );
     }
diff --git a/sPuReMD/src/four_body_interactions.c b/sPuReMD/src/four_body_interactions.c
index 26a15b7499a0da34f01651e2b7b36822008aa477..22d104763a7aaaca6c97dfed1619ff96e11672a3 100644
--- a/sPuReMD/src/four_body_interactions.c
+++ b/sPuReMD/src/four_body_interactions.c
@@ -164,8 +164,8 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
     p_tor3 = system->reaxprm.gp.l[24];
     p_tor4 = system->reaxprm.gp.l[25];
     p_cot2 = system->reaxprm.gp.l[27];
-    bonds = (*lists) + BONDS;
-    thb_intrs = (*lists) + THREE_BODIES;
+    bonds = &(*lists)[BONDS];
+    thb_intrs = &(*lists)[THREE_BODIES];
     e_tor_total = 0.0;
     e_con_total = 0.0;
 
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index 01b0fc5593bbdb562b374b86227764edbfbc553e..c809655fa4b9796f09122388657b05229f5f96ad 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -335,11 +335,14 @@ void Init_Workspace( reax_system *system, control_params *control,
             break;
     }
 
+    /* sparse matrices */
     workspace->H = NULL;
+    workspace->H_full = NULL;
     workspace->H_sp = NULL;
-    workspace->L = NULL;
+    workspace->H_p = NULL;
     workspace->H_spar_patt = NULL;
     workspace->H_app_inv = NULL;
+    workspace->L = NULL;
     workspace->U = NULL;
     workspace->Hdia_inv = NULL;
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
@@ -348,6 +351,7 @@ void Init_Workspace( reax_system *system, control_params *control,
         workspace->droptol = (real *) scalloc( system->N_cm, sizeof( real ),
                 "Init_Workspace::workspace->droptol" );
     }
+
     //TODO: check if unused
     //workspace->w        = (real *) scalloc( cm_lin_sys_size, sizeof( real ),
     //"Init_Workspace::workspace->droptol" );
@@ -497,6 +501,106 @@ void Init_Workspace( reax_system *system, control_params *control,
             break;
     }
 
+    /* SpMV related */
+#ifdef _OPENMP
+    workspace->b_local = (real*) smalloc( control->num_threads * system->N_cm * sizeof(real),
+            "Init_Workspace::b_local" );
+#endif
+
+    /* level scheduling related */
+    workspace->levels_L = 1;
+    workspace->levels_U = 1;
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_LEVEL_SCHED_PA ||
+            control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        workspace->row_levels_L = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int),
+                "Init_Workspace::row_levels_L" );
+        workspace->level_rows_L = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int),
+                "Init_Workspace::level_rows_L" );
+        workspace->level_rows_cnt_L = (unsigned int*) smalloc( (system->N_cm + 1) * sizeof(unsigned int),
+                "Init_Workspace::level_rows_cnt_L" );
+        workspace->row_levels_U = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int),
+                "Init_Workspace::row_levels_U" );
+        workspace->level_rows_U = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int),
+                "Init_Workspace::level_rows_U" );
+        workspace->level_rows_cnt_U = (unsigned int*) smalloc( (system->N_cm + 1) * sizeof(unsigned int),
+                "Init_Workspace::level_rows_cnt_U" );
+        workspace->top = (unsigned int*) smalloc( (system->N_cm + 1) * sizeof(unsigned int),
+                "Init_Workspace::top" );
+    }
+    else
+    {
+        workspace->row_levels_L = NULL;
+        workspace->level_rows_L = NULL;
+        workspace->level_rows_cnt_L = NULL;
+        workspace->row_levels_U = NULL;
+        workspace->level_rows_U = NULL;
+        workspace->level_rows_cnt_U = NULL;
+        workspace->top = NULL;
+    }
+
+    /* graph coloring related */
+    workspace->recolor_cnt = 0;
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        workspace->color = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                "Init_Workspace::color" );
+        workspace->to_color = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                "Init_Workspace::to_color" );
+        workspace->conflict = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                "setup_graph_coloring::conflict" );
+        workspace->conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (control->num_threads + 1),
+                "Init_Workspace::conflict_cnt" );
+        workspace->recolor = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                "Init_Workspace::recolor" );
+        workspace->color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (system->N_cm + 1),
+                "Init_Workspace::color_top" );
+        workspace->permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                "Init_Workspace::premuted_row_col" );
+        workspace->permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm,
+                "Init_Workspace::premuted_row_col_inv" );
+        workspace->y_p = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::y_p" );
+        workspace->x_p = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::x_p" );
+    }
+    else
+    {
+        workspace->color = NULL;
+        workspace->to_color = NULL;
+        workspace->conflict = NULL;
+        workspace->conflict_cnt = NULL;
+        workspace->recolor = NULL;
+        workspace->color_top = NULL;
+        workspace->permuted_row_col = NULL;
+        workspace->permuted_row_col_inv = NULL;
+        workspace->y_p = NULL;
+        workspace->x_p = NULL;
+    }
+
+    /* Jacobi iteration related */
+    if ( control->cm_solver_pre_app_type == JACOBI_ITER_PA )
+    {
+        workspace->Dinv_L = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::Dinv_L" );
+        workspace->Dinv_U = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::Dinv_U" );
+        workspace->Dinv_b = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::Dinv_b" );
+        workspace->rp = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::rp" );
+        workspace->rp2 = (real*) smalloc( sizeof(real) * system->N_cm,
+                "Init_Workspace::rp2" );
+    }
+    else
+    {
+        workspace->Dinv_L = NULL;
+        workspace->Dinv_U = NULL;
+        workspace->Dinv_b = NULL;
+        workspace->rp = NULL;
+        workspace->rp2 = NULL;
+    }
+
     /* integrator storage */
     workspace->a = (rvec *) smalloc( system->N * sizeof( rvec ),
            "Init_Workspace::workspace->a" );
@@ -590,7 +694,7 @@ void Init_Lists( reax_system *system, control_params *control,
 
     num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
 
-    Make_List( system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists) + FAR_NBRS );
+    Make_List( system->N, num_nbrs, TYP_FAR_NEIGHBOR, &(*lists)[FAR_NBRS] );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
@@ -669,7 +773,7 @@ void Init_Lists( reax_system *system, control_params *control,
         else
         {
             Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index,
-                    hb_top, (*lists) + HBONDS );
+                    hb_top, &(*lists)[HBONDS] );
         }
 
 #if defined(DEBUG_FOCUS)
@@ -681,7 +785,7 @@ void Init_Lists( reax_system *system, control_params *control,
     }
 
     /* bonds list */
-    Allocate_Bond_List( system->N, bond_top, (*lists) + BONDS );
+    Allocate_Bond_List( system->N, bond_top, &(*lists)[BONDS] );
     num_bonds = bond_top[system->N - 1];
 
 #if defined(DEBUG_FOCUS)
@@ -691,7 +795,7 @@ void Init_Lists( reax_system *system, control_params *control,
 #endif
 
     /* 3bodies list */
-    Make_List( num_bonds, num_3body, TYP_THREE_BODY, (*lists) + THREE_BODIES );
+    Make_List( num_bonds, num_3body, TYP_THREE_BODY, &(*lists)[THREE_BODIES] );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
@@ -700,9 +804,9 @@ void Init_Lists( reax_system *system, control_params *control,
 #endif
 
 #ifdef TEST_FORCES
-    Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA );
+    Make_List( system->N, num_bonds * 8, TYP_DDELTA, &(*lists)[DDELTA] );
 
-    Make_List( num_bonds, num_bonds * MAX_BONDS * 3, TYP_DBO, (*lists) + DBO );
+    Make_List( num_bonds, num_bonds * MAX_BONDS * 3, TYP_DBO, &(*lists)[DBO] );
 #endif
 
     sfree( hb_top, "Init_Lists::hb_top" );
@@ -949,11 +1053,13 @@ void Initialize( reax_system *system, control_params *control,
 #endif
 
 #ifdef _OPENMP
-    #pragma omp parallel default(shared)
+    #pragma omp parallel default(none) shared(control)
     {
         #pragma omp single
         control->num_threads = omp_get_num_threads( );
     }
+#else
+    control->num_threads = 1;
 #endif
 
     Randomize( );
@@ -1086,6 +1192,11 @@ void Finalize_Workspace( reax_system *system, control_params *control,
         Deallocate_Matrix( workspace->H_spar_patt_full );
         Deallocate_Matrix( workspace->H_app_inv );
     }
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        Deallocate_Matrix( workspace->H_full );
+        Deallocate_Matrix( workspace->H_p );
+    }
 
     for ( i = 0; i < 5; ++i )
     {
@@ -1160,6 +1271,49 @@ void Finalize_Workspace( reax_system *system, control_params *control,
             break;
     }
 
+    /* SpMV related */
+#ifdef _OPENMP
+    sfree( workspace->b_local, "Finalize_Workspace::b_local" );
+#endif
+
+    /* level scheduling related */
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_LEVEL_SCHED_PA ||
+            control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        sfree( workspace->row_levels_L, "Finalize_Workspace::row_levels_L" );
+        sfree( workspace->level_rows_L, "Finalize_Workspace::level_rows_L" );
+        sfree( workspace->level_rows_cnt_L, "Finalize_Workspace::level_rows_cnt_L" );
+        sfree( workspace->row_levels_U, "Finalize_Workspace::row_levels_U" );
+        sfree( workspace->level_rows_U, "Finalize_Workspace::level_rows_U" );
+        sfree( workspace->level_rows_cnt_U, "Finalize_Workspace::level_rows_cnt_U" );
+        sfree( workspace->top, "Finalize_Workspace::top" );
+    }
+
+    /* graph coloring related */
+    if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA )
+    {
+        sfree( workspace->color, "Finalize_Workspace::workspace->color" );
+        sfree( workspace->to_color, "Finalize_Workspace::workspace->to_color" );
+        sfree( workspace->conflict, "Finalize_Workspace::workspace->conflict" );
+        sfree( workspace->conflict_cnt, "Finalize_Workspace::workspace->conflict_cnt" );
+        sfree( workspace->recolor, "Finalize_Workspace::workspace->recolor" );
+        sfree( workspace->color_top, "Finalize_Workspace::workspace->color_top" );
+        sfree( workspace->permuted_row_col, "Finalize_Workspace::workspace->permuted_row_col" );
+        sfree( workspace->permuted_row_col_inv, "Finalize_Workspace::workspace->permuted_row_col_inv" );
+        sfree( workspace->y_p, "Finalize_Workspace::workspace->y_p" );
+        sfree( workspace->x_p, "Finalize_Workspace::workspace->x_p" );
+    }
+
+    /* Jacobi iteration related */
+    if ( control->cm_solver_pre_app_type == JACOBI_ITER_PA )
+    {
+        sfree( workspace->Dinv_L, "Finalize_Workspace::Dinv_L" );
+        sfree( workspace->Dinv_U, "Finalize_Workspace::Dinv_U" );
+        sfree( workspace->Dinv_b, "Finalize_Workspace::Dinv_b" );
+        sfree( workspace->rp, "Finalize_Workspace::rp" );
+        sfree( workspace->rp2, "Finalize_Workspace::rp2" );
+    }
+
     /* integrator storage */
     sfree( workspace->a, "Finalize_Workspace::workspace->a" );
     sfree( workspace->f_old, "Finalize_Workspace::workspace->f_old" );
@@ -1217,17 +1371,17 @@ void Finalize_Workspace( reax_system *system, control_params *control,
 
 void Finalize_Lists( control_params *control, reax_list **lists )
 {
-    Delete_List( TYP_FAR_NEIGHBOR, (*lists) + FAR_NBRS );
+    Delete_List( TYP_FAR_NEIGHBOR, &(*lists)[FAR_NBRS] );
     if ( control->hb_cut > 0.0 )
     {
-        Delete_List( TYP_HBOND, (*lists) + HBONDS );
+        Delete_List( TYP_HBOND, &(*lists)[HBONDS] );
     }
-    Delete_List( TYP_BOND, (*lists) + BONDS );
-    Delete_List( TYP_THREE_BODY, (*lists) + THREE_BODIES );
+    Delete_List( TYP_BOND, &(*lists)[BONDS] );
+    Delete_List( TYP_THREE_BODY, &(*lists)[THREE_BODIES] );
 
 #ifdef TEST_FORCES
-    Delete_List( TYP_DDELTA, (*lists) + DDELTA );
-    Delete_List( TYP_DBO, (*lists) + DBO );
+    Delete_List( TYP_DDELTA, &(*lists)[DDELTA] );
+    Delete_List( TYP_DBO, &(*lists)[DBO] );
 #endif
 }
 
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index c90f742ad9269d192ea55fb98e705b89bb767117..11b7d9801d5867f42080116b7322e650ed0604aa 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -50,52 +50,6 @@ enum preconditioner_type
 };
 
 
-/* global to make OpenMP shared (Sparse_MatVec) */
-#ifdef _OPENMP
-static real *b_local = NULL;
-#endif
-/* global to make OpenMP shared (apply_preconditioner) */
-static real *Dinv_L = NULL;
-static real *Dinv_U = NULL;
-/* global to make OpenMP shared (tri_solve_level_sched) */
-static int levels = 1;
-static int levels_L = 1;
-static int levels_U = 1;
-static unsigned int *row_levels_L = NULL;
-static unsigned int *level_rows_L = NULL;
-static unsigned int *level_rows_cnt_L = NULL;
-static unsigned int *row_levels_U = NULL;
-static unsigned int *level_rows_U = NULL;
-static unsigned int *level_rows_cnt_U = NULL;
-static unsigned int *row_levels;
-static unsigned int *level_rows;
-static unsigned int *level_rows_cnt;
-static unsigned int *top = NULL;
-/* global to make OpenMP shared (graph_coloring) */
-static unsigned int *color = NULL;
-static unsigned int *to_color = NULL;
-static unsigned int *conflict = NULL;
-static unsigned int *conflict_cnt = NULL;
-static unsigned int *temp_ptr;
-static unsigned int *recolor = NULL;
-static unsigned int recolor_cnt;
-static unsigned int *color_top = NULL;
-/* global to make OpenMP shared (sort_colors) */
-static unsigned int *permuted_row_col = NULL;
-static unsigned int *permuted_row_col_inv = NULL;
-static real *y_p = NULL;
-/* global to make OpenMP shared (permute_vector) */
-static real *x_p = NULL;
-static unsigned int *mapping = NULL;
-static sparse_matrix *H_full;
-static sparse_matrix *H_p;
-/* global to make OpenMP shared (jacobi_iter) */
-static real *Dinv_b = NULL;
-static real *rp = NULL;
-static real *rp2 = NULL;
-static real *rp3 = NULL;
-
-
 #if defined(TEST_MAT)
 static sparse_matrix * create_test_mat( void )
 {
@@ -225,7 +179,15 @@ static void compute_full_sparse_matrix( const sparse_matrix * const A,
             exit( INSUFFICIENT_MEMORY );
         }
     }
-
+    else if ( (*A_full)->m < 2 * A->m - A->n )
+    {
+        Deallocate_Matrix( *A_full );
+        if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE )
+        {
+            fprintf( stderr, "not enough memory for full A. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
 
     if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
     {
@@ -277,14 +239,14 @@ static void compute_full_sparse_matrix( const sparse_matrix * const A,
  *   A has non-zero diagonals
  *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
 void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix ** A_full,
-                                  sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full,
-                                  sparse_matrix ** A_app_inv, const real filter )
+              sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full,
+                    sparse_matrix ** A_app_inv, const real filter )
 {
     int i, pj, size;
-    real min, max, threshold, val;
-
-    min = 0.0;
-    max = 0.0;
+    int left, right, k, p, turn;
+    real pivot, tmp;
+    real threshold;
+    real *list;
 
     if ( *A_spar_patt == NULL )
     {
@@ -304,34 +266,87 @@ void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix *
         }
     }
 
-    /* find min and max elements of matrix */
-    for ( i = 0; i < A->n; ++i )
+
+    /* quick-select algorithm for finding the kth greatest element in the matrix*/
+    /* list: values from the matrix*/
+    /* left-right: search space of the quick-select */
+
+    list = (real *) smalloc( sizeof(real) * (A->start[A->n]),"Sparse_Approx_Inverse::list" );
+
+    left = 0;
+    right = A->start[A->n] - 1;
+    k = (int)( (A->start[A->n])*filter );
+    threshold = 0.0;
+
+    for( i = left; i <= right ; ++i )
     {
-        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
+        list[i] = A->val[i];
+        if(list[i] < 0.0)
         {
-            val = A->val[pj];
-            if ( pj == 0 )
-            {
-                min = val;
-                max = val;
-            }
-            else
+            list[i] = -list[i];
+        }
+    }
+
+    turn = 0;
+    while( k ) {
+
+        p  = left;
+        turn = 1 - turn;
+        if( turn == 1)
+        {
+            pivot = list[right];
+        }
+        else
+        {
+            pivot = list[left];
+        }
+        for( i = left + 1 - turn; i <= right-turn; ++i )
+        {
+            if( list[i] > pivot )
             {
-                if ( min > val )
-                {
-                    min = val;
-                }
-                if ( max < val )
-                {
-                    max = val;
-                }
+                tmp = list[i];
+                list[i] = list[p];
+                list[p] = tmp;
+                p++;
             }
         }
+        if(turn == 1)
+        {
+            tmp = list[p];
+            list[p] = list[right];
+            list[right] = tmp;
+        }
+        else
+        {
+            tmp = list[p];
+            list[p] = list[left];
+            list[left] = tmp;
+        }
+
+        if( p == k - 1)
+        {
+            threshold = list[p];
+            break;
+        }
+        else if( p > k - 1 )
+        {
+            right = p - 1;
+        }
+        else
+        {
+            left = p + 1;
+        }
+    }
+
+    if(threshold < 1.000000)
+    {
+        threshold = 1.000001;
     }
 
-    threshold = min + (max - min) * (1.0 - filter);
+    sfree( list, "setup_sparse_approx_inverse::list" );
 
     /* fill sparsity pattern */
+    /* diagonal entries are always included */
     for ( size = 0, i = 0; i < A->n; ++i )
     {
         (*A_spar_patt)->start[i] = size;
@@ -352,7 +367,7 @@ void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix *
     compute_full_sparse_matrix( *A_spar_patt, A_spar_patt_full );
 
     /* A_app_inv has the same sparsity pattern
-     * as A_spar_patt_full (omit non-zero values) */
+     * * as A_spar_patt_full (omit non-zero values) */
     if ( Allocate_Matrix( A_app_inv, (*A_spar_patt_full)->n, (*A_spar_patt_full)->m ) == FAILURE )
     {
         fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
@@ -360,6 +375,7 @@ void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix *
     }
 }
 
+
 void Calculate_Droptol( const sparse_matrix * const A,
                         real * const droptol, const real dtol )
 {
@@ -379,8 +395,6 @@ void Calculate_Droptol( const sparse_matrix * const A,
 
         #pragma omp master
         {
-            /* keep b_local for program duration to avoid allocate/free
-             * overhead per Sparse_MatVec call*/
             if ( droptol_local == NULL )
             {
                 droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real),
@@ -1622,11 +1636,23 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
 
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
 /* Compute M^{1} \approx A which minimizes
+ *  \sum_{j=1}^{N} ||e_j - Am_j||_2^2
+ *  where: e_j is the j-th column of the NxN identify matrix,
+ *         m_j is the j-th column of the NxN approximate sparse matrix M
+ * 
+ * Internally, use LAPACKE to solve the least-squares problems
  *
  * A: symmetric, sparse matrix, stored in full CSR format
  * A_spar_patt: sparse matrix used as template sparsity pattern
  *   for approximating the inverse, stored in full CSR format
- * A_app_inv: approximate inverse to A, stored in full CSR format (result) */
+ * A_app_inv: approximate inverse to A, stored in full CSR format (result)
+ *
+ * Reference:
+ * Michele Benzi et al.
+ * A Comparative Study of Sparse Approximate Inverse
+ *  Preconditioners
+ * Applied Numerical Mathematics 30, 1999
+ * */
 real sparse_approx_inverse( const sparse_matrix * const A,
                             const sparse_matrix * const A_spar_patt,
                             sparse_matrix ** A_app_inv )
@@ -1719,7 +1745,7 @@ real sparse_approx_inverse( const sparse_matrix * const A,
                 }
                 // change the value if any of the column indices is seen
                 for ( d_j = A->start[pos_x[d_i]];
-                        d_j < A->start[pos_x[d_i + 1]]; ++d_j )
+                        d_j < A->start[pos_x[d_i] + 1]; ++d_j )
                 {
                     if ( Y[A->j[d_j]] == 1 )
                     {
@@ -1794,12 +1820,13 @@ real sparse_approx_inverse( const sparse_matrix * const A,
 
 
 /* sparse matrix-vector product Ax = b
- * where:
- *   A: lower triangular matrix, stored in CSR format
- *   x: vector
- *   b: vector (result) */
-static void Sparse_MatVec( const sparse_matrix * const A,
-                           const real * const x, real * const b )
+ *
+ * workspace: storage container for workspace structures
+ * A: lower triangular matrix, stored in CSR format
+ * x: vector
+ * b (output): vector */
+static void Sparse_MatVec( const static_storage * const workspace,
+        const sparse_matrix * const A, const real * const x, real * const b )
 {
     int i, j, k, n, si, ei;
     real H;
@@ -1813,18 +1840,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 #ifdef _OPENMP
     tid = omp_get_thread_num( );
 
-    #pragma omp single
-    {
-        /* keep b_local for program duration to avoid allocate/free
-         * overhead per Sparse_MatVec call*/
-        if ( b_local == NULL )
-        {
-            b_local = (real*) smalloc( omp_get_num_threads() * n * sizeof(real),
-                                       "Sparse_MatVec::b_local" );
-        }
-    }
-
-    Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n );
+    Vector_MakeZero( workspace->b_local, omp_get_num_threads() * n );
 
     #pragma omp for schedule(static)
 #endif
@@ -1838,8 +1854,8 @@ static void Sparse_MatVec( const sparse_matrix * const A,
             j = A->j[k];
             H = A->val[k];
 #ifdef _OPENMP
-            b_local[tid * n + j] += H * x[i];
-            b_local[tid * n + i] += H * x[j];
+            workspace->b_local[tid * n + j] += H * x[i];
+            workspace->b_local[tid * n + i] += H * x[j];
 #else
             b[j] += H * x[i];
             b[i] += H * x[j];
@@ -1848,7 +1864,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 
         // the diagonal entry is the last one in
 #ifdef _OPENMP
-        b_local[tid * n + i] += A->val[k] * x[i];
+        workspace->b_local[tid * n + i] += A->val[k] * x[i];
 #else
         b[i] += A->val[k] * x[i];
 #endif
@@ -1860,7 +1876,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
     {
         for ( j = 0; j < omp_get_num_threads(); ++j )
         {
-            b[i] += b_local[j * n + i];
+            b[i] += workspace->b_local[j * n + i];
         }
     }
 #endif
@@ -2042,6 +2058,7 @@ void tri_solve( const sparse_matrix * const LU, const real * const y,
 
 /* Solve triangular system LU*x = y using level scheduling
  *
+ * workspace: storage container for workspace structures
  * LU: lower/upper triangular, stored in CSR
  * y: constants in linear system (RHS)
  * x: solution
@@ -2052,53 +2069,38 @@ void tri_solve( const sparse_matrix * const LU, const real * const y,
  * Assumptions:
  *   LU has non-zero diagonals
  *   Each row of LU has at least one non-zero (i.e., no rows with all zeros) */
-void tri_solve_level_sched( const sparse_matrix * const LU,
-                            const real * const y, real * const x, const int N,
-                            const TRIANGULARITY tri, int find_levels )
+void tri_solve_level_sched( static_storage * workspace,
+        const sparse_matrix * const LU,
+        const real * const y, real * const x, const int N,
+        const TRIANGULARITY tri, int find_levels )
 {
     int i, j, pj, local_row, local_level;
+    unsigned int *row_levels, *level_rows, *level_rows_cnt;
+    int levels;
+
+    if ( tri == LOWER )
+    {
+        row_levels = workspace->row_levels_L;
+        level_rows = workspace->level_rows_L;
+        level_rows_cnt = workspace->level_rows_cnt_L;
+    }
+    else
+    {
+        row_levels = workspace->row_levels_U;
+        level_rows = workspace->level_rows_U;
+        level_rows_cnt = workspace->level_rows_cnt_U;
+    }
 
 #ifdef _OPENMP
     #pragma omp single
 #endif
     {
-        if ( tri == LOWER )
-        {
-            row_levels = row_levels_L;
-            level_rows = level_rows_L;
-            level_rows_cnt = level_rows_cnt_L;
-            levels = levels_L;
-        }
-        else
-        {
-            row_levels = row_levels_U;
-            level_rows = level_rows_U;
-            level_rows_cnt = level_rows_cnt_U;
-            levels = levels_U;
-        }
-
-        if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
-        {
-            row_levels = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int),
-                                                  "tri_solve_level_sched::row_levels" );
-            level_rows = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int),
-                                                  "tri_solve_level_sched::level_rows" );
-            level_rows_cnt = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int),
-                             "tri_solve_level_sched::level_rows_cnt" );
-        }
-
-        if ( top == NULL )
-        {
-            top = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int),
-                                           "tri_solve_level_sched::top" );
-        }
-
         /* find levels (row dependencies in substitutions) */
         if ( find_levels == TRUE )
         {
             memset( row_levels, 0, N * sizeof(unsigned int) );
             memset( level_rows_cnt, 0, N * sizeof(unsigned int) );
-            memset( top, 0, N * sizeof(unsigned int) );
+            memset( workspace->top, 0, N * sizeof(unsigned int) );
             levels = 1;
 
             if ( tri == LOWER )
@@ -2116,6 +2118,8 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                     ++level_rows_cnt[local_level];
                 }
 
+                workspace->levels_L = levels;
+
                 //#if defined(DEBUG)
                 fprintf(stderr, "levels(L): %d\n", levels);
                 fprintf(stderr, "NNZ(L): %d\n", LU->start[N]);
@@ -2136,6 +2140,8 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                     ++level_rows_cnt[local_level];
                 }
 
+                workspace->levels_U = levels;
+
                 //#if defined(DEBUG)
                 fprintf(stderr, "levels(U): %d\n", levels);
                 fprintf(stderr, "NNZ(U): %d\n", LU->start[N]);
@@ -2145,17 +2151,26 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
             for ( i = 1; i < levels + 1; ++i )
             {
                 level_rows_cnt[i] += level_rows_cnt[i - 1];
-                top[i] = level_rows_cnt[i];
+                workspace->top[i] = level_rows_cnt[i];
             }
 
             for ( i = 0; i < N; ++i )
             {
-                level_rows[top[row_levels[i] - 1]] = i;
-                ++top[row_levels[i] - 1];
+                level_rows[workspace->top[row_levels[i] - 1]] = i;
+                ++workspace->top[row_levels[i] - 1];
             }
         }
     }
 
+    if ( tri == LOWER )
+    {
+        levels = workspace->levels_L;
+    }
+    else
+    {
+        levels = workspace->levels_U;
+    }
+
     /* perform substitutions by level */
     if ( tri == LOWER )
     {
@@ -2171,7 +2186,6 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                 for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                 {
                     x[local_row] -= LU->val[pj] * x[LU->j[pj]];
-
                 }
                 x[local_row] /= LU->val[pj];
             }
@@ -2191,79 +2205,18 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
                 for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                 {
                     x[local_row] -= LU->val[pj] * x[LU->j[pj]];
-
                 }
                 x[local_row] /= LU->val[LU->start[local_row]];
             }
         }
     }
-
-#ifdef _OPENMP
-    #pragma omp single
-#endif
-    {
-        /* save level info for re-use if performing repeated triangular solves via preconditioning */
-        if ( tri == LOWER )
-        {
-            row_levels_L = row_levels;
-            level_rows_L = level_rows;
-            level_rows_cnt_L = level_rows_cnt;
-            levels_L = levels;
-        }
-        else
-        {
-            row_levels_U = row_levels;
-            level_rows_U = level_rows;
-            level_rows_cnt_U = level_rows_cnt;
-            levels_U = levels;
-        }
-    }
-}
-
-
-static void compute_H_full( const sparse_matrix * const H )
-{
-    int count, i, pj;
-    sparse_matrix *H_t;
-
-    if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for full H. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    /* Set up the sparse matrix data structure for A. */
-    Transpose( H, H_t );
-
-    count = 0;
-    for ( i = 0; i < H->n; ++i )
-    {
-        H_full->start[i] = count;
-
-        /* H: symmetric, lower triangular portion only stored */
-        for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj )
-        {
-            H_full->val[count] = H->val[pj];
-            H_full->j[count] = H->j[pj];
-            ++count;
-        }
-        /* H^T: symmetric, upper triangular portion only stored;
-         * skip diagonal from H^T, as included from H above */
-        for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj )
-        {
-            H_full->val[count] = H_t->val[pj];
-            H_full->j[count] = H_t->j[pj];
-            ++count;
-        }
-    }
-    H_full->start[i] = count;
-
-    Deallocate_Matrix( H_t );
 }
 
 
 /* Iterative greedy shared-memory parallel graph coloring
  *
+ * control: container for control info
+ * workspace: storage container for workspace structures
  * A: matrix to use for coloring, stored in CSR format;
  *   rows represent vertices, columns of entries within a row represent adjacent vertices
  *   (i.e., dependent rows for elimination during LU factorization)
@@ -2276,31 +2229,40 @@ static void compute_H_full( const sparse_matrix * const H )
  *  and Massively Threaded Architectures
  * Parallel Computing, 2012
  */
-void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
+void graph_coloring( const control_params * const control,
+        static_storage * workspace,
+        const sparse_matrix * const A, const TRIANGULARITY tri )
 {
 #ifdef _OPENMP
     #pragma omp parallel
 #endif
     {
-#define MAX_COLOR (500)
         int i, pj, v;
         unsigned int temp, recolor_cnt_local, *conflict_local;
-        int tid, num_thread, *fb_color;
+        int tid, *fb_color;
+        unsigned int *p_to_color, *p_conflict, *p_temp;
 
 #ifdef _OPENMP
-        tid = omp_get_thread_num();
-        num_thread = omp_get_num_threads();
+        tid = omp_get_thread_num( );
 #else
         tid = 0;
-        num_thread = 1;
 #endif
+        p_to_color = workspace->to_color;
+        p_conflict = workspace->conflict;
+
+#ifdef _OPENMP
+        #pragma omp for schedule(static)
+#endif
+        for ( i = 0; i < A->n; ++i )
+        {
+            workspace->color[i] = 0;
+        }
 
 #ifdef _OPENMP
         #pragma omp single
 #endif
         {
-            memset( color, 0, sizeof(unsigned int) * A->n );
-            recolor_cnt = A->n;
+            workspace->recolor_cnt = A->n;
         }
 
         /* ordering of vertices to color depends on triangularity of factor
@@ -2312,7 +2274,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 #endif
             for ( i = 0; i < A->n; ++i )
             {
-                to_color[i] = i;
+                p_to_color[i] = i;
             }
         }
         else
@@ -2322,57 +2284,56 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 #endif
             for ( i = 0; i < A->n; ++i )
             {
-                to_color[i] = A->n - 1 - i;
+                p_to_color[i] = A->n - 1 - i;
             }
         }
 
-        fb_color = (int*) smalloc( sizeof(int) * MAX_COLOR,
-                                   "graph_coloring::fb_color" );
+        fb_color = (int*) smalloc( sizeof(int) * A->n,
+                "graph_coloring::fb_color" );
         conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n,
-                         "graph_coloring::fb_color" );
-
-#ifdef _OPENMP
-        #pragma omp barrier
-#endif
+                "graph_coloring::fb_color" );
 
-        while ( recolor_cnt > 0 )
+        while ( workspace->recolor_cnt > 0 )
         {
-            memset( fb_color, -1, sizeof(int) * MAX_COLOR );
+            memset( fb_color, -1, sizeof(int) * A->n );
 
             /* color vertices */
 #ifdef _OPENMP
             #pragma omp for schedule(static)
 #endif
-            for ( i = 0; i < recolor_cnt; ++i )
+            for ( i = 0; i < workspace->recolor_cnt; ++i )
             {
-                v = to_color[i];
+                v = p_to_color[i];
 
                 /* colors of adjacent vertices are forbidden */
                 for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                 {
                     if ( v != A->j[pj] )
                     {
-                        fb_color[color[A->j[pj]]] = v;
+                        fb_color[workspace->color[A->j[pj]]] = v;
                     }
                 }
 
                 /* search for min. color which is not in conflict with adjacent vertices;
                  * start at 1 since 0 is default (invalid) color for all vertices */
-                for ( pj = 1; fb_color[pj] == v; ++pj );
+                for ( pj = 1; fb_color[pj] == v; ++pj )
+                    ;
 
                 /* assign discovered color (no conflict in neighborhood of adjacent vertices) */
-                color[v] = pj;
+                workspace->color[v] = pj;
             }
 
             /* determine if recoloring required */
-            temp = recolor_cnt;
+            temp = workspace->recolor_cnt;
             recolor_cnt_local = 0;
 
 #ifdef _OPENMP
+            #pragma omp barrier
+
             #pragma omp single
 #endif
             {
-                recolor_cnt = 0;
+                workspace->recolor_cnt = 0;
             }
 
 #ifdef _OPENMP
@@ -2380,12 +2341,12 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 #endif
             for ( i = 0; i < temp; ++i )
             {
-                v = to_color[i];
+                v = p_to_color[i];
 
                 /* search for color conflicts with adjacent vertices */
                 for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
                 {
-                    if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
+                    if ( workspace->color[v] == workspace->color[A->j[pj]] && v > A->j[pj] )
                     {
                         conflict_local[recolor_cnt_local] = v;
                         ++recolor_cnt_local;
@@ -2395,134 +2356,107 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             }
 
             /* count thread-local conflicts and compute offsets for copying into shared buffer */
-            conflict_cnt[tid + 1] = recolor_cnt_local;
+            workspace->conflict_cnt[tid + 1] = recolor_cnt_local;
 
 #ifdef _OPENMP
             #pragma omp barrier
 
-            #pragma omp master
+            #pragma omp single
 #endif
             {
-                conflict_cnt[0] = 0;
-                for ( i = 1; i < num_thread + 1; ++i )
+                workspace->conflict_cnt[0] = 0;
+                for ( i = 1; i < control->num_threads + 1; ++i )
                 {
-                    conflict_cnt[i] += conflict_cnt[i - 1];
+                    workspace->conflict_cnt[i] += workspace->conflict_cnt[i - 1];
                 }
-                recolor_cnt = conflict_cnt[num_thread];
+                workspace->recolor_cnt = workspace->conflict_cnt[control->num_threads];
             }
 
-#ifdef _OPENMP
-            #pragma omp barrier
-#endif
-
             /* copy thread-local conflicts into shared buffer */
             for ( i = 0; i < recolor_cnt_local; ++i )
             {
-                conflict[conflict_cnt[tid] + i] = conflict_local[i];
-                color[conflict_local[i]] = 0;
+                p_conflict[workspace->conflict_cnt[tid] + i] = conflict_local[i];
+                workspace->color[conflict_local[i]] = 0;
             }
 
 #ifdef _OPENMP
             #pragma omp barrier
-
-            #pragma omp single
 #endif
-            {
-                temp_ptr = to_color;
-                to_color = conflict;
-                conflict = temp_ptr;
-            }
+            p_temp = p_to_color;
+            p_to_color = p_conflict;
+            p_conflict = p_temp;
         }
 
         sfree( conflict_local, "graph_coloring::conflict_local" );
         sfree( fb_color, "graph_coloring::fb_color" );
-
-        //#if defined(DEBUG)
-        //#ifdef _OPENMP
-        //    #pragma omp master
-        //#endif
-        //    {
-        //        for ( i = 0; i < A->n; ++i )
-        //            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
-        //    }
-        //#endif
-
-#ifdef _OPENMP
-        #pragma omp barrier
-#endif
     }
 }
 
 
-/* Sort coloring
+/* Sort rows by coloring
  *
+ * workspace: storage container for workspace structures
  * n: number of entries in coloring
  * tri: coloring to triangular factor to use (lower/upper)
  */
-void sort_colors( const unsigned int n, const TRIANGULARITY tri )
+void sort_rows_by_colors( const static_storage * const workspace,
+        const unsigned int n, const TRIANGULARITY tri )
 {
     unsigned int i;
 
-    memset( color_top, 0, sizeof(unsigned int) * (n + 1) );
+    memset( workspace->color_top, 0, sizeof(unsigned int) * (n + 1) );
 
     /* sort vertices by color (ascending within a color)
      *  1) count colors
      *  2) determine offsets of color ranges
-     *  3) sort by color
+     *  3) sort rows by color
      *
      *  note: color is 1-based */
     for ( i = 0; i < n; ++i )
     {
-        ++color_top[color[i]];
+        ++workspace->color_top[workspace->color[i]];
     }
     for ( i = 1; i < n + 1; ++i )
     {
-        color_top[i] += color_top[i - 1];
+        workspace->color_top[i] += workspace->color_top[i - 1];
     }
     for ( i = 0; i < n; ++i )
     {
-        permuted_row_col[color_top[color[i] - 1]] = i;
-        ++color_top[color[i] - 1];
+        workspace->permuted_row_col[workspace->color_top[workspace->color[i] - 1]] = i;
+        ++workspace->color_top[workspace->color[i] - 1];
     }
 
     /* invert mapping to get map from current row/column to permuted (new) row/column */
     for ( i = 0; i < n; ++i )
     {
-        permuted_row_col_inv[permuted_row_col[i]] = i;
+        workspace->permuted_row_col_inv[workspace->permuted_row_col[i]] = i;
     }
 }
 
 
 /* Apply permutation Q^T*x or Q*x based on graph coloring
  *
+ * workspace: storage container for workspace structures
  * color: vertex color (1-based); vertices represent matrix rows/columns
  * x: vector to permute (in-place)
  * n: number of entries in x
  * invert_map: if TRUE, use Q^T, otherwise use Q
  * tri: coloring to triangular factor to use (lower/upper)
  */
-static void permute_vector( real * const x, const unsigned int n, const int invert_map,
-                            const TRIANGULARITY tri )
+static void permute_vector( const static_storage * const workspace,
+        real * const x, const unsigned int n, const int invert_map,
+        const TRIANGULARITY tri )
 {
     unsigned int i;
+    unsigned int *mapping;
 
-#ifdef _OPENMP
-    #pragma omp single
-#endif
+    if ( invert_map == TRUE )
     {
-        if ( x_p == NULL )
-        {
-            x_p = (real*) smalloc( sizeof(real) * n, "permute_vector::x_p" );
-        }
-
-        if ( invert_map == TRUE )
-        {
-            mapping = permuted_row_col_inv;
-        }
-        else
-        {
-            mapping = permuted_row_col;
-        }
+        mapping = workspace->permuted_row_col_inv;
+    }
+    else
+    {
+        mapping = workspace->permuted_row_col;
     }
 
 #ifdef _OPENMP
@@ -2530,25 +2464,28 @@ static void permute_vector( real * const x, const unsigned int n, const int inve
 #endif
     for ( i = 0; i < n; ++i )
     {
-        x_p[i] = x[mapping[i]];
+        workspace->x_p[i] = x[mapping[i]];
     }
 
 #ifdef _OPENMP
-    #pragma omp single
+    #pragma omp for schedule(static)
 #endif
+    for ( i = 0; i < n; ++i )
     {
-        memcpy( x, x_p, sizeof(real) * n );
+        x[i] = workspace->x_p[i];
     }
 }
 
 
 /* Apply permutation Q^T*(LU)*Q based on graph coloring
  *
+ * workspace: storage container for workspace structures
  * color: vertex color (1-based); vertices represent matrix rows/columns
  * LU: matrix to permute, stored in CSR format
  * tri: triangularity of LU (lower/upper)
  */
-void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
+void permute_matrix( const static_storage * const workspace,
+        sparse_matrix * const LU, const TRIANGULARITY tri )
 {
     int i, pj, nr, nc;
     sparse_matrix *LUtemp;
@@ -2560,26 +2497,26 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
     }
 
     /* count nonzeros in each row of permuted factor (re-use color_top for counting) */
-    memset( color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
+    memset( workspace->color_top, 0, sizeof(unsigned int) * (LU->n + 1) );
 
     if ( tri == LOWER )
     {
         for ( i = 0; i < LU->n; ++i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc <= nr )
                 {
-                    ++color_top[nr + 1];
+                    ++workspace->color_top[nr + 1];
                 }
                 /* correct entries to maintain triangularity (lower) */
                 else
                 {
-                    ++color_top[nc + 1];
+                    ++workspace->color_top[nc + 1];
                 }
             }
         }
@@ -2588,20 +2525,20 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
     {
         for ( i = LU->n - 1; i >= 0; --i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc >= nr )
                 {
-                    ++color_top[nr + 1];
+                    ++workspace->color_top[nr + 1];
                 }
                 /* correct entries to maintain triangularity (upper) */
                 else
                 {
-                    ++color_top[nc + 1];
+                    ++workspace->color_top[nc + 1];
                 }
             }
         }
@@ -2609,34 +2546,34 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
 
     for ( i = 1; i < LU->n + 1; ++i )
     {
-        color_top[i] += color_top[i - 1];
+        workspace->color_top[i] += workspace->color_top[i - 1];
     }
 
-    memcpy( LUtemp->start, color_top, sizeof(unsigned int) * (LU->n + 1) );
+    memcpy( LUtemp->start, workspace->color_top, sizeof(unsigned int) * (LU->n + 1) );
 
     /* permute factor */
     if ( tri == LOWER )
     {
         for ( i = 0; i < LU->n; ++i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc <= nr )
                 {
-                    LUtemp->j[color_top[nr]] = nc;
-                    LUtemp->val[color_top[nr]] = LU->val[pj];
-                    ++color_top[nr];
+                    LUtemp->j[workspace->color_top[nr]] = nc;
+                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
+                    ++workspace->color_top[nr];
                 }
                 /* correct entries to maintain triangularity (lower) */
                 else
                 {
-                    LUtemp->j[color_top[nc]] = nr;
-                    LUtemp->val[color_top[nc]] = LU->val[pj];
-                    ++color_top[nc];
+                    LUtemp->j[workspace->color_top[nc]] = nr;
+                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
+                    ++workspace->color_top[nc];
                 }
             }
         }
@@ -2645,24 +2582,24 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
     {
         for ( i = LU->n - 1; i >= 0; --i )
         {
-            nr = permuted_row_col_inv[i];
+            nr = workspace->permuted_row_col_inv[i];
 
             for ( pj = LU->start[i]; pj < LU->start[i + 1]; ++pj )
             {
-                nc = permuted_row_col_inv[LU->j[pj]];
+                nc = workspace->permuted_row_col_inv[LU->j[pj]];
 
                 if ( nc >= nr )
                 {
-                    LUtemp->j[color_top[nr]] = nc;
-                    LUtemp->val[color_top[nr]] = LU->val[pj];
-                    ++color_top[nr];
+                    LUtemp->j[workspace->color_top[nr]] = nc;
+                    LUtemp->val[workspace->color_top[nr]] = LU->val[pj];
+                    ++workspace->color_top[nr];
                 }
                 /* correct entries to maintain triangularity (upper) */
                 else
                 {
-                    LUtemp->j[color_top[nc]] = nr;
-                    LUtemp->val[color_top[nc]] = LU->val[pj];
-                    ++color_top[nc];
+                    LUtemp->j[workspace->color_top[nc]] = nr;
+                    LUtemp->val[workspace->color_top[nc]] = LU->val[pj];
+                    ++workspace->color_top[nc];
                 }
             }
         }
@@ -2680,62 +2617,43 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
  *  used for preconditioning (incomplete factorizations computed based on
  *  permuted H)
  *
+ * control: container for control info
+ * workspace: storage container for workspace structures
  * H: symmetric, lower triangular portion only, stored in CSR format;
- *  H is permuted in-place
+ * H_full: symmetric, stored in CSR format;
+ * H_p (output): permuted copy of H based on coloring, lower half stored, CSR format
  */
-sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
+void setup_graph_coloring( const control_params * const control,
+        const static_storage * const workspace, const sparse_matrix * const H,
+        sparse_matrix ** H_full, sparse_matrix ** H_p )
 {
-    int num_thread;
-
-    if ( color == NULL )
+    if ( *H_p == NULL )
     {
-#ifdef _OPENMP
-        #pragma omp parallel
+        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
         {
-            num_thread = omp_get_num_threads();
+            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
+            exit( INSUFFICIENT_MEMORY );
         }
-#else
-        num_thread = 1;
-#endif
-
-        /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */
-        color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                         "setup_graph_coloring::color" );
-        to_color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                            "setup_graph_coloring::to_color" );
-        conflict = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                            "setup_graph_coloring::conflict" );
-        conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (num_thread + 1),
-                                                "setup_graph_coloring::conflict_cnt" );
-        recolor = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                                           "setup_graph_coloring::recolor" );
-        color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (H->n + 1),
-                                             "setup_graph_coloring::color_top" );
-        permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                           "setup_graph_coloring::premuted_row_col" );
-        permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
-                               "setup_graph_coloring::premuted_row_col_inv" );
-        y_p = (real*) smalloc( sizeof(real) * H->n,
-                               "setup_graph_coloring::y_p" );
-        if ( (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) ||
-                (Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) )
-        {
-            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
+    }
+    else if ( (*H_p)->m < H->m )
+    {
+        Deallocate_Matrix( *H_p );
+        if ( Allocate_Matrix( H_p, H->n, H->m ) == FAILURE )
+        {
+            fprintf( stderr, "[ERROR] not enough memory for graph coloring. terminating.\n" );
             exit( INSUFFICIENT_MEMORY );
         }
     }
 
-    compute_H_full( H );
+    compute_full_sparse_matrix( H, H_full );
 
-    graph_coloring( H_full, LOWER );
-    sort_colors( H_full->n, LOWER );
+    graph_coloring( control, (static_storage *) workspace, *H_full, LOWER );
+    sort_rows_by_colors( workspace, (*H_full)->n, LOWER );
 
-    memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
-    memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) );
-    memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) );
-    permute_matrix( H_p, LOWER );
-
-    return H_p;
+    memcpy( (*H_p)->start, H->start, sizeof(int) * (H->n + 1) );
+    memcpy( (*H_p)->j, H->j, sizeof(int) * (H->start[H->n]) );
+    memcpy( (*H_p)->val, H->val, sizeof(real) * (H->start[H->n]) );
+    permute_matrix( workspace, (*H_p), LOWER );
 }
 
 
@@ -2749,36 +2667,31 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
  * triangular factors in iterative linear solvers
  *
  * Note: Newmann series arises from series expansion of the inverse of
- * the coefficient matrix in the triangular system */
-void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
-                  const real * const b, real * const x, const TRIANGULARITY tri, const
-                  unsigned int maxiter )
+ * the coefficient matrix in the triangular system
+ *
+ * workspace: storage container for workspace structures
+ * R:
+ * Dinv:
+ * b:
+ * x (output):
+ * tri:
+ * maxiter:
+ * */
+void jacobi_iter( const static_storage * const workspace,
+        const sparse_matrix * const R, const real * const Dinv,
+        const real * const b, real * const x, const TRIANGULARITY tri,
+        const unsigned int maxiter )
 {
     unsigned int i, k, si, ei, iter;
+    real *p1, *p2, *p3;
 
     si = 0;
     ei = 0;
     iter = 0;
+    p1 = workspace->rp;
+    p2 = workspace->rp2;
 
-#ifdef _OPENMP
-    #pragma omp single
-#endif
-    {
-        if ( Dinv_b == NULL )
-        {
-            Dinv_b = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::Dinv_b" );
-        }
-        if ( rp == NULL )
-        {
-            rp = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp" );
-        }
-        if ( rp2 == NULL )
-        {
-            rp2 = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp2" );
-        }
-    }
-
-    Vector_MakeZero( rp, R->n );
+    Vector_MakeZero( p1, R->n );
 
     /* precompute and cache, as invariant in loop below */
 #ifdef _OPENMP
@@ -2786,12 +2699,12 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
 #endif
     for ( i = 0; i < R->n; ++i )
     {
-        Dinv_b[i] = Dinv[i] * b[i];
+        workspace->Dinv_b[i] = Dinv[i] * b[i];
     }
 
     do
     {
-        // x_{k+1} = G*x_{k} + Dinv*b;
+        /* x_{k+1} = G*x_{k} + Dinv*b */
 #ifdef _OPENMP
         #pragma omp for schedule(guided)
 #endif
@@ -2809,31 +2722,26 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
                 ei = R->start[i + 1];
             }
 
-            rp2[i] = 0.;
+            p2[i] = 0.;
 
             for ( k = si; k < ei; ++k )
             {
-                rp2[i] += R->val[k] * rp[R->j[k]];
+                p2[i] += R->val[k] * p1[R->j[k]];
             }
 
-            rp2[i] *= -Dinv[i];
-            rp2[i] += Dinv_b[i];
+            p2[i] *= -Dinv[i];
+            p2[i] += workspace->Dinv_b[i];
         }
 
-#ifdef _OPENMP
-        #pragma omp single
-#endif
-        {
-            rp3 = rp;
-            rp = rp2;
-            rp2 = rp3;
-        }
+        p3 = p1;
+        p1 = p2;
+        p2 = p3;
 
         ++iter;
     }
     while ( iter < maxiter );
 
-    Vector_Copy( x, rp, R->n );
+    Vector_Copy( x, p1, R->n );
 }
 
 
@@ -2901,7 +2809,8 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ICHOLT_PC:
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
-                    tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
+                    tri_solve_level_sched( (static_storage *) workspace,
+                            workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
                     break;
                 case SAI_PC:
                     Sparse_MatVec_full( workspace->H_app_inv, y, x );
@@ -2924,15 +2833,17 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
 #ifdef _OPENMP
-                    #pragma omp single
+                    #pragma omp for schedule(static)
 #endif
-                {
-                    memcpy( y_p, y, sizeof(real) * workspace->H->n );
-                }
+                    for ( i = 0; i < workspace->H->n; ++i )
+                    {
+                        workspace->y_p[i] = y[i];
+                    }
 
-                permute_vector( y_p, workspace->H->n, FALSE, LOWER );
-                tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre );
-                break;
+                    permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+                    tri_solve_level_sched( (static_storage *) workspace,
+                            workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -2950,18 +2861,7 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ICHOLT_PC:
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
-#ifdef _OPENMP
-                    #pragma omp single
-#endif
-                    {
-                        if ( Dinv_L == NULL )
-                        {
-                            Dinv_L = (real*) smalloc( sizeof(real) * workspace->L->n,
-                                                      "apply_preconditioner::Dinv_L" );
-                        }
-                    }
-
-                        /* construct D^{-1}_L */
+                    /* construct D^{-1}_L */
                     if ( fresh_pre == TRUE )
                     {
 #ifdef _OPENMP
@@ -2970,12 +2870,13 @@ static void apply_preconditioner( const static_storage * const workspace, const
                         for ( i = 0; i < workspace->L->n; ++i )
                         {
                             si = workspace->L->start[i + 1] - 1;
-                            Dinv_L[i] = 1.0 / workspace->L->val[si];
+                            workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
                         }
                     }
 
-                    jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
-                break;
+                    jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
+                            y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -3027,7 +2928,8 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ICHOLT_PC:
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
-                    tri_solve_level_sched( workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                    tri_solve_level_sched( (static_storage *) workspace,
+                            workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
                     break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
@@ -3046,9 +2948,10 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ICHOLT_PC:
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
-                tri_solve_level_sched( workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
-                permute_vector( x, workspace->H->n, TRUE, UPPER );
-                break;
+                    tri_solve_level_sched( (static_storage *) workspace,
+                            workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                    permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -3066,32 +2969,22 @@ static void apply_preconditioner( const static_storage * const workspace, const
                 case ICHOLT_PC:
                 case ILU_PAR_PC:
                 case ILUT_PAR_PC:
-#ifdef _OPENMP
-                #pragma omp single
-#endif
-                {
-                    if ( Dinv_U == NULL )
-                    {
-                        Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n,
-                                                  "apply_preconditioner::Dinv_U" );
-                    }
-                }
-
                     /* construct D^{-1}_U */
-                if ( fresh_pre == TRUE )
-                {
+                    if ( fresh_pre == TRUE )
+                    {
 #ifdef _OPENMP
-                    #pragma omp for schedule(static)
+                        #pragma omp for schedule(static)
 #endif
-                    for ( i = 0; i < workspace->U->n; ++i )
-                    {
-                        si = workspace->U->start[i];
-                        Dinv_U[i] = 1. / workspace->U->val[si];
+                        for ( i = 0; i < workspace->U->n; ++i )
+                        {
+                            si = workspace->U->start[i];
+                            workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
+                        }
                     }
-                }
 
-                jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
-                break;
+                    jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
+                            y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+                    break;
                 default:
                     fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
                     exit( INVALID_INPUT );
@@ -3152,7 +3045,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
         {
             /* calculate r0 */
             t_start = Get_Time( );
-            Sparse_MatVec( H, x, workspace->b_prm );
+            Sparse_MatVec( workspace, H, x, workspace->b_prm );
             t_spmv += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
@@ -3182,7 +3075,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
             {
                 /* matvec */
                 t_start = Get_Time( );
-                Sparse_MatVec( H, workspace->v[j], workspace->b_prc );
+                Sparse_MatVec( workspace, H, workspace->v[j], workspace->b_prc );
                 t_spmv += Get_Timing_Info( t_start );
 
                 t_start = Get_Time( );
@@ -3300,27 +3193,19 @@ int GMRES( const static_storage * const workspace, const control_params * const
         }
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_orthog += t_ortho / control->num_threads;
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_orthog += t_ortho;
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_tri_solve += t_ts;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters )
     {
         fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
-        return g_itr * control->cm_solver_restart + g_j;
+        return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
     }
 
-    return g_itr * control->cm_solver_restart + g_j + 1;
+    return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
 }
 
 
@@ -3368,7 +3253,7 @@ int GMRES_HouseHolder( const static_storage * const workspace,
         {
             /* compute z = r0 */
             t_start = Get_Time( );
-            Sparse_MatVec( H, x, workspace->b_prm );
+            Sparse_MatVec( workspace, H, x, workspace->b_prm );
             t_spmv += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
@@ -3409,7 +3294,7 @@ int GMRES_HouseHolder( const static_storage * const workspace,
 
                 /* matvec */
                 t_start = Get_Time( );
-                Sparse_MatVec( H, z[j], workspace->b_prc );
+                Sparse_MatVec( workspace, H, z[j], workspace->b_prc );
                 t_spmv += Get_Timing_Info( t_start );
 
                 t_start = Get_Time( );
@@ -3546,19 +3431,11 @@ int GMRES_HouseHolder( const static_storage * const workspace,
         }
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_orthog += t_ortho / control->num_threads;
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_orthog += t_ortho;
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_tri_solve += t_ts;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters )
     {
@@ -3606,7 +3483,7 @@ int CG( const static_storage * const workspace, const control_params * const con
         t_vops += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
-        Sparse_MatVec( H, x, d );
+        Sparse_MatVec( workspace, H, x, d );
         t_spmv += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
@@ -3627,7 +3504,7 @@ int CG( const static_storage * const workspace, const control_params * const con
         for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
         {
             t_start = Get_Time( );
-            Sparse_MatVec( H, p, d );
+            Sparse_MatVec( workspace, H, p, d );
             t_spmv += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
@@ -3657,15 +3534,9 @@ int CG( const static_storage * const workspace, const control_params * const con
         g_itr = i;
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters )
     {
@@ -3708,7 +3579,7 @@ int SDM( const static_storage * const workspace, const control_params * const co
         t_vops += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
-        Sparse_MatVec( H, x, workspace->q );
+        Sparse_MatVec( workspace, H, x, workspace->q );
         t_spmv += Get_Timing_Info( t_start );
 
         t_start = Get_Time( );
@@ -3727,7 +3598,7 @@ int SDM( const static_storage * const workspace, const control_params * const co
         for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i )
         {
             t_start = Get_Time( );
-            Sparse_MatVec( H, workspace->d, workspace->q );
+            Sparse_MatVec( workspace, H, workspace->d, workspace->q );
             t_spmv += Get_Timing_Info( t_start );
 
             t_start = Get_Time( );
@@ -3760,15 +3631,9 @@ int SDM( const static_storage * const workspace, const control_params * const co
         g_itr = i;
     }
 
-#ifdef _OPENMP
     data->timing.cm_solver_pre_app += t_pa / control->num_threads;
     data->timing.cm_solver_spmv += t_spmv / control->num_threads;
     data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
-#else
-    data->timing.cm_solver_pre_app += t_pa;
-    data->timing.cm_solver_spmv += t_spmv;
-    data->timing.cm_solver_vector_ops += t_vops;
-#endif
 
     if ( g_itr >= control->cm_solver_max_iters  )
     {
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 814701fe06c81a4fcd33530f9a16f70c12abdf5d..ab0237c14281eb31ddabcd41087b00a475d0ccd0 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -73,15 +73,19 @@ void Transpose_I( sparse_matrix * const );
 void tri_solve( const sparse_matrix * const, const real * const,
         real * const, const int, const TRIANGULARITY );
 
-void tri_solve_level_sched( const sparse_matrix * const,
+void tri_solve_level_sched( static_storage *,
+        const sparse_matrix * const,
         const real * const, real * const, const int,
         const TRIANGULARITY, int );
 
-void jacobi_iter( const sparse_matrix * const, const real * const,
+void jacobi_iter( const static_storage * const,
+        const sparse_matrix * const, const real * const,
         const real * const, real * const, const TRIANGULARITY,
         const unsigned int );
 
-sparse_matrix * setup_graph_coloring( sparse_matrix * const );
+void setup_graph_coloring( const control_params * const,
+        const static_storage * const, const sparse_matrix * const,
+        sparse_matrix **, sparse_matrix ** );
 
 int GMRES( const static_storage * const, const control_params * const,
         simulation_data * const, const sparse_matrix * const,
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index e46d217c766685ed6c6d6153f83e2679e100b6b0..61de424a6363a79a6ce51f993ebade2ca7d7abbd 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -617,9 +617,7 @@ typedef struct
     int num_ignored;
     int ignore[MAX_ATOM_TYPES];
 
-#ifdef _OPENMP
     int num_threads;
-#endif
 } control_params;
 
 
@@ -947,6 +945,7 @@ typedef struct
     sparse_matrix *H;
     sparse_matrix *H_full;
     sparse_matrix *H_sp;
+    sparse_matrix *H_p;
     sparse_matrix *H_spar_patt;
     sparse_matrix *H_spar_patt_full;
     sparse_matrix *H_app_inv;
@@ -971,12 +970,52 @@ typedef struct
     real **h;
     real **rn;
     real **v;
+
     /* CG related storage */
     real *r;
     real *d;
     real *q;
     real *p;
 
+    /* SpMV related storage */
+#ifdef _OPENMP
+    real *b_local;
+#endif
+
+    /* Level scheduling related storage for applying, e.g. ICHOLT and ILU(T),
+     * preconditioners */
+    int levels_L;
+    int levels_U;
+    unsigned int *row_levels_L;
+    unsigned int *level_rows_L;
+    unsigned int *level_rows_cnt_L;
+    unsigned int *row_levels_U;
+    unsigned int *level_rows_U;
+    unsigned int *level_rows_cnt_U;
+    unsigned int *top;
+
+    /* Graph coloring related storage for applying, e.g. ICHOLT and ILU(T),
+     * preconditioners */
+    unsigned int *color;
+    unsigned int *to_color;
+    unsigned int *conflict;
+    unsigned int *conflict_cnt;
+    unsigned int *recolor;
+    unsigned int recolor_cnt;
+    unsigned int *color_top;
+    unsigned int *permuted_row_col;
+    unsigned int *permuted_row_col_inv;
+    real *y_p;
+    real *x_p;
+
+    /* Jacobi iteration related storage for applying, e.g. ICHOLT and ILU(T),
+     * preconditioners */
+    real *Dinv_L;
+    real *Dinv_U;
+    real *Dinv_b;
+    real *rp;
+    real *rp2;
+
     int num_H;
     int *hbond_index; // for hydrogen bonds
 
diff --git a/sPuReMD/src/neighbors.c b/sPuReMD/src/neighbors.c
index c0351c857bc4709d729311922f3ecbef8192675a..b63a7fca09a8249c2ab2cab71ffc7741f9df9200 100644
--- a/sPuReMD/src/neighbors.c
+++ b/sPuReMD/src/neighbors.c
@@ -71,9 +71,11 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     t_start = Get_Time( );
     // fprintf( stderr, "\n\tentered nbrs - " );
     g = &( system->g );
-    far_nbrs = (*lists) + FAR_NBRS;
+    far_nbrs = &(*lists)[FAR_NBRS];
+
     Bin_Atoms( system, workspace );
     // fprintf( stderr, "atoms sorted - " );
+
     num_far = 0;
 
     /* first pick up a cell in the grid */
@@ -365,25 +367,29 @@ static inline int is_Near_Neighbor( reax_list *near_nbrs, int atom1, int atom2 )
     return TRUE;
 }
 
+
 void Generate_Neighbor_Lists( reax_system *system, control_params *control,
                               simulation_data *data, static_storage *workspace,
                               reax_list **lists, output_controls *out_control )
 {
-    int  i, j, k;
-    int  x, y, z;
-    int  *nbr_atoms;
-    int  atom1, atom2, max;
-    int   num_far;
-    int   c, count;
-    int   grid_top;
-    grid *g = &( system->g );
-    reax_list *far_nbrs = (*lists) + FAR_NBRS;
+    int i, j, k;
+    int x, y, z;
+    int *nbr_atoms;
+    int atom1, atom2, max;
+    int num_far;
+    int c, count;
+    int grid_top;
+    grid *g;
+    reax_list *far_nbrs;
     //int   hb_type1, hb_type2;
-    //reax_list *hbonds = (*lists) + HBOND;
+    //reax_list *hbonds = &(*lists)[HBOND];
     //int   top_hbond1, top_hbond2;
     get_far_neighbors_function Get_Far_Neighbors;
     far_neighbor_data new_nbrs[125];
 
+    g = &( system->g );
+    far_nbrs = &(*lists)[FAR_NBRS];
+
     // fprintf( stderr, "\n\tentered nbrs - " );
     if ( control->ensemble == iNPT || control->ensemble == sNPT ||
             control->ensemble == NPT )
@@ -632,7 +638,7 @@ void Generate_Neighbor_Lists( reax_system *system, control_params *control,
     far_neighbor_data new_nbrs[125];
 
     g = &( system->g );
-    far_nbrs = (*lists) + FAR_NBRS;
+    far_nbrs = &(*lists)[FAR_NBRS];
 
     // fprintf( stderr, "\n\tentered nbrs - " );
     if ( control->ensemble == iNPT ||
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 30c071251bdfcfb9a55f147dbeab16b50994132b..8c3ea3095354baddad4c763e69357de35132c5a6 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -42,8 +42,8 @@ void Print_Bond_Orders( reax_system *system, control_params *control,
 {
     int  i, pj, pk;
     bond_order_data *bo_ij;
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *dBOs  = (*lists) + DBO;
+    reax_list *bonds = &(*lists)[BONDS];
+    reax_list *dBOs  = &(*lists)[DBO];
     dbond_data *dbo_k;
 
     /* bond orders */
@@ -473,7 +473,7 @@ void Print_Far_Neighbors( reax_system *system, control_params *control,
     int i, j, id_i, id_j;
     char fname[MAX_STR];
     FILE *fout;
-    reax_list *far_nbrs = &((*lists)[FAR_NBRS]);
+    reax_list *far_nbrs = &(*lists)[FAR_NBRS];
 
     snprintf( fname, MAX_STR, "%.*s.far_nbrs", MAX_STR - 10, control->sim_name );
     fout = fopen( fname, "w" );
@@ -518,7 +518,7 @@ void Print_Far_Neighbors2( reax_system *system, control_params *control,
     int i, j, id_i, id_j;
     char fname[MAX_STR];
     FILE *fout;
-    reax_list *far_nbrs = &((*lists)[FAR_NBRS]);
+    reax_list *far_nbrs = &(*lists)[FAR_NBRS];
 
     snprintf( fname, MAX_STR, "%.*s.far_nbrs_lgj", MAX_STR - 14, control->sim_name );
     fout = fopen( fname, "w" );
@@ -720,7 +720,7 @@ void Output_Results( reax_system *system, control_params *control,
         out_control->append_traj_frame( system, control, data,
                 workspace, lists, out_control );
 
-        //Write_PDB( system, *lists+BONDS, data, control, workspace, out_control );
+        //Write_PDB( system, &(*lists)[BONDS], data, control, workspace, out_control );
         //t_elapsed = Get_Timing_Info( t_start );
         //fprintf(stdout, "append_frame took %.6f seconds\n", t_elapsed );
     }
diff --git a/sPuReMD/src/reset_utils.c b/sPuReMD/src/reset_utils.c
index 0aed2a953efe94d904eccc3d17c8725ee214f96e..3edfe9072c0b635f7d3cfb97e555dd37d19d4a37 100644
--- a/sPuReMD/src/reset_utils.c
+++ b/sPuReMD/src/reset_utils.c
@@ -118,8 +118,11 @@ void Reset_Neighbor_Lists( reax_system *system, control_params *control,
                            static_storage *workspace, reax_list **lists )
 {
     int i, tmp;
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *hbonds = (*lists) + HBONDS;
+    reax_list *bonds;
+    reax_list *hbonds;
+
+    bonds = &(*lists)[BONDS];
+    hbonds = &(*lists)[HBONDS];
 
     for ( i = 0; i < system->N; ++i )
     {
diff --git a/sPuReMD/src/single_body_interactions.c b/sPuReMD/src/single_body_interactions.c
index b6edce1db6b7becfd311334fec83568625276261..4a9ff3102d5d86e33db27d71c19ed101a465c54f 100644
--- a/sPuReMD/src/single_body_interactions.c
+++ b/sPuReMD/src/single_body_interactions.c
@@ -20,6 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "single_body_interactions.h"
+
 #include "bond_orders.h"
 #include "list.h"
 #include "lookup.h"
@@ -45,8 +46,9 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system, control_params
     two_body_parameters *twbp;
     bond_data *pbond;
     bond_order_data *bo_ij;
-    reax_list *bonds = (*lists) + BONDS;
+    reax_list *bonds;
 
+    bonds = &(*lists)[BONDS];
     /* Initialize parameters */
     p_lp1 = system->reaxprm.gp.l[15];
     p_lp3 = system->reaxprm.gp.l[5];
diff --git a/sPuReMD/src/spuremd.h b/sPuReMD/src/spuremd.h
index ed2b98d3cc21ab0a68d5195d4e33ffe422685dfd..3a78242bd768e2674c727ac882f74b69b2a5e336 100644
--- a/sPuReMD/src/spuremd.h
+++ b/sPuReMD/src/spuremd.h
@@ -29,6 +29,10 @@
 #include "mytypes.h"
 
 
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void* setup( const char * const, const char * const,
         const char * const );
 
@@ -40,5 +44,9 @@ int cleanup( const void * const );
 
 reax_atom* get_atoms( const void * const );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/sPuReMD/src/three_body_interactions.c b/sPuReMD/src/three_body_interactions.c
index 159e0cc17f354c94172c01729a17f473f941b51b..485be5fdf934f5e0d113a35569a44ed44a5fb14f 100644
--- a/sPuReMD/src/three_body_interactions.c
+++ b/sPuReMD/src/three_body_interactions.c
@@ -93,9 +93,9 @@ void Three_Body_Interactions( reax_system *system, control_params *control,
     real e_ang_total, e_pen_total, e_coa_total;
 
     total_bo = workspace->total_bond_order;
-    bonds = (*lists) + BONDS;
+    bonds = &(*lists)[BONDS];
     bond_list = bonds->select.bond_list;
-    thb_intrs = (*lists) + THREE_BODIES;
+    thb_intrs = &(*lists)[THREE_BODIES];
     thb_list = thb_intrs->select.three_body_list;
     /* global parameters used in these calculations */
     p_pen2 = system->reaxprm.gp.l[19];
@@ -705,9 +705,9 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
 #ifdef TEST_FORCES
         num_hb_intrs = 0;
 #endif
-        bonds = (*lists) + BONDS;
+        bonds = &(*lists)[BONDS];
         bond_list = bonds->select.bond_list;
-        hbonds = (*lists) + HBONDS;
+        hbonds = &(*lists)[HBONDS];
         hbond_list = hbonds->select.hbond_list;
 
         /* loops below discover the Hydrogen bonds between i-j-k triplets.
diff --git a/sPuReMD/src/traj.c b/sPuReMD/src/traj.c
index 7972d0868c47fc992f909d9be457c3d454743e44..03cc5d74e1a644fdc10d84e8c6b57d9331c1a644 100644
--- a/sPuReMD/src/traj.c
+++ b/sPuReMD/src/traj.c
@@ -178,8 +178,8 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     int frame_globals_len, num_bonds, num_thb_intrs;
     real P;
     char buffer[SIZE];
-    reax_list *bonds = (*lists) + BONDS;
-    reax_list *thb_intrs =  (*lists) + THREE_BODIES;
+    reax_list *bonds = &(*lists)[BONDS];
+    reax_list *thb_intrs = &(*lists)[THREE_BODIES];
     bond_data *bo_ij;
 
     /* IMPORTANT: This whole part will go to init_trj after finalized! */
diff --git a/sPuReMD/src/two_body_interactions.c b/sPuReMD/src/two_body_interactions.c
index 664d7c0f3088d1b11beceb6e6a2647c49fb46cb6..d83db947b82936725eb37ad9ac9deb83029abbed 100644
--- a/sPuReMD/src/two_body_interactions.c
+++ b/sPuReMD/src/two_body_interactions.c
@@ -35,7 +35,7 @@ void Bond_Energy( reax_system *system, control_params *control,
     real gp3, gp4, gp7, gp10, gp37, ebond_total;
     reax_list *bonds;
 
-    bonds = (*lists) + BONDS;
+    bonds = &(*lists)[BONDS];
     gp3 = system->reaxprm.gp.l[3];
     gp4 = system->reaxprm.gp.l[4];
     gp7 = system->reaxprm.gp.l[7];
@@ -177,7 +177,7 @@ void vdW_Coulomb_Energy( reax_system *system, control_params *control,
 
     p_vdW1 = system->reaxprm.gp.l[28];
     p_vdW1i = 1.0 / p_vdW1;
-    far_nbrs = (*lists) + FAR_NBRS;
+    far_nbrs = &(*lists)[FAR_NBRS];
     e_vdW_total = 0.0;
     e_ele_total = 0.0;
 
@@ -527,7 +527,7 @@ void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
     reax_list *far_nbrs;
     real e_vdW_total, e_ele_total;
 
-    far_nbrs = (*lists) + FAR_NBRS;
+    far_nbrs = &(*lists)[FAR_NBRS];
     steps = data->step - data->prev_steps;
     update_freq = out_control->energy_update_freq;
     update_energies = update_freq > 0 && steps % update_freq == 0;
diff --git a/sPuReMD/tests/test_lin_alg.cpp b/sPuReMD/tests/test_lin_alg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a475f8c3dacb419d63b95983e3078b3eb89022e7
--- /dev/null
+++ b/sPuReMD/tests/test_lin_alg.cpp
@@ -0,0 +1,92 @@
+#include <gtest/gtest.h>
+
+#include "mytypes.h"
+#include "allocate.c"
+#include "lin_alg.c"
+#include "list.c"
+#include "tool_box.c"
+#include "vector.c"
+
+#define VEC_SIZE (100)
+
+
+namespace
+{
+    class LinAlgTest : public ::testing::Test
+    {
+        protected:
+            real *a, *res;
+            sparse_matrix *sm;
+
+            LinAlgTest( )
+            {
+                if ( !Allocate_Matrix( &sm, VEC_SIZE, VEC_SIZE ) ||
+                        (a = (real *) malloc( VEC_SIZE * sizeof(real))) == NULL ||
+                        (res = (real *) malloc( VEC_SIZE * sizeof(real))) == NULL )
+                {
+                    throw new std::bad_alloc( );
+                }
+            }
+
+            virtual ~LinAlgTest( )
+            {
+                if ( a != NULL )
+                {
+                    free( a );
+                }
+                if ( res != NULL )
+                {
+                    free( res );
+                }
+                if ( sm != NULL )
+                {
+                    Deallocate_Matrix( sm );
+                }
+
+            }
+
+            virtual void SetUp( )
+            {
+                for ( int i = 0; i < VEC_SIZE; ++i )
+                {
+                    a[i] = 1.0;
+                }
+
+                // set up sparse matrix which is an identity matrix in our case
+                for ( int i = 0; i < sm->n + 1; i++ )
+                {
+                    sm->start[i] = i;
+                }
+
+                for ( int i = 0; i < sm->start[sm->n]; i++ )
+                {
+                    sm->j[i] = i;
+                    sm->val[i] = 1.0;
+                }
+            }
+
+
+            virtual void TearDown( )
+            {
+
+            }
+    };
+
+
+    TEST_F(LinAlgTest, Sparse_MatVec)
+    {
+        Sparse_MatVec( sm, a, res );
+
+        for ( int i = 0; i < VEC_SIZE; ++i )
+        {
+            ASSERT_EQ( res[i], a[i] );
+        }
+    }
+}
+
+
+int main( int argc, char **argv )
+{
+    ::testing::InitGoogleTest( &argc, argv );
+    return RUN_ALL_TESTS( );
+}
diff --git a/sPuReMD/tests/test_spuremd.cpp b/sPuReMD/tests/test_spuremd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ac83d54eec527454cafc3c529e4fc79afc299f3
--- /dev/null
+++ b/sPuReMD/tests/test_spuremd.cpp
@@ -0,0 +1,64 @@
+#include <gtest/gtest.h>
+
+#include "spuremd.h"
+
+
+namespace
+{
+    class SPuReMDTest : public ::testing::Test
+    {
+        protected:
+            void *handle;
+
+            SPuReMDTest ( )
+            {
+            }
+
+            virtual ~SPuReMDTest ( )
+            {
+            }
+
+            virtual void SetUp( )
+            {
+            }
+
+            virtual void TearDown( )
+            {
+                if ( handle != NULL )
+                {
+                    cleanup( handle );
+                }
+            }
+    };
+
+
+    TEST_F(SPuReMDTest, water_6540)
+    {
+        handle = setup( "../data/benchmarks/water/water_6540.pdb", 
+                "../data/benchmarks/water/ffield.water",
+                "../environ/param.gpu.water" );
+
+        ASSERT_EQ( simulate( handle ), SPUREMD_SUCCESS );
+
+        //TODO: check energy after evolving system, e.g., 100 steps
+    }
+
+
+    TEST_F(SPuReMDTest, silica_6000)
+    {
+        handle = setup( "../data/benchmarks/silica/silica_6000.pdb", 
+                "../data/benchmarks/silica/ffield-bio",
+                "../environ/param.gpu.water" );
+
+        ASSERT_EQ( simulate( handle ), SPUREMD_SUCCESS );
+
+        //TODO: check energy after evolving system, e.g., 100 steps
+    }
+}
+
+
+int main( int argc, char **argv )
+{
+    ::testing::InitGoogleTest( &argc, argv );
+    return RUN_ALL_TESTS( );
+}