diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c
index 0080f73c7c28ee6452d37aad5b92ac8f6749b91b..61c5c8a702cacfc14123761756e81ab8717227aa 100644
--- a/sPuReMD/src/charges.c
+++ b/sPuReMD/src/charges.c
@@ -205,7 +205,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
             data->timing.cm_solver_pre_comp +=
-                Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt,
+                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
                         &workspace->H_app_inv );
 #else
             fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
@@ -259,13 +259,9 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 //
 //    if ( ones == NULL )
 //    {
-//        if ( ( ones = (real*) malloc( system->N * sizeof(real)) ) == NULL ||
-//            ( x = (real*) malloc( system->N * sizeof(real)) ) == NULL ||
-//            ( y = (real*) malloc( system->N * sizeof(real)) ) == NULL )
-//        {
-//            fprintf( stderr, "Not enough space for preconditioner computation. Terminating...\n" );
-//            exit( INSUFFICIENT_MEMORY );
-//        }
+//        ones = (real*) smalloc( system->N * sizeof(real), "Compute_Preconditioner_EE::ones" );
+//        x = (real*) smalloc( system->N * sizeof(real), "Compute_Preconditioner_EE::x" );
+//        y = (real*) smalloc( system->N * sizeof(real), "Compute_Preconditioner_EE::y" );
 //
 //        for ( i = 0; i < system->N; ++i )
 //        {
@@ -568,7 +564,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
             data->timing.cm_solver_pre_comp +=
-                Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt,
+                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
                         &workspace->H_app_inv );
 #else
             fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
@@ -686,7 +682,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
             data->timing.cm_solver_pre_comp +=
-                Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt,
+                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
                         &workspace->H_app_inv );
 #else
             fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
@@ -764,11 +760,8 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-                if ( ( workspace->Hdia_inv = (real *) calloc( Hptr->n, sizeof( real ) ) ) == NULL )
-                {
-                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
+                workspace->Hdia_inv = (real *) scalloc( Hptr->n, sizeof( real ),
+                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -861,8 +854,9 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
             break;
 
         case SAI_PC:
-            Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres,
-                    &workspace->H_spar_patt );
+            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
+                    control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -916,11 +910,8 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-                if ( ( workspace->Hdia_inv = (real *) calloc( system->N_cm, sizeof( real ) ) ) == NULL )
-                {
-                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
+                workspace->Hdia_inv = (real *) scalloc( system->N_cm, sizeof( real ),
+                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -1013,8 +1004,9 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
             break;
 
         case SAI_PC:
-            Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres,
-                    &workspace->H_spar_patt );
+            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
+                    control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -1071,11 +1063,8 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-                if ( ( workspace->Hdia_inv = (real *) calloc( Hptr->n, sizeof( real ) ) ) == NULL )
-                {
-                    fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
+                workspace->Hdia_inv = (real *) scalloc( Hptr->n, sizeof( real ),
+                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -1167,8 +1156,9 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
             break;
 
         case SAI_PC:
-            Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres,
-                    &workspace->H_spar_patt );
+            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
+                    control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -1279,18 +1269,18 @@ static void QEq( reax_system * const system, control_params * const control,
         break;
 
     case CG_S:
-        iters = CG( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+        iters = CG( workspace, control, data, workspace->H, workspace->b_s, control->cm_solver_q_err,
                 workspace->s[0], (control->cm_solver_pre_comp_refactor > 0 &&
                  (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE ) + 1;
-        iters += CG( workspace, control, workspace->H, workspace->b_t, control->cm_solver_q_err,
+        iters += CG( workspace, control, data, workspace->H, workspace->b_t, control->cm_solver_q_err,
                 workspace->t[0], FALSE ) + 1;
         break;
 
     case SDM_S:
-        iters = SDM( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+        iters = SDM( workspace, control, data, workspace->H, workspace->b_s, control->cm_solver_q_err,
                 workspace->s[0], (control->cm_solver_pre_comp_refactor > 0 &&
                  (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE ) + 1;
-        iters += SDM( workspace,control,  workspace->H, workspace->b_t, control->cm_solver_q_err,
+        iters += SDM( workspace, control, data, workspace->H, workspace->b_t, control->cm_solver_q_err,
                       workspace->t[0], FALSE ) + 1;
         break;
 
@@ -1363,13 +1353,13 @@ static void EE( reax_system * const system, control_params * const control,
         break;
 
     case CG_S:
-        iters = CG( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+        iters = CG( workspace, control, data, workspace->H, workspace->b_s, control->cm_solver_q_err,
                 workspace->s[0], (control->cm_solver_pre_comp_refactor > 0 &&
                  (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE ) + 1;
         break;
 
     case SDM_S:
-        iters = SDM( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+        iters = SDM( workspace, control, data, workspace->H, workspace->b_s, control->cm_solver_q_err,
                 workspace->s[0], (control->cm_solver_pre_comp_refactor > 0 &&
                  (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE ) + 1;
         break;
@@ -1437,13 +1427,13 @@ static void ACKS2( reax_system * const system, control_params * const control,
         break;
 
     case CG_S:
-        iters = CG( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+        iters = CG( workspace, control, data, workspace->H, workspace->b_s, control->cm_solver_q_err,
                 workspace->s[0], (control->cm_solver_pre_comp_refactor > 0 &&
                  (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE ) + 1;
         break;
 
     case SDM_S:
-        iters = SDM( workspace, control, workspace->H, workspace->b_s, control->cm_solver_q_err,
+        iters = SDM( workspace, control, data, workspace->H, workspace->b_s, control->cm_solver_q_err,
                 workspace->s[0], (control->cm_solver_pre_comp_refactor > 0 &&
                  (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE ) + 1;
         break;
diff --git a/sPuReMD/src/ffield.c b/sPuReMD/src/ffield.c
index 56d8a310439b94aaace6c11c2ab55d3f64bc09cd..7aab5c5235d7e6e221c91f9d90c66949ec861551 100644
--- a/sPuReMD/src/ffield.c
+++ b/sPuReMD/src/ffield.c
@@ -33,11 +33,14 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     int i, j, k, l, m, n, o, p, cnt;
     real val;
 
-    s = (char*) malloc( sizeof(char) * MAX_LINE );
-    tmp = (char**) malloc( sizeof(char*) * MAX_TOKENS );
+    s = (char*) smalloc( sizeof(char) * MAX_LINE,
+           "Read_Force_Field::s" );
+    tmp = (char**) smalloc( sizeof(char*) * MAX_TOKENS,
+           "Read_Force_Field::tmp" );
     for (i = 0; i < MAX_TOKENS; i++)
     {
-        tmp[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
+        tmp[i] = (char*) smalloc( sizeof(char) * MAX_TOKEN_LEN,
+               "Read_Force_Field::tmp[i]" );
     }
 
     /* reading first header comment */
@@ -57,7 +60,8 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
     }
 
     reax->gp.n_global = n;
-    reax->gp.l = (real*) malloc( sizeof(real) * n );
+    reax->gp.l = (real*) smalloc( sizeof(real) * n,
+           "Read_Force_Field::reax->gp-l" );
 
     /* see mytypes.h for mapping between l[i] and the lambdas used in ff */
     for (i = 0; i < n; i++)
@@ -85,48 +89,65 @@ char Read_Force_Field( FILE* fp, reax_interaction* reax )
 
     /* Allocating structures in reax_interaction */
     reax->sbp = (single_body_parameters*)
-                calloc( reax->num_atom_types, sizeof(single_body_parameters) );
+                scalloc( reax->num_atom_types, sizeof(single_body_parameters),
+                        "Read_Force_Field::reax->sbp" );
     reax->tbp = (two_body_parameters**)
-                calloc( reax->num_atom_types, sizeof(two_body_parameters*) );
+                scalloc( reax->num_atom_types, sizeof(two_body_parameters*),
+                        "Read_Force_Field::reax->tbp" );
     reax->thbp = (three_body_header***)
-                 calloc( reax->num_atom_types, sizeof(three_body_header**) );
+                 scalloc( reax->num_atom_types, sizeof(three_body_header**),
+                         "Read_Force_Field::reax->thbp" );
     reax->hbp = (hbond_parameters***)
-                calloc( reax->num_atom_types, sizeof(hbond_parameters**) );
+                scalloc( reax->num_atom_types, sizeof(hbond_parameters**),
+                        "Read_Force_Field::reax->hbp" );
     reax->fbp = (four_body_header****)
-                calloc( reax->num_atom_types, sizeof(four_body_header***) );
+                scalloc( reax->num_atom_types, sizeof(four_body_header***),
+                        "Read_Force_Field::reax->fbp" );
     tor_flag  = (char****)
-                calloc( reax->num_atom_types, sizeof(char***) );
+                scalloc( reax->num_atom_types, sizeof(char***),
+                        "Read_Force_Field::tor_flag" );
 
     for ( i = 0; i < reax->num_atom_types; i++ )
     {
         reax->tbp[i] = (two_body_parameters*)
-                       calloc( reax->num_atom_types, sizeof(two_body_parameters) );
+                       scalloc( reax->num_atom_types, sizeof(two_body_parameters),
+                               "Read_Force_Field::reax->tbp[i]" );
         reax->thbp[i] = (three_body_header**)
-                        calloc( reax->num_atom_types, sizeof(three_body_header*) );
+                        scalloc( reax->num_atom_types, sizeof(three_body_header*),
+                                "Read_Force_Field::reax->thbp[i]" );
         reax->hbp[i] = (hbond_parameters**)
-                       calloc( reax->num_atom_types, sizeof(hbond_parameters*) );
+                       scalloc( reax->num_atom_types, sizeof(hbond_parameters*),
+                               "Read_Force_Field::reax->hbp[i]" );
         reax->fbp[i] = (four_body_header***)
-                       calloc( reax->num_atom_types, sizeof(four_body_header**) );
+                       scalloc( reax->num_atom_types, sizeof(four_body_header**),
+                               "Read_Force_Field::reax->fbp[i]" );
         tor_flag[i]  = (char***)
-                       calloc( reax->num_atom_types, sizeof(char**) );
+                       scalloc( reax->num_atom_types, sizeof(char**),
+                               "Read_Force_Field::tor_flag[i]" );
 
         for ( j = 0; j < reax->num_atom_types; j++ )
         {
             reax->thbp[i][j] = (three_body_header*)
-                               calloc( reax->num_atom_types, sizeof(three_body_header) );
+                               scalloc( reax->num_atom_types, sizeof(three_body_header),
+                                       "Read_Force_Field::reax->thbp[i][j]" );
             reax->hbp[i][j] = (hbond_parameters*)
-                              calloc( reax->num_atom_types, sizeof(hbond_parameters) );
+                              scalloc( reax->num_atom_types, sizeof(hbond_parameters),
+                                      "Read_Force_Field::reax->hbp[i][j]" );
             reax->fbp[i][j] = (four_body_header**)
-                              calloc( reax->num_atom_types, sizeof(four_body_header*) );
+                              scalloc( reax->num_atom_types, sizeof(four_body_header*),
+                                      "Read_Force_Field::reax->fbp[i][j]" );
             tor_flag[i][j]  = (char**)
-                              calloc( reax->num_atom_types, sizeof(char*) );
+                              scalloc( reax->num_atom_types, sizeof(char*),
+                                      "Read_Force_Field::tor_flag[i][j]" );
 
             for (k = 0; k < reax->num_atom_types; k++)
             {
                 reax->fbp[i][j][k] = (four_body_header*)
-                                     calloc( reax->num_atom_types, sizeof(four_body_header) );
+                                     scalloc( reax->num_atom_types, sizeof(four_body_header),
+                                             "Read_Force_Field::reax->fbp[i][j][k]" );
                 tor_flag[i][j][k]  = (char*)
-                                     calloc( reax->num_atom_types, sizeof(char) );
+                                     scalloc( reax->num_atom_types, sizeof(char),
+                                             "Read_Force_Field::tor_flag[i][j][k]" );
             }
         }
     }
diff --git a/sPuReMD/src/forces.c b/sPuReMD/src/forces.c
index 10e31fdc625428ac3fd6e3df1a521418cdaed18f..4ad3069f78e7fa0158f92c7872eaad6fdc81a139 100644
--- a/sPuReMD/src/forces.c
+++ b/sPuReMD/src/forces.c
@@ -476,11 +476,8 @@ static void Init_Charge_Matrix_Remaining_Entries( reax_system *system,
             break;
 
         case ACKS2_CM:
-            if ( (X_diag = (real*) calloc(system->N, sizeof(real))) == NULL )
-            {
-                fprintf( stderr, "not enough memory for charge matrix. terminating.\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            X_diag = (real*) scalloc( system->N, sizeof(real),
+                    "Init_Charge_Matrix_Remaining_Entries::X_diag" );
 
             H->start[system->N] = *Htop;
             H_sp->start[system->N] = *H_sp_top;
diff --git a/sPuReMD/src/geo_tools.c b/sPuReMD/src/geo_tools.c
index fcde21be8b93a72299b3b7134978d05cef42d83c..6967564503dcf3baeb3a1a2862be0f365e3dd7a0 100644
--- a/sPuReMD/src/geo_tools.c
+++ b/sPuReMD/src/geo_tools.c
@@ -247,12 +247,7 @@ char Read_PDB( char* pdb_file, reax_system* system, control_params *control,
     }
 
     /* allocate memory for tokenizing pdb lines */
-    if ( Allocate_Tokenizer_Space( &s, &s1, &tmp ) == FAILURE )
-    {
-        fprintf( stderr, "Allocate_Tokenizer_Space: not enough memory!" );
-        fprintf( stderr, "terminating...\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
+    Allocate_Tokenizer_Space( &s, &s1, &tmp );
 
     /* read box information */
     if ( Read_Box_Info( system, pdb, PDB ) == FAILURE )
@@ -567,12 +562,16 @@ char Read_BGF( char* bgf_file, reax_system* system, control_params *control,
     }
 
     /* allocate memory for tokenizing biograf file lines */
-    line   = (char*)  malloc( sizeof(char)  * MAX_LINE );
-    backup = (char*)  malloc( sizeof(char)  * MAX_LINE );
-    tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS );
+    line = (char*) smalloc( sizeof(char)  * MAX_LINE,
+           "Read_BGF::line" );
+    backup = (char*) smalloc( sizeof(char)  * MAX_LINE,
+           "Read_BGF::backup" );
+    tokens = (char**) smalloc( sizeof(char*) * MAX_TOKENS,
+           "Read_BGF::tokens" );
     for ( i = 0; i < MAX_TOKENS; i++ )
     {
-        tokens[i] = (char*) malloc( sizeof(char) * MAX_TOKEN_LEN );
+        tokens[i] = (char*) smalloc( sizeof(char) * MAX_TOKEN_LEN,
+               "Read_BGF::tokens[i]" );
     }
 
     /* count number of atoms in the pdb file */
@@ -599,20 +598,26 @@ char Read_BGF( char* bgf_file, reax_system* system, control_params *control,
     fclose( bgf );
 
     /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+    system->atoms = (reax_atom*) scalloc( system->N, sizeof(reax_atom),
+            "Read_BGF::system->atoms" );
 
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
+    workspace->map_serials = (int*) scalloc( MAX_ATOM_ID, sizeof(int),
+            "Read_BGF::workspace->map_serials" );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
     {
         workspace->map_serials[i] = -1;
     }
 
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int**) calloc( system->N, sizeof(int*) );
+    workspace->orig_id = (int*) scalloc( system->N, sizeof(int),
+            "Read_BGF::workspace->orig_id" );
+    workspace->restricted  = (int*) scalloc( system->N, sizeof(int),
+            "Read_BGF::workspace->restricted" );
+    workspace->restricted_list = (int**) scalloc( system->N, sizeof(int*),
+            "Read_BGF::workspace->restricted_list" );
     for ( i = 0; i < system->N; ++i )
     {
-        workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
+        workspace->restricted_list[i] = (int*) scalloc( MAX_RESTRICT, sizeof(int),
+                "Read_BGF::workspace->restricted_list[i]" );
     }
 
     /* start reading and processing bgf file */
diff --git a/sPuReMD/src/grid.c b/sPuReMD/src/grid.c
index 3b7a45ce3c09cb7a7341a8dd78170a6fbf9da73d..9c04c3f287a5558fd82b230694aa5a6a9ef2e86a 100644
--- a/sPuReMD/src/grid.c
+++ b/sPuReMD/src/grid.c
@@ -76,33 +76,54 @@ void Allocate_Space_for_Grid( reax_system *system )
         * (2 * g->spread[1] + 1) * (2 * g->spread[2] + 1) + 3;
 
     /* allocate space for the new grid */
-    g->atoms = (int****) calloc( g->ncell[0], sizeof( int*** ));
-    g->top = (int***) calloc( g->ncell[0], sizeof( int** ));
-    g->mark = (int***) calloc( g->ncell[0], sizeof( int** ));
-    g->start = (int***) calloc( g->ncell[0], sizeof( int** ));
-    g->end = (int***) calloc( g->ncell[0], sizeof( int** ));
-    g->nbrs = (ivec****) calloc( g->ncell[0], sizeof( ivec*** ));
-    g->nbrs_cp = (rvec****) calloc( g->ncell[0], sizeof( rvec*** ));
+    g->atoms = (int****) scalloc( g->ncell[0], sizeof( int*** ),
+            "Allocate_Space_for_Grid::g->atoms" );
+    g->top = (int***) scalloc( g->ncell[0], sizeof( int** ),
+            "Allocate_Space_for_Grid::g->top" );
+    g->mark = (int***) scalloc( g->ncell[0], sizeof( int** ),
+            "Allocate_Space_for_Grid::g->mark" );
+    g->start = (int***) scalloc( g->ncell[0], sizeof( int** ),
+            "Allocate_Space_for_Grid::g->start" );
+    g->end = (int***) scalloc( g->ncell[0], sizeof( int** ),
+            "Allocate_Space_for_Grid::g->end" );
+    g->nbrs = (ivec****) scalloc( g->ncell[0], sizeof( ivec*** ),
+            "Allocate_Space_for_Grid::g->nbrs" );
+    g->nbrs_cp = (rvec****) scalloc( g->ncell[0], sizeof( rvec*** ),
+            "Allocate_Space_for_Grid::g->nbrs_cp" );
 
     for ( i = 0; i < g->ncell[0]; i++ )
     {
-        g->atoms[i] = (int***) calloc( g->ncell[1], sizeof( int** ));
-        g->top [i] = (int**) calloc( g->ncell[1], sizeof( int* ));
-        g->mark[i] = (int**) calloc( g->ncell[1], sizeof( int* ));
-        g->start[i] = (int**) calloc( g->ncell[1], sizeof( int* ));
-        g->end[i] = (int**) calloc( g->ncell[1], sizeof( int* ));
-        g->nbrs[i] = (ivec***) calloc( g->ncell[1], sizeof( ivec** ));
-        g->nbrs_cp[i] = (rvec***) calloc( g->ncell[1], sizeof( rvec** ));
+        g->atoms[i] = (int***) scalloc( g->ncell[1], sizeof( int** ),
+                "Allocate_Space_for_Grid::g->atoms[i]" );
+        g->top [i] = (int**) scalloc( g->ncell[1], sizeof( int* ),
+                "Allocate_Space_for_Grid::g->top[i]" );
+        g->mark[i] = (int**) scalloc( g->ncell[1], sizeof( int* ),
+                "Allocate_Space_for_Grid::g->mark[i]" );
+        g->start[i] = (int**) scalloc( g->ncell[1], sizeof( int* ),
+                "Allocate_Space_for_Grid::g->start[i]" );
+        g->end[i] = (int**) scalloc( g->ncell[1], sizeof( int* ),
+                "Allocate_Space_for_Grid::g->end[i]" );
+        g->nbrs[i] = (ivec***) scalloc( g->ncell[1], sizeof( ivec** ),
+                "Allocate_Space_for_Grid::g->nbrs[i]" );
+        g->nbrs_cp[i] = (rvec***) scalloc( g->ncell[1], sizeof( rvec** ),
+                "Allocate_Space_for_Grid::g->nbrs_cp[i]" );
 
         for ( j = 0; j < g->ncell[1]; j++ )
         {
-            g->atoms[i][j] = (int**) calloc( g->ncell[2], sizeof( int* ));
-            g->top[i][j] = (int*) calloc( g->ncell[2], sizeof( int ));
-            g->mark[i][j] = (int*) calloc( g->ncell[2], sizeof( int ));
-            g->start[i][j] = (int*) calloc( g->ncell[2], sizeof( int ));
-            g->end[i][j] = (int*) calloc( g->ncell[2], sizeof( int ));
-            g->nbrs[i][j] = (ivec**) calloc( g->ncell[2], sizeof( ivec* ));
-            g->nbrs_cp[i][j] = (rvec**) calloc( g->ncell[2], sizeof( rvec* ));
+            g->atoms[i][j] = (int**) scalloc( g->ncell[2], sizeof( int* ),
+                    "Allocate_Space_for_Grid::g->atoms[i][j]" );
+            g->top[i][j] = (int*) scalloc( g->ncell[2], sizeof( int ),
+                    "Allocate_Space_for_Grid::g->top[i][j]" );
+            g->mark[i][j] = (int*) scalloc( g->ncell[2], sizeof( int ),
+                    "Allocate_Space_for_Grid::g->mark[i][j]" );
+            g->start[i][j] = (int*) scalloc( g->ncell[2], sizeof( int ),
+                    "Allocate_Space_for_Grid::g->start[i][j]" );
+            g->end[i][j] = (int*) scalloc( g->ncell[2], sizeof( int ),
+                    "Allocate_Space_for_Grid::g->end[i][j]" );
+            g->nbrs[i][j] = (ivec**) scalloc( g->ncell[2], sizeof( ivec* ),
+                    "Allocate_Space_for_Grid::g->nbrs[i][j]" );
+            g->nbrs_cp[i][j] = (rvec**) scalloc( g->ncell[2], sizeof( rvec* ),
+                    "Allocate_Space_for_Grid::g->nbrs_cp[i][j]" );
 
             for ( k = 0; k < g->ncell[2]; k++ )
             {
@@ -110,8 +131,10 @@ void Allocate_Space_for_Grid( reax_system *system )
                 g->mark[i][j][k] = 0;
                 g->start[i][j][k] = 0;
                 g->end[i][j][k] = 0;
-                g->nbrs[i][j][k] = (ivec*) malloc( g->max_nbrs * sizeof( ivec ) );
-                g->nbrs_cp[i][j][k] = (rvec*) malloc( g->max_nbrs * sizeof( rvec ) );
+                g->nbrs[i][j][k] = (ivec*) smalloc( g->max_nbrs * sizeof( ivec ),
+                       "Allocate_Space_for_Grid::g->nbrs[i][j]][k]" );
+                g->nbrs_cp[i][j][k] = (rvec*) smalloc( g->max_nbrs * sizeof( rvec ),
+                       "Allocate_Space_for_Grid::g->nbrs_cp[i][j]][k]" );
 
                 for ( l = 0; l < g->max_nbrs; ++l )
                 {
@@ -135,7 +158,8 @@ void Allocate_Space_for_Grid( reax_system *system )
         {
             for ( k = 0; k < g->ncell[2]; k++ )
             {
-                g->atoms[i][j][k] = (int*) malloc( g->max_atoms * sizeof( int ) );
+                g->atoms[i][j][k] = (int*) smalloc( g->max_atoms * sizeof( int ),
+                       "Allocate_Space_for_Grid::g->atoms[i][j]][k]" );
 
                 for ( l = 0; l < g->max_atoms; ++l )
                 {
@@ -601,22 +625,31 @@ void Cluster_Atoms( reax_system *system, static_storage *workspace,
 
     num_H = 0;
     g = &( system->g );
-    new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-    orig_id = (int  *) calloc( system->N, sizeof( int ) );
-    f_old = (rvec*) calloc( system->N, sizeof(rvec) );
-
-    s = (real**) calloc( 3, sizeof( real* ) );
-    t = (real**) calloc( 3, sizeof( real* ) );
+    new_atoms = (reax_atom*) scalloc( system->N, sizeof(reax_atom),
+            "Cluster_Atoms::new_atoms" );
+    orig_id = (int  *) scalloc( system->N, sizeof( int ),
+            "Cluster_Atoms::orig_id" );
+    f_old = (rvec*) scalloc( system->N, sizeof(rvec),
+            "Cluster_Atoms::f_old" );
+
+    s = (real**) scalloc( 3, sizeof( real* ),
+            "Cluster_Atoms::s" );
+    t = (real**) scalloc( 3, sizeof( real* ),
+            "Cluster_Atoms::t" );
     for ( i = 0; i < 3; ++i )
     {
-        s[i] = (real *) calloc( system->N, sizeof( real ) );
-        t[i] = (real *) calloc( system->N, sizeof( real ) );
+        s[i] = (real *) scalloc( system->N, sizeof( real ),
+                "Cluster_Atoms::s[i]" );
+        t[i] = (real *) scalloc( system->N, sizeof( real ),
+                "Cluster_Atoms::t[i]" );
     }
 
-    v = (real**) calloc( control->cm_solver_restart + 1, sizeof( real* ) );
+    v = (real**) scalloc( control->cm_solver_restart + 1, sizeof( real* ),
+            "Cluster_Atoms::v" );
     for ( i = 0; i < control->cm_solver_restart + 1; ++i )
     {
-        v[i] = (real *) calloc( system->N, sizeof( real ) );
+        v[i] = (real *) scalloc( system->N, sizeof( real ),
+                "Cluster_Atoms::v[i]" );
     }
 
     top = 0;
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index c572882539ab3a75211d5d93a3cb274126f9e3be..93fed761a7f4233da33ee03fefce7e3320947c14 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -279,26 +279,43 @@ void Init_Workspace( reax_system *system, control_params *control,
     int i;
 
     /* Allocate space for hydrogen bond list */
-    workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) );
+    workspace->hbond_index = (int *) smalloc( system->N * sizeof( int ),
+           "Init_Workspace::workspace->hbond_index" );
 
     /* bond order related storage  */
-    workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Deltap           = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
-
-    workspace->Delta        = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_lp         = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDelta_lp        = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp_temp         = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Clp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->vlpex        = (real *) malloc( system->N * sizeof( real ) );
+    workspace->total_bond_order = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->bond_order" );
+    workspace->Deltap = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Deltap" );
+    workspace->Deltap_boc = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Deltap_boc" );
+    workspace->dDeltap_self = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->dDeltap_self" );
+
+    workspace->Delta = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Delta" );
+    workspace->Delta_lp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Delta_lp" );
+    workspace->Delta_lp_temp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Delta_lp_temp" );
+    workspace->dDelta_lp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->dDelta_lp" );
+    workspace->dDelta_lp_temp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->dDelta_lp_temp" );
+    workspace->Delta_e = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Delta_e" );
+    workspace->Delta_boc = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Delta_boc" );
+    workspace->nlp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->nlp" );
+    workspace->nlp_temp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->nlp_temp" );
+    workspace->Clp = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->Clp" );
+    workspace->CdDelta = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->CdDelta" );
+    workspace->vlpex = (real *) smalloc( system->N * sizeof( real ),
+           "Init_Workspace::workspace->vlpex" );
 
     /* charge method storage */
     switch ( control->charge_method )
@@ -328,22 +345,33 @@ void Init_Workspace( reax_system *system, control_params *control,
     if ( control->cm_solver_pre_comp_type == ICHOLT_PC ||
             control->cm_solver_pre_comp_type == ILUT_PAR_PC )
     {
-        workspace->droptol  = (real *) calloc( system->N_cm, sizeof( real ) );
+        workspace->droptol = (real *) scalloc( system->N_cm, sizeof( real ),
+                "Init_Workspace::workspace->droptol" );
     }
     //TODO: check if unused
-    //workspace->w        = (real *) calloc( cm_lin_sys_size, sizeof( real ) );
+    //workspace->w        = (real *) scalloc( cm_lin_sys_size, sizeof( real ),
+    //"Init_Workspace::workspace->droptol" );
     //TODO: check if unused
-    workspace->b        = (real *) calloc( system->N_cm * 2, sizeof( real ) );
-    workspace->b_s      = (real *) calloc( system->N_cm, sizeof( real ) );
-    workspace->b_t      = (real *) calloc( system->N_cm, sizeof( real ) );
-    workspace->b_prc    = (real *) calloc( system->N_cm * 2, sizeof( real ) );
-    workspace->b_prm    = (real *) calloc( system->N_cm * 2, sizeof( real ) );
-    workspace->s        = (real**) calloc( 5, sizeof( real* ) );
-    workspace->t        = (real**) calloc( 5, sizeof( real* ) );
+    workspace->b = (real *) scalloc( system->N_cm * 2, sizeof( real ),
+            "Init_Workspace::workspace->b" );
+    workspace->b_s = (real *) scalloc( system->N_cm, sizeof( real ),
+            "Init_Workspace::workspace->b_s" );
+    workspace->b_t = (real *) scalloc( system->N_cm, sizeof( real ),
+            "Init_Workspace::workspace->b_t" );
+    workspace->b_prc = (real *) scalloc( system->N_cm * 2, sizeof( real ),
+            "Init_Workspace::workspace->b_prc" );
+    workspace->b_prm = (real *) scalloc( system->N_cm * 2, sizeof( real ),
+            "Init_Workspace::workspace->b_prm" );
+    workspace->s = (real**) scalloc( 5, sizeof( real* ),
+            "Init_Workspace::workspace->s" );
+    workspace->t = (real**) scalloc( 5, sizeof( real* ),
+            "Init_Workspace::workspace->t" );
     for ( i = 0; i < 5; ++i )
     {
-        workspace->s[i] = (real *) calloc( system->N_cm, sizeof( real ) );
-        workspace->t[i] = (real *) calloc( system->N_cm, sizeof( real ) );
+        workspace->s[i] = (real *) scalloc( system->N_cm, sizeof( real ),
+                "Init_Workspace::workspace->s[i]" );
+        workspace->t[i] = (real *) scalloc( system->N_cm, sizeof( real ),
+                "Init_Workspace::workspace->t[i]" );
     }
 
     switch ( control->charge_method )
@@ -405,40 +433,62 @@ void Init_Workspace( reax_system *system, control_params *control,
         /* GMRES storage */
         case GMRES_S:
         case GMRES_H_S:
-            workspace->y  = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
-            workspace->z  = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
-            workspace->g  = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
-            workspace->h  = (real **) calloc( control->cm_solver_restart + 1, sizeof( real*) );
-            workspace->hs = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
-            workspace->hc = (real *)  calloc( control->cm_solver_restart + 1, sizeof( real ) );
-            workspace->rn = (real **) calloc( control->cm_solver_restart + 1, sizeof( real*) );
-            workspace->v  = (real **) calloc( control->cm_solver_restart + 1, sizeof( real*) );
+            workspace->y = (real *) scalloc( control->cm_solver_restart + 1, sizeof( real ),
+                    "Init_Workspace::workspace->y" );
+            workspace->z = (real *) scalloc( control->cm_solver_restart + 1, sizeof( real ),
+                    "Init_Workspace::workspace->z" );
+            workspace->g = (real *) scalloc( control->cm_solver_restart + 1, sizeof( real ),
+                    "Init_Workspace::workspace->g" );
+            workspace->h = (real **) scalloc( control->cm_solver_restart + 1, sizeof( real*),
+                    "Init_Workspace::workspace->h" );
+            workspace->hs = (real *) scalloc( control->cm_solver_restart + 1, sizeof( real ),
+                    "Init_Workspace::workspace->hs" );
+            workspace->hc = (real *) scalloc( control->cm_solver_restart + 1, sizeof( real ),
+                    "Init_Workspace::workspace->hc" );
+            workspace->rn = (real **) scalloc( control->cm_solver_restart + 1, sizeof( real*),
+                    "Init_Workspace::workspace->rn" );
+            workspace->v = (real **) scalloc( control->cm_solver_restart + 1, sizeof( real*),
+                    "Init_Workspace::workspace->v" );
 
             for ( i = 0; i < control->cm_solver_restart + 1; ++i )
             {
-                workspace->h[i]  = (real *) calloc( control->cm_solver_restart + 1, sizeof( real ) );
-                workspace->rn[i] = (real *) calloc( system->N_cm * 2, sizeof( real ) );
-                workspace->v[i]  = (real *) calloc( system->N_cm, sizeof( real ) );
+                workspace->h[i] = (real *) scalloc( control->cm_solver_restart + 1, sizeof( real ),
+                        "Init_Workspace::workspace->h[i]" );
+                workspace->rn[i] = (real *) scalloc( system->N_cm * 2, sizeof( real ),
+                        "Init_Workspace::workspace->rn[i]" );
+                workspace->v[i] = (real *) scalloc( system->N_cm, sizeof( real ),
+                        "Init_Workspace::workspace->v[i]" );
             }
 
-            workspace->r = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->d = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->q = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->p = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->r = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->r" );
+            workspace->d = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->d" );
+            workspace->q = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->q" );
+            workspace->p = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->p" );
             break;
 
         /* CG storage */
         case CG_S:
-            workspace->r = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->d = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->q = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->p = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->r = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->r" );
+            workspace->d = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->d" );
+            workspace->q = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->q" );
+            workspace->p = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->p" );
             break;
 
         case SDM_S:
-            workspace->r = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->d = (real *) calloc( system->N_cm, sizeof( real ) );
-            workspace->q = (real *) calloc( system->N_cm, sizeof( real ) );
+            workspace->r = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->r" );
+            workspace->d = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->d" );
+            workspace->q = (real *) scalloc( system->N_cm, sizeof( real ),
+                    "Init_Workspace::workspace->q" );
             break;
 
         default:
@@ -448,19 +498,25 @@ void Init_Workspace( reax_system *system, control_params *control,
     }
 
     /* integrator storage */
-    workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->a = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->a" );
+    workspace->f_old = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_old" );
+    workspace->v_const = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->v_const" );
 
 #ifdef _OPENMP
-    workspace->f_local = (rvec *) malloc( control->num_threads * system->N * sizeof( rvec ) );
+    workspace->f_local = (rvec *) smalloc( control->num_threads * system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_local" );
 #endif
 
     /* storage for analysis */
     if ( control->molec_anal || control->diffusion_coef )
     {
-        workspace->mark = (int *) calloc( system->N, sizeof(int) );
-        workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
+        workspace->mark = (int *) scalloc( system->N, sizeof(int),
+                "Init_Workspace::workspace->mark" );
+        workspace->old_mark = (int *) scalloc( system->N, sizeof(int),
+                "Init_Workspace::workspace->old_mark" );
     }
     else
     {
@@ -469,7 +525,8 @@ void Init_Workspace( reax_system *system, control_params *control,
 
     if ( control->diffusion_coef )
     {
-        workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
+        workspace->x_old = (rvec *) scalloc( system->N, sizeof( rvec ),
+                "Init_Workspace::workspace->x_old" );
     }
     else
     {
@@ -477,20 +534,34 @@ void Init_Workspace( reax_system *system, control_params *control,
     }
 
 #ifdef TEST_FORCES
-    workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->dDelta = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->dDelta" );
+    workspace->f_ele = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_ele" );
+    workspace->f_vdw = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_vdw" );
+    workspace->f_bo = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_bo" );
+    workspace->f_be = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_be" );
+    workspace->f_lp = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_lp" );
+    workspace->f_ov = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_ov" );
+    workspace->f_un = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_un" );
+    workspace->f_ang = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_ang" );
+    workspace->f_coa = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_coa" );
+    workspace->f_pen = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_pen" );
+    workspace->f_hb = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_hb" );
+    workspace->f_tor = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_tor" );
+    workspace->f_con = (rvec *) smalloc( system->N * sizeof( rvec ),
+           "Init_Workspace::workspace->f_con" );
 #endif
 
     workspace->realloc.num_far = -1;
@@ -528,8 +599,10 @@ void Init_Lists( reax_system *system, control_params *control,
 
     Generate_Neighbor_Lists(system, control, data, workspace, lists, out_control);
     Htop = 0;
-    hb_top = (int*) calloc( system->N, sizeof(int) );
-    bond_top = (int*) calloc( system->N, sizeof(int) );
+    hb_top = (int*) scalloc( system->N, sizeof(int),
+            "Init_Lists::hb_top" );
+    bond_top = (int*) scalloc( system->N, sizeof(int),
+            "Init_Lists::bond_top" );
     num_3body = 0;
     Estimate_Storage_Sizes( system, control, lists, &Htop,
             hb_top, bond_top, &num_3body );
@@ -1008,7 +1081,9 @@ void Finalize_Workspace( reax_system *system, control_params *control,
     }
     if ( control->cm_solver_pre_comp_type == SAI_PC )
     {
+        Deallocate_Matrix( workspace->H_full );
         Deallocate_Matrix( workspace->H_spar_patt );
+        Deallocate_Matrix( workspace->H_spar_patt_full );
         Deallocate_Matrix( workspace->H_app_inv );
     }
 
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index c6f56ed426d5095727c0833d5c2662a49fb568e3..110c43a3f747ac15e20bb0dc7958a9354963dbfd 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -148,11 +148,8 @@ void Sort_Matrix_Rows( sparse_matrix * const A )
     //    #pragma omp parallel default(none) private(i, j, si, ei, temp) shared(stderr)
 #endif
     {
-        if ( ( temp = (sparse_matrix_entry *) malloc( A->n * sizeof(sparse_matrix_entry)) ) == NULL )
-        {
-            fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" );
-            exit( INSUFFICIENT_MEMORY );
-        }
+        temp = (sparse_matrix_entry *) smalloc( A->n * sizeof(sparse_matrix_entry),
+                                                "Sort_Matrix_Rows::temp" );
 
         /* sort each row of A using column indices */
 #ifdef _OPENMP
@@ -259,8 +256,9 @@ static void compute_full_sparse_matrix( const sparse_matrix * const A,
  * Assumptions:
  *   A has non-zero diagonals
  *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
-void Setup_Sparsity_Pattern( const sparse_matrix * const A,
-                             const real filter, sparse_matrix ** A_spar_patt )
+void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix ** A_full,
+                                  sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full,
+                                  sparse_matrix ** A_app_inv, const real filter )
 {
     int i, pj, size;
     real min, max, threshold, val;
@@ -286,7 +284,7 @@ void Setup_Sparsity_Pattern( const sparse_matrix * const A,
         }
     }
 
-    // find min and max element of the matrix
+    /* find min and max elements of matrix */
     for ( i = 0; i < A->n; ++i )
     {
         for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
@@ -312,25 +310,8 @@ void Setup_Sparsity_Pattern( const sparse_matrix * const A,
     }
 
     threshold = min + (max - min) * (1.0 - filter);
-    // calculate the nnz of the sparsity pattern
-    //    for ( size = 0, i = 0; i < A->n; ++i )
-    //    {
-    //        for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
-    //        {
-    //            if ( threshold <= A->val[pj] )
-    //                size++;
-    //        }
-    //    }
-    //
-    //    if ( Allocate_Matrix( A_spar_patt, A->n, size ) == NULL )
-    //    {
-    //        fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" );
-    //        exit( INSUFFICIENT_MEMORY );
-    //    }
 
-    //(*A_spar_patt)->start[(*A_spar_patt)->n] = size;
-
-    // fill the sparsity pattern
+    /* fill sparsity pattern */
     for ( size = 0, i = 0; i < A->n; ++i )
     {
         (*A_spar_patt)->start[i] = size;
@@ -346,6 +327,17 @@ void Setup_Sparsity_Pattern( const sparse_matrix * const A,
         }
     }
     (*A_spar_patt)->start[A->n] = size;
+
+    compute_full_sparse_matrix( A, A_full );
+    compute_full_sparse_matrix( *A_spar_patt, A_spar_patt_full );
+
+    /* A_app_inv has the same sparsity pattern
+     * as A_spar_patt_full (omit non-zero values) */
+    if ( Allocate_Matrix( A_app_inv, (*A_spar_patt_full)->n, (*A_spar_patt_full)->m ) == FAILURE )
+    {
+        fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
+        exit( INSUFFICIENT_MEMORY );
+    }
 }
 
 void Calculate_Droptol( const sparse_matrix * const A,
@@ -371,11 +363,8 @@ void Calculate_Droptol( const sparse_matrix * const A,
              * overhead per Sparse_MatVec call*/
             if ( droptol_local == NULL )
             {
-                if ( (droptol_local = (real*) malloc( omp_get_num_threads() * A->n * sizeof(real))) == NULL )
-                {
-                    fprintf( stderr, "Not enough space for droptol. Terminating...\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
+                droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real),
+                                                 "Calculate_Droptol::droptol_local" );
             }
         }
 
@@ -569,15 +558,16 @@ real SuperLU_Factorize( const sparse_matrix * const A,
     {
         SUPERLU_ABORT("Malloc fails for part_super__h[].");
     }
-    if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL )
-            || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL )
-            || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL )
-            || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL )
-            || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) )
-    {
-        fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
+    a = (real*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(real),
+                         "SuperLU_Factorize::a" );
+    asub = (int_t*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(int_t),
+                             "SuperLU_Factorize::asub" );
+    xa = (int_t*) smalloc( (A->n + 1) * sizeof(int_t),
+                           "SuperLU_Factorize::xa" );
+    Ltop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
+                                    "SuperLU_Factorize::Ltop" );
+    Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
+                                    "SuperLU_Factorize::Utop" );
     if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE )
     {
         fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
@@ -824,13 +814,12 @@ real ICHOLT( const sparse_matrix * const A, const real * const droptol,
 
     start = Get_Time( );
 
-    if ( ( Utop = (unsigned int*) malloc((A->n + 1) * sizeof(unsigned int)) ) == NULL ||
-            ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL ||
-            ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "[ICHOLT] Not enough memory for preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
+    Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int),
+                                    "ICHOLT::Utop" );
+    tmp_j = (int*) smalloc( A->n * sizeof(int),
+                            "ICHOLT::Utop" );
+    tmp_val = (real*) smalloc( A->n * sizeof(real),
+                               "ICHOLT::Utop" );
 
     // clear variables
     Ltop = 0;
@@ -976,10 +965,10 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
 
     start = Get_Time( );
 
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL )
+    D = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D" );
+    D_inv = (real*) smalloc( A->n * sizeof(real), "ICHOL_PAR::D_inv" );
+    Utop = (int*) smalloc( (A->n + 1) * sizeof(int), "ICHOL_PAR::Utop" );
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
     {
         fprintf( stderr, "not enough memory for ICHOL_PAR preconditioning matrices. terminating.\n" );
         exit( INSUFFICIENT_MEMORY );
@@ -1154,9 +1143,9 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
 
     start = Get_Time( );
 
-    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE ||
-            ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
+    D = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D" );
+    D_inv = (real*) smalloc( A->n * sizeof(real), "ILU_PAR::D_inv" );
+    if ( Allocate_Matrix( &DAD, A->n, A->m ) == FAILURE )
     {
         fprintf( stderr, "[ILU_PAR] Not enough memory for preconditioning matrices. Terminating.\n" );
         exit( INSUFFICIENT_MEMORY );
@@ -1377,12 +1366,8 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
         exit( INSUFFICIENT_MEMORY );
     }
 
-    if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL ||
-            ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL )
-    {
-        fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
+    D = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D" );
+    D_inv = (real*) smalloc( A->n * sizeof(real), "ILUT_PAR::D_inv" );
 
 #ifdef _OPENMP
     #pragma omp parallel for schedule(static) \
@@ -1616,7 +1601,13 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
 
 
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-real Sparse_Approx_Inverse( const sparse_matrix * const A,
+/* Compute M^{1} \approx A which minimizes
+ *
+ * A: symmetric, sparse matrix, stored in full CSR format
+ * A_spar_patt: sparse matrix used as template sparsity pattern
+ *   for approximating the inverse, stored in full CSR format
+ * A_app_inv: approximate inverse to A, stored in full CSR format (result) */
+real sparse_approx_inverse( const sparse_matrix * const A,
                             const sparse_matrix * const A_spar_patt,
                             sparse_matrix ** A_app_inv )
 {
@@ -1626,28 +1617,17 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A,
     int *pos_x, *pos_y;
     real start;
     real *e_j, *dense_matrix;
-    sparse_matrix *A_full = NULL, *A_spar_patt_full = NULL;
     char *X, *Y;
 
     start = Get_Time( );
 
-    // Get A and A_spar_patt full matrices
-    compute_full_sparse_matrix( A, &A_full );
-    compute_full_sparse_matrix( A_spar_patt, &A_spar_patt_full );
-
-    // A_app_inv will be the same as A_spar_patt_full except the val array
-    if ( Allocate_Matrix( A_app_inv, A_spar_patt_full->n, A_spar_patt_full->m ) == FAILURE )
-    {
-        fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
-
-    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt_full->start[A_spar_patt_full->n];
+    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n];
 
 #ifdef _OPENMP
     #pragma omp parallel default(none) \
-    shared(A_full, A_spar_patt_full) private(i, k, pj, j_temp, identity_pos, \
-            N, M, d_i, d_j, m, n, nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix)
+    private(i, k, pj, j_temp, identity_pos, N, M, d_i, d_j, m, n, \
+            nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) \
+    shared(A_app_inv, stderr)
 #endif
     {
         X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" );
@@ -1666,16 +1646,16 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A,
 #ifdef _OPENMP
         #pragma omp for schedule(static)
 #endif
-        for ( i = 0; i < A_spar_patt_full->n; ++i )
+        for ( i = 0; i < A_spar_patt->n; ++i )
         {
             N = 0;
             M = 0;
 
             // find column indices of nonzeros (which will be the columns indices of the dense matrix)
-            for ( pj = A_spar_patt_full->start[i]; pj < A_spar_patt_full->start[i + 1]; ++pj )
+            for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->start[i + 1]; ++pj )
             {
 
-                j_temp = A_spar_patt_full->j[pj];
+                j_temp = A_spar_patt->j[pj];
 
                 Y[j_temp] = 1;
                 pos_y[j_temp] = N;
@@ -1683,16 +1663,16 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A,
 
                 // for each of those indices
                 // search through the row of full A of that index
-                for ( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k )
+                for ( k = A->start[j_temp]; k < A->start[j_temp + 1]; ++k )
                 {
                     // and accumulate the nonzero column indices to serve as the row indices of the dense matrix
-                    X[A_full->j[k]] = 1;
+                    X[A->j[k]] = 1;
                 }
             }
 
             // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix
             identity_pos = M;
-            for ( k = 0; k < A_full->n; k++)
+            for ( k = 0; k < A->n; k++)
             {
                 if ( X[k] != 0 )
                 {
@@ -1718,12 +1698,12 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A,
                     dense_matrix[d_i * N + d_j] = 0.0;
                 }
                 // change the value if any of the column indices is seen
-                for ( d_j = A_full->start[pos_x[d_i]];
-                        d_j < A_full->start[pos_x[d_i + 1]]; ++d_j )
+                for ( d_j = A->start[pos_x[d_i]];
+                        d_j < A->start[pos_x[d_i + 1]]; ++d_j )
                 {
-                    if ( Y[A_full->j[d_j]] == 1 )
+                    if ( Y[A->j[d_j]] == 1 )
                     {
-                        dense_matrix[d_i * N + pos_y[A_full->j[d_j]]] = A_full->val[d_j];
+                        dense_matrix[d_i * N + pos_y[A->j[d_j]]] = A->val[d_j];
                     }
                 }
 
@@ -1763,11 +1743,11 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A,
             // print_matrix( "Least squares solution", n, nrhs, b, ldb );
 
             // accumulate the resulting vector to build A_app_inv
-            (*A_app_inv)->start[i] = A_spar_patt_full->start[i];
-            for ( k = A_spar_patt_full->start[i]; k < A_spar_patt_full->start[i + 1]; ++k)
+            (*A_app_inv)->start[i] = A_spar_patt->start[i];
+            for ( k = A_spar_patt->start[i]; k < A_spar_patt->start[i + 1]; ++k)
             {
-                (*A_app_inv)->j[k] = A_spar_patt_full->j[k];
-                (*A_app_inv)->val[k] = e_j[k - A_spar_patt_full->start[i]];
+                (*A_app_inv)->j[k] = A_spar_patt->j[k];
+                (*A_app_inv)->val[k] = e_j[k - A_spar_patt->start[i]];
             }
 
             //empty variables that will be used next iteration
@@ -1788,15 +1768,12 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A,
         sfree( X, "Sparse_Approx_Inverse::X" );
     }
 
-    Deallocate_Matrix( A_full );
-    Deallocate_Matrix( A_spar_patt_full );
-
     return Get_Timing_Info( start );
 }
 #endif
 
 
-/* sparse matrix-vector product Ax=b
+/* sparse matrix-vector product Ax = b
  * where:
  *   A: lower triangular matrix, stored in CSR format
  *   x: vector
@@ -1822,10 +1799,8 @@ static void Sparse_MatVec( const sparse_matrix * const A,
          * overhead per Sparse_MatVec call*/
         if ( b_local == NULL )
         {
-            if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL )
-            {
-                exit( INSUFFICIENT_MEMORY );
-            }
+            b_local = (real*) smalloc( omp_get_num_threads() * n * sizeof(real),
+                                       "Sparse_MatVec::b_local" );
         }
     }
 
@@ -1885,7 +1860,7 @@ static void Sparse_MatVec_full( const sparse_matrix * const A,
     Vector_MakeZero( b, A->n );
 
 #ifdef _OPENMP
-    #pragma omp for schedule(static) default(none) private(i, j)
+    #pragma omp for schedule(static)
 #endif
     for ( i = 0; i < A->n; ++i )
     {
@@ -1906,11 +1881,8 @@ void Transpose( const sparse_matrix * const A, sparse_matrix * const A_t )
 {
     unsigned int i, j, pj, *A_t_top;
 
-    if ( (A_t_top = (unsigned int*) calloc( A->n + 1, sizeof(unsigned int))) == NULL )
-    {
-        fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
+    A_t_top = (unsigned int*) scalloc( A->n + 1, sizeof(unsigned int),
+                                       "Transpose::A_t_top" );
 
     memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );
 
@@ -2087,22 +2059,18 @@ void tri_solve_level_sched( const sparse_matrix * const LU,
 
         if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
         {
-            if ( (row_levels = (unsigned int*) malloc((size_t)N * sizeof(unsigned int))) == NULL
-                    || (level_rows = (unsigned int*) malloc((size_t)N * sizeof(unsigned int))) == NULL
-                    || (level_rows_cnt = (unsigned int*) malloc((size_t)(N + 1) * sizeof(unsigned int))) == NULL )
-            {
-                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            row_levels = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int),
+                                                  "tri_solve_level_sched::row_levels" );
+            level_rows = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int),
+                                                  "tri_solve_level_sched::level_rows" );
+            level_rows_cnt = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int),
+                             "tri_solve_level_sched::level_rows_cnt" );
         }
 
         if ( top == NULL )
         {
-            if ( (top = (unsigned int*) malloc((size_t)(N + 1) * sizeof(unsigned int))) == NULL )
-            {
-                fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            top = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int),
+                                           "tri_solve_level_sched::top" );
         }
 
         /* find levels (row dependencies in substitutions) */
@@ -2338,12 +2306,10 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
             }
         }
 
-        if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL ||
-                (conflict_local = (unsigned int*) malloc(sizeof(unsigned int) * A->n)) == NULL )
-        {
-            fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
-            exit( INSUFFICIENT_MEMORY );
-        }
+        fb_color = (int*) smalloc( sizeof(int) * MAX_COLOR,
+                                   "graph_coloring::fb_color" );
+        conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n,
+                         "graph_coloring::fb_color" );
 
 #ifdef _OPENMP
         #pragma omp barrier
@@ -2526,11 +2492,7 @@ static void permute_vector( real * const x, const unsigned int n, const int inve
     {
         if ( x_p == NULL )
         {
-            if ( (x_p = (real*) malloc(sizeof(real) * n)) == NULL )
-            {
-                fprintf( stderr, "not enough memory for permuting vector. terminating.\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            x_p = (real*) smalloc( sizeof(real) * n, "permute_vector::x_p" );
         }
 
         if ( invert_map == TRUE )
@@ -2717,16 +2679,25 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
 #endif
 
         /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */
-        if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (to_color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (conflict_cnt = (unsigned int*) malloc(sizeof(unsigned int) * (num_thread + 1))) == NULL ||
-                (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (color_top = (unsigned int*) malloc(sizeof(unsigned int) * (H->n + 1))) == NULL ||
-                (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (permuted_row_col_inv = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
-                (y_p = (real*) malloc(sizeof(real) * H->n)) == NULL ||
-                (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) ||
+        color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
+                                         "setup_graph_coloring::color" );
+        to_color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
+                                            "setup_graph_coloring::to_color" );
+        conflict = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
+                                            "setup_graph_coloring::conflict" );
+        conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (num_thread + 1),
+                                                "setup_graph_coloring::conflict_cnt" );
+        recolor = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
+                                           "setup_graph_coloring::recolor" );
+        color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (H->n + 1),
+                                             "setup_graph_coloring::color_top" );
+        permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
+                           "setup_graph_coloring::premuted_row_col" );
+        permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * H->n,
+                               "setup_graph_coloring::premuted_row_col_inv" );
+        y_p = (real*) smalloc( sizeof(real) * H->n,
+                               "setup_graph_coloring::y_p" );
+        if ( (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) ||
                 (Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) )
         {
             fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
@@ -2773,27 +2744,15 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
     {
         if ( Dinv_b == NULL )
         {
-            if ( (Dinv_b = (real*) malloc(sizeof(real) * R->n)) == NULL )
-            {
-                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            Dinv_b = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::Dinv_b" );
         }
         if ( rp == NULL )
         {
-            if ( (rp = (real*) malloc(sizeof(real) * R->n)) == NULL )
-            {
-                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            rp = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp" );
         }
         if ( rp2 == NULL )
         {
-            if ( (rp2 = (real*) malloc(sizeof(real) * R->n)) == NULL )
-            {
-                fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                exit( INSUFFICIENT_MEMORY );
-            }
+            rp2 = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp2" );
         }
     }
 
@@ -2969,11 +2928,8 @@ static void apply_preconditioner( const static_storage * const workspace, const
             {
                 if ( Dinv_L == NULL )
                 {
-                    if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL )
-                    {
-                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                        exit( INSUFFICIENT_MEMORY );
-                    }
+                    Dinv_L = (real*) smalloc( sizeof(real) * workspace->L->n,
+                                              "apply_preconditioner::Dinv_L" );
                 }
             }
 
@@ -2998,11 +2954,8 @@ static void apply_preconditioner( const static_storage * const workspace, const
             {
                 if ( Dinv_U == NULL )
                 {
-                    if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL )
-                    {
-                        fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" );
-                        exit( INSUFFICIENT_MEMORY );
-                    }
+                    Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n,
+                                              "apply_preconditioner::Dinv_U" );
                 }
             }
 
@@ -3043,42 +2996,50 @@ int GMRES( const static_storage * const workspace, const control_params * const
            const real tol, real * const x, const int fresh_pre )
 {
     int i, j, k, itr, N, g_j, g_itr;
-    real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start;
+    real cc, tmp1, tmp2, temp, ret_temp, bnorm;
+    real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;
 
     N = H->n;
     g_j = 0;
     g_itr = 0;
 
 #ifdef _OPENMP
-    #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \
-    shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr)
+    #pragma omp parallel default(none) \
+    private(i, j, k, itr, bnorm, ret_temp, t_start) \
+    shared(N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) \
+    reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
 #endif
     {
         j = 0;
         itr = 0;
+        t_ortho = 0.0;
+        t_pa = 0.0;
+        t_spmv = 0.0;
+        t_ts = 0.0;
+        t_vops = 0.0;
 
-        time_start = Get_Time( );
+        t_start = Get_Time( );
         bnorm = Norm( b, N );
-        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+        t_vops += Get_Timing_Info( t_start );
 
         /* GMRES outer-loop */
         for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
         {
             /* calculate r0 */
-            time_start = Get_Time( );
+            t_start = Get_Time( );
             Sparse_MatVec( H, x, workspace->b_prm );
-            data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
+            t_spmv += Get_Timing_Info( t_start );
 
-            time_start = Get_Time( );
+            t_start = Get_Time( );
             Vector_Sum( workspace->b_prc, 1., b, -1., workspace->b_prm, N );
-            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+            t_vops += Get_Timing_Info( t_start );
 
-            time_start = Get_Time( );
+            t_start = Get_Time( );
             apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[0],
                                   itr == 0 ? fresh_pre : FALSE );
-            data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
+            t_pa += Get_Timing_Info( t_start );
 
-            time_start = Get_Time( );
+            t_start = Get_Time( );
             ret_temp = Norm( workspace->v[0], N );
 
 #ifdef _OPENMP
@@ -3089,24 +3050,24 @@ int GMRES( const static_storage * const workspace, const control_params * const
             }
 
             Vector_Scale( workspace->v[0], 1. / ret_temp, workspace->v[0], N );
-            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+            t_vops += Get_Timing_Info( t_start );
 
             /* GMRES inner-loop */
             for ( j = 0; j < control->cm_solver_restart && FABS(workspace->g[j]) / bnorm > tol; j++ )
             {
                 /* matvec */
-                time_start = Get_Time( );
+                t_start = Get_Time( );
                 Sparse_MatVec( H, workspace->v[j], workspace->b_prc );
-                data->timing.cm_solver_spmv += Get_Timing_Info( time_start );
+                t_spmv += Get_Timing_Info( t_start );
 
-                time_start = Get_Time( );
+                t_start = Get_Time( );
                 apply_preconditioner( workspace, control, workspace->b_prc, workspace->v[j + 1], FALSE );
-                data->timing.cm_solver_pre_app += Get_Timing_Info( time_start );
+                t_pa += Get_Timing_Info( t_start );
 
 //                if ( control->cm_solver_pre_comp_type == DIAG_PC )
 //                {
                 /* apply modified Gram-Schmidt to orthogonalize the new residual */
-                time_start = Get_Time( );
+                t_start = Get_Time( );
                 for ( i = 0; i <= j; i++ )
                 {
                     ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N );
@@ -3121,7 +3082,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N );
 
                 }
-                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+                t_vops += Get_Timing_Info( t_start );
 //                }
 //                else
 //                {
@@ -3131,7 +3092,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
 //
 //
 //                    {
-//                        time_start = Get_Time( );
+//                        t_start = Get_Time( );
 //                    }
 //
 //                    #pragma omp single
@@ -3159,11 +3120,11 @@ int GMRES( const static_storage * const workspace, const control_params * const
 //
 //
 //                    {
-//                        data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+//                        t_vops += Get_Timing_Info( t_start );
 //                    }
 //                }
 
-                time_start = Get_Time( );
+                t_start = Get_Time( );
                 ret_temp = Norm( workspace->v[j + 1], N );
 
 #ifdef _OPENMP
@@ -3175,9 +3136,9 @@ int GMRES( const static_storage * const workspace, const control_params * const
 
                 Vector_Scale( workspace->v[j + 1],
                               1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N );
-                data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+                t_vops += Get_Timing_Info( t_start );
 
-                time_start = Get_Time( );
+                t_start = Get_Time( );
 //                    if ( control->cm_solver_pre_comp_type == NONE_PC ||
 //                            control->cm_solver_pre_comp_type == DIAG_PC )
 //                    {
@@ -3234,7 +3195,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     workspace->g[j + 1] = tmp2;
 
                 }
-                data->timing.cm_solver_orthog += Get_Timing_Info( time_start );
+                t_ortho += Get_Timing_Info( t_start );
 
 #ifdef _OPENMP
                 #pragma omp barrier
@@ -3242,7 +3203,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
             }
 
             /* solve Hy = g: H is now upper-triangular, do back-substitution */
-            time_start = Get_Time( );
+            t_start = Get_Time( );
 #ifdef _OPENMP
             #pragma omp single
 #endif
@@ -3258,10 +3219,10 @@ int GMRES( const static_storage * const workspace, const control_params * const
                     workspace->y[i] = temp / workspace->h[i][i];
                 }
             }
-            data->timing.cm_solver_tri_solve += Get_Timing_Info( time_start );
+            t_ts += Get_Timing_Info( t_start );
 
             /* update x = x_0 + Vy */
-            time_start = Get_Time( );
+            t_start = Get_Time( );
             Vector_MakeZero( workspace->p, N );
 
             for ( i = 0; i < j; i++ )
@@ -3270,7 +3231,7 @@ int GMRES( const static_storage * const workspace, const control_params * const
             }
 
             Vector_Add( x, 1., workspace->p, N );
-            data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start );
+            t_vops += Get_Timing_Info( t_start );
 
             /* stopping condition */
             if ( FABS(workspace->g[j]) / bnorm <= tol )
@@ -3288,9 +3249,23 @@ int GMRES( const static_storage * const workspace, const control_params * const
         }
     }
 
+#ifdef _OPENMP
+    data->timing.cm_solver_orthog += t_ortho / control->num_threads;
+    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
+    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
+    data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
+    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
+#else
+    data->timing.cm_solver_orthog += t_ortho;
+    data->timing.cm_solver_pre_app += t_pa;
+    data->timing.cm_solver_spmv += t_spmv;
+    data->timing.cm_solver_tri_solve += t_ts;
+    data->timing.cm_solver_vector_ops += t_vops;
+#endif
+
     if ( g_itr >= control->cm_solver_max_iters )
     {
-        fprintf( stderr, "GMRES convergence failed\n" );
+        fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
         return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
     }
 
@@ -3303,203 +3278,239 @@ int GMRES_HouseHolder( const static_storage * const workspace,
                        const sparse_matrix * const H, const real * const b, real tol,
                        real * const x, const int fresh_pre )
 {
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
+    int i, j, k, itr, N, g_j, g_itr;
+    real cc, tmp1, tmp2, temp, bnorm, ret_temp;
     real v[10000], z[control->cm_solver_restart + 2][10000], w[control->cm_solver_restart + 2];
     real u[control->cm_solver_restart + 2][10000];
+    real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops;
 
     j = 0;
     N = H->n;
-    bnorm = Norm( b, N );
 
-    /* apply the diagonal pre-conditioner to rhs */
-    for ( i = 0; i < N; ++i )
+#ifdef _OPENMP
+    #pragma omp parallel default(none) \
+    private(i, j, k, itr, bnorm, ret_temp, t_start) \
+    shared(v, z, w, u, tol, N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) \
+    reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops)
+#endif
     {
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];
-    }
+        t_start = Get_Time( );
+        bnorm = Norm( b, N );
+        t_vops += Get_Timing_Info( t_start );
 
-    // memset( x, 0, sizeof(real) * N );
+        // memset( x, 0, sizeof(real) * N );
 
-    /* GMRES outer-loop */
-    for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
-    {
-        /* compute z = r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );
-        for ( i = 0; i < N; ++i )
+        /* GMRES outer-loop */
+        for ( itr = 0; itr < control->cm_solver_max_iters; ++itr )
         {
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
-        }
-        Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
+            /* compute z = r0 */
+            t_start = Get_Time( );
+            Sparse_MatVec( H, x, workspace->b_prm );
+            t_spmv += Get_Timing_Info( t_start );
 
-        Vector_MakeZero( w, control->cm_solver_restart + 1 );
-        w[0] = Norm( z[0], N );
+            t_start = Get_Time( );
+            Vector_Sum( workspace->b_prc, 1.,  workspace->b, -1., workspace->b_prm, N );
+            t_vops += Get_Timing_Info( t_start );
 
-        Vector_Copy( u[0], z[0], N );
-        u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
-        Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
+            t_start = Get_Time( );
+            apply_preconditioner( workspace, control, workspace->b_prc, z[0], fresh_pre );
+            t_pa += Get_Timing_Info( t_start );
 
-        w[0] *= ( u[0][0] < 0.0 ?  1 : -1 );
-        // fprintf( stderr, "\n\n%12.6f\n", w[0] );
+            t_start = Get_Time( );
+            Vector_MakeZero( w, control->cm_solver_restart + 1 );
+            w[0] = Norm( z[0], N );
 
-        /* GMRES inner-loop */
-        for ( j = 0; j < control->cm_solver_restart && FABS( w[j] ) / bnorm > tol; j++ )
-        {
-            /* compute v_j */
-            Vector_Scale( z[j], -2 * u[j][j], u[j], N );
-            z[j][j] += 1.; /* due to e_j */
+            Vector_Copy( u[0], z[0], N );
+            u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
+            Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
+
+            w[0] *= ( u[0][0] < 0.0 ?  1 : -1 );
+            t_vops += Get_Timing_Info( t_start );
 
-            for ( i = j - 1; i >= 0; --i )
+            /* GMRES inner-loop */
+            for ( j = 0; j < control->cm_solver_restart && FABS( w[j] ) / bnorm > tol; j++ )
             {
-                Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
-            }
+                /* compute v_j */
+                t_start = Get_Time( );
+                Vector_Scale( z[j], -2 * u[j][j], u[j], N );
+                z[j][j] += 1.; /* due to e_j */
 
-            /* matvec */
-            Sparse_MatVec( H, z[j], v );
+                for ( i = j - 1; i >= 0; --i )
+                {
+                    Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i );
+                }
+                t_vops += Get_Timing_Info( t_start );
 
-            for ( k = 0; k < N; ++k )
-            {
-                v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
-            }
+                /* matvec */
+                t_start = Get_Time( );
+                Sparse_MatVec( H, z[j], workspace->b_prc );
+                t_spmv += Get_Timing_Info( t_start );
 
-            for ( i = 0; i <= j; ++i )
-            {
-                Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
-            }
+                t_start = Get_Time( );
+                apply_preconditioner( workspace, control, workspace->b_prc, v, fresh_pre );
+                t_pa += Get_Timing_Info( t_start );
 
-            if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
-            {
-                /* compute the HouseHolder unit vector u_j+1 */
+                t_start = Get_Time( );
                 for ( i = 0; i <= j; ++i )
                 {
-                    u[j + 1][i] = 0;
+                    Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i );
                 }
 
-                Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
+                if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) )
+                {
+                    /* compute the HouseHolder unit vector u_j+1 */
+                    Vector_MakeZero( u[j + 1], j + 1 );
+                    Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) );
+                    ret_temp = Norm( v + (j + 1), N - (j + 1) );
+#ifdef _OPENMP
+                    #pragma omp single
+#endif
+                    u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * ret_temp;
 
-                u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * Norm( v + (j + 1), N - (j + 1) );
+#ifdef _OPENMP
+                    #pragma omp barrier
+#endif
 
-                Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
+                    Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N );
 
-                /* overwrite v with P_m+1 * v */
-                v[j + 1] -= 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
-                Vector_MakeZero( v + (j + 2), N - (j + 2) );
-                // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
-            }
+                    /* overwrite v with P_m+1 * v */
+                    ret_temp = 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1];
+#ifdef _OPENMP
+                    #pragma omp single
+#endif
+                    v[j + 1] -= ret_temp;
 
+#ifdef _OPENMP
+                    #pragma omp barrier
+#endif
 
-            /* prev Givens rots on the upper-Hessenberg matrix to make it U */
-            for ( i = 0; i < j; i++ )
-            {
-                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
-                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
+                    Vector_MakeZero( v + (j + 2), N - (j + 2) );
+//                    Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
+                }
+                t_vops += Get_Timing_Info( t_start );
 
-                v[i]   = tmp1;
-                v[i + 1] = tmp2;
-            }
+                /* prev Givens rots on the upper-Hessenberg matrix to make it U */
+                t_start = Get_Time( );
+#ifdef _OPENMP
+                #pragma omp single
+#endif
+                {
+                    for ( i = 0; i < j; i++ )
+                    {
+                        tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1];
+                        tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1];
 
-            /* apply the new Givens rotation to H and right-hand side */
-            if ( FABS(v[j + 1]) >= ALMOST_ZERO )
-            {
-                cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
-                workspace->hc[j] = v[j] / cc;
-                workspace->hs[j] = v[j + 1] / cc;
+                        v[i]   = tmp1;
+                        v[i + 1] = tmp2;
+                    }
 
-                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
-                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
+                    /* apply the new Givens rotation to H and right-hand side */
+                    if ( FABS(v[j + 1]) >= ALMOST_ZERO )
+                    {
+                        cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) );
+                        workspace->hc[j] = v[j] / cc;
+                        workspace->hs[j] = v[j + 1] / cc;
 
-                v[j]   = tmp1;
-                v[j + 1] = tmp2;
+                        tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1];
+                        tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1];
 
-                /* Givens rotations to rhs */
-                tmp1 =  workspace->hc[j] * w[j];
-                tmp2 = -workspace->hs[j] * w[j];
-                w[j]   = tmp1;
-                w[j + 1] = tmp2;
-            }
+                        v[j]   = tmp1;
+                        v[j + 1] = tmp2;
 
-            /* extend R */
-            for ( i = 0; i <= j; ++i )
-            {
-                workspace->h[i][j] = v[i];
+                        /* Givens rotations to rhs */
+                        tmp1 =  workspace->hc[j] * w[j];
+                        tmp2 = -workspace->hs[j] * w[j];
+                        w[j]   = tmp1;
+                        w[j + 1] = tmp2;
+                    }
+
+                    /* extend R */
+                    for ( i = 0; i <= j; ++i )
+                    {
+                        workspace->h[i][j] = v[i];
+                    }
+                }
+                t_ortho += Get_Timing_Info( t_start );
             }
 
 
-            // fprintf( stderr, "h:" );
-            // for( i = 0; i <= j+1 ; ++i )
-            // fprintf( stderr, "%.6f ", h[i][j] );
-            // fprintf( stderr, "\n" );
-            // fprintf( stderr, "%12.6f\n", w[j+1] );
-        }
+            /* solve Hy = w.
+               H is now upper-triangular, do back-substitution */
+            t_start = Get_Time( );
+#ifdef _OPENMP
+            #pragma omp single
+#endif
+            {
+                for ( i = j - 1; i >= 0; i-- )
+                {
+                    temp = w[i];
+                    for ( k = j - 1; k > i; k-- )
+                    {
+                        temp -= workspace->h[i][k] * workspace->y[k];
+                    }
 
+                    workspace->y[i] = temp / workspace->h[i][i];
+                }
+            }
+            t_ts += Get_Timing_Info( t_start );
 
-        /* solve Hy = w.
-           H is now upper-triangular, do back-substitution */
-        for ( i = j - 1; i >= 0; i-- )
-        {
-            temp = w[i];
-            for ( k = j - 1; k > i; k-- )
+            t_start = Get_Time( );
+            for ( i = j - 1; i >= 0; i-- )
             {
-                temp -= workspace->h[i][k] * workspace->y[k];
+                Vector_Add( x, workspace->y[i], z[i], N );
             }
+            t_vops += Get_Timing_Info( t_start );
 
-            workspace->y[i] = temp / workspace->h[i][i];
-        }
-
-        // fprintf( stderr, "y: " );
-        // for( i = 0; i < control->cm_solver_restart+1; ++i )
-        //   fprintf( stderr, "%8.3f ", workspace->y[i] );
-
-
-        /* update x = x_0 + Vy */
-        // memset( z, 0, sizeof(real) * N );
-        // for( i = j-1; i >= 0; i-- )
-        //   {
-        //     Vector_Copy( v, z, N );
-        //     v[i] += workspace->y[i];
-        //
-        //     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
-        //   }
-        //
-        // fprintf( stderr, "\nz: " );
-        // for( k = 0; k < N; ++k )
-        // fprintf( stderr, "%6.2f ", z[k] );
-
-        // fprintf( stderr, "\nx_bef: " );
-        // for( i = 0; i < N; ++i )
-        //   fprintf( stderr, "%6.2f ", x[i] );
-
-        // Vector_Add( x, 1, z, N );
-        for ( i = j - 1; i >= 0; i-- )
-        {
-            Vector_Add( x, workspace->y[i], z[i], N );
+            /* stopping condition */
+            if ( FABS( w[j] ) / bnorm <= tol )
+            {
+                break;
+            }
         }
 
-        /* stopping condition */
-        if ( FABS( w[j] ) / bnorm <= tol )
+#ifdef _OPENMP
+        #pragma omp single
+#endif
         {
-            break;
+            g_j = j;
+            g_itr = itr;
         }
     }
 
-    if ( itr >= control->cm_solver_max_iters )
+#ifdef _OPENMP
+    data->timing.cm_solver_orthog += t_ortho / control->num_threads;
+    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
+    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
+    data->timing.cm_solver_tri_solve += t_ts / control->num_threads;
+    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
+#else
+    data->timing.cm_solver_orthog += t_ortho;
+    data->timing.cm_solver_pre_app += t_pa;
+    data->timing.cm_solver_spmv += t_spmv;
+    data->timing.cm_solver_tri_solve += t_ts;
+    data->timing.cm_solver_vector_ops += t_vops;
+#endif
+
+    if ( g_itr >= control->cm_solver_max_iters )
     {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        return itr * (control->cm_solver_restart + 1) + j + 1;
+        fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr );
+        return g_itr * (control->cm_solver_restart + 1) + j + 1;
     }
 
-    return itr * (control->cm_solver_restart + 1) + j + 1;
+    return g_itr * (control->cm_solver_restart + 1) + g_j + 1;
 }
 
 
 /* Conjugate Gradient */
 int CG( const static_storage * const workspace, const control_params * const control,
-        const sparse_matrix * const H, const real * const b, const real tol,
-        real * const x, const int fresh_pre )
+        simulation_data * const data, const sparse_matrix * const H, const real * const b,
+        const real tol, real * const x, const int fresh_pre )
 {
-    int i, itr, N;
+    int i, g_itr, N;
     real tmp, alpha, beta, b_norm, r_norm;
     real *d, *r, *p, *z;
     real sig_old, sig_new;
+    real t_start, t_pa, t_spmv, t_vops;
 
     N = H->n;
     d = workspace->d;
@@ -3508,86 +3519,140 @@ int CG( const static_storage * const workspace, const control_params * const con
     z = workspace->p;
 
 #ifdef _OPENMP
-    #pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \
-    shared(itr, N, d, r, p, z)
+    #pragma omp parallel default(none) \
+    private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new, t_start) \
+    reduction(+: t_pa, t_spmv, t_vops) \
+    shared(g_itr, N, d, r, p, z)
 #endif
     {
+        t_pa = 0.0;
+        t_spmv = 0.0;
+        t_vops = 0.0;
+
+        t_start = Get_Time( );
         b_norm = Norm( b, N );
+        t_vops += Get_Timing_Info( t_start );
 
+        t_start = Get_Time( );
         Sparse_MatVec( H, x, d );
+        t_spmv += Get_Timing_Info( t_start );
+
+        t_start = Get_Time( );
         Vector_Sum( r, 1.0,  b, -1.0, d, N );
         r_norm = Norm( r, N );
+        t_vops += Get_Timing_Info( t_start );
 
+        t_start = Get_Time( );
         apply_preconditioner( workspace, control, r, z, fresh_pre );
-        Vector_Copy( p, z, N );
+        t_pa += Get_Timing_Info( t_start );
 
+        t_start = Get_Time( );
+        Vector_Copy( p, z, N );
         sig_new = Dot( r, z, N );
+        t_vops += Get_Timing_Info( t_start );
 
         for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
         {
+            t_start = Get_Time( );
             Sparse_MatVec( H, p, d );
+            t_spmv += Get_Timing_Info( t_start );
 
+            t_start = Get_Time( );
             tmp = Dot( d, p, N );
             alpha = sig_new / tmp;
             Vector_Add( x, alpha, p, N );
-
             Vector_Add( r, -alpha, d, N );
             r_norm = Norm( r, N );
+            t_vops += Get_Timing_Info( t_start );
 
+            t_start = Get_Time( );
             apply_preconditioner( workspace, control, r, z, FALSE );
+            t_pa += Get_Timing_Info( t_start );
 
+            t_start = Get_Time( );
             sig_old = sig_new;
             sig_new = Dot( r, z, N );
-
             beta = sig_new / sig_old;
             Vector_Sum( p, 1., z, beta, p, N );
+            t_vops += Get_Timing_Info( t_start );
         }
 
 #ifdef _OPENMP
         #pragma omp single
 #endif
-        itr = i;
+        g_itr = i;
     }
 
-    if ( itr >= control->cm_solver_max_iters )
+#ifdef _OPENMP
+    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
+    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
+    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
+#else
+    data->timing.cm_solver_pre_app += t_pa;
+    data->timing.cm_solver_spmv += t_spmv;
+    data->timing.cm_solver_vector_ops += t_vops;
+#endif
+
+    if ( g_itr >= control->cm_solver_max_iters )
     {
-        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", itr );
-        return itr;
+        fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", g_itr );
+        return g_itr;
     }
 
-    return itr;
+    return g_itr;
 }
 
 
 /* Steepest Descent */
 int SDM( const static_storage * const workspace, const control_params * const control,
-         const sparse_matrix * const H, const real * const b, const real tol,
-         real * const x, const int fresh_pre )
+         simulation_data * const data, const sparse_matrix * const H, const real * const b,
+         const real tol, real * const x, const int fresh_pre )
 {
-    int i, itr, N;
+    int i, g_itr, N;
     real tmp, alpha, b_norm;
     real sig;
+    real t_start, t_pa, t_spmv, t_vops;
 
     N = H->n;
 
 #ifdef _OPENMP
-    #pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \
-    shared(itr, N)
+    #pragma omp parallel default(none) \
+    private(i, tmp, alpha, b_norm, sig, t_start) \
+    reduction(+: t_pa, t_spmv, t_vops) \
+    shared(g_itr, N)
 #endif
     {
+        t_pa = 0.0;
+        t_spmv = 0.0;
+        t_vops = 0.0;
+
+        t_start = Get_Time( );
         b_norm = Norm( b, N );
+        t_vops += Get_Timing_Info( t_start );
 
+        t_start = Get_Time( );
         Sparse_MatVec( H, x, workspace->q );
+        t_spmv += Get_Timing_Info( t_start );
+
+        t_start = Get_Time( );
         Vector_Sum( workspace->r, 1.0,  b, -1.0, workspace->q, N );
+        t_vops += Get_Timing_Info( t_start );
 
+        t_start = Get_Time( );
         apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre );
+        t_pa += Get_Timing_Info( t_start );
 
+        t_start = Get_Time( );
         sig = Dot( workspace->r, workspace->d, N );
+        t_vops += Get_Timing_Info( t_start );
 
         for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i )
         {
+            t_start = Get_Time( );
             Sparse_MatVec( H, workspace->d, workspace->q );
+            t_spmv += Get_Timing_Info( t_start );
 
+            t_start = Get_Time( );
             sig = Dot( workspace->r, workspace->d, N );
 
             /* ensure each thread gets a local copy of
@@ -3603,23 +3668,36 @@ int SDM( const static_storage * const workspace, const control_params * const co
 
             Vector_Add( x, alpha, workspace->d, N );
             Vector_Add( workspace->r, -alpha, workspace->q, N );
+            t_vops += Get_Timing_Info( t_start );
 
+            t_start = Get_Time( );
             apply_preconditioner( workspace, control, workspace->r, workspace->d, FALSE );
+            t_pa += Get_Timing_Info( t_start );
         }
 
 #ifdef _OPENMP
         #pragma omp single
 #endif
-        itr = i;
+        g_itr = i;
     }
 
-    if ( itr >= control->cm_solver_max_iters  )
+#ifdef _OPENMP
+    data->timing.cm_solver_pre_app += t_pa / control->num_threads;
+    data->timing.cm_solver_spmv += t_spmv / control->num_threads;
+    data->timing.cm_solver_vector_ops += t_vops / control->num_threads;
+#else
+    data->timing.cm_solver_pre_app += t_pa;
+    data->timing.cm_solver_spmv += t_spmv;
+    data->timing.cm_solver_vector_ops += t_vops;
+#endif
+
+    if ( g_itr >= control->cm_solver_max_iters  )
     {
-        fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", itr );
-        return itr;
+        fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", g_itr );
+        return g_itr;
     }
 
-    return itr;
+    return g_itr;
 }
 
 
@@ -3639,11 +3717,7 @@ real condest( const sparse_matrix * const L, const sparse_matrix * const U )
 
     N = L->n;
 
-    if ( (e = (real*) malloc(sizeof(real) * N)) == NULL )
-    {
-        fprintf( stderr, "Not enough memory for condest. Terminating.\n" );
-        exit( INSUFFICIENT_MEMORY );
-    }
+    e = (real*) smalloc( sizeof(real) * N, "condest::e" );
 
     memset( e, 1., N * sizeof(real) );
 
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index 6caf7a1a052decd8032eba19e56e8010b0939b42..814701fe06c81a4fcd33530f9a16f70c12abdf5d 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -32,8 +32,8 @@ typedef enum
 
 void Sort_Matrix_Rows( sparse_matrix * const );
 
-void Setup_Sparsity_Pattern( const sparse_matrix * const, 
-        const real, sparse_matrix ** );
+void setup_sparse_approx_inverse( const sparse_matrix * const, sparse_matrix **, 
+        sparse_matrix **, sparse_matrix **, sparse_matrix **, const real );
 
 int Estimate_LU_Fill( const sparse_matrix * const, const real * const );
 
@@ -62,7 +62,7 @@ real ILUT_PAR( const sparse_matrix * const, const real *,
         const unsigned int, sparse_matrix * const, sparse_matrix * const );
 
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
-real Sparse_Approx_Inverse( const sparse_matrix * const, const sparse_matrix * const,
+real sparse_approx_inverse( const sparse_matrix * const, const sparse_matrix * const,
         sparse_matrix ** );
 #endif
 
@@ -94,11 +94,11 @@ int GMRES_HouseHolder( const static_storage * const, const control_params * cons
         const int );
 
 int CG( const static_storage * const, const control_params * const,
-        const sparse_matrix * const, const real * const, const real,
-        real * const, const int );
+        simulation_data * const, const sparse_matrix * const, const real * const,
+        const real, real * const, const int );
 
 int SDM( const static_storage * const, const control_params * const,
-        const sparse_matrix * const, const real * const, const real,
+        simulation_data * const, const sparse_matrix * const, const real * const, const real,
         real * const, const int );
 
 real condest( const sparse_matrix * const, const sparse_matrix * const );
diff --git a/sPuReMD/src/lookup.c b/sPuReMD/src/lookup.c
index 7ae8ab2e3ac083febd400ae34e18d5ddb2c87b80..7ec372f4392fea03e81219eb2780c64ab45b7e4c 100644
--- a/sPuReMD/src/lookup.c
+++ b/sPuReMD/src/lookup.c
@@ -36,7 +36,8 @@ void Make_Lookup_Table( real xmin, real xmax, int n,
     t->dx = (xmax - xmin) / (n - 1);
     t->inv_dx = 1.0 / t->dx;
     t->a = (n - 1) / (xmax - xmin);
-    t->y = (real*) malloc(n * sizeof(real));
+    t->y = (real*) smalloc( n * sizeof(real),
+            "Make_Lookup_Table::t->y" );
 
     for (i = 0; i < n; i++)
     {
@@ -83,11 +84,16 @@ void Natural_Cubic_Spline( const real *h, const real *f,
     real *a, *b, *c, *d, *v;
 
     /* allocate space for the linear system */
-    a = (real*) malloc( n * sizeof(real) );
-    b = (real*) malloc( n * sizeof(real) );
-    c = (real*) malloc( n * sizeof(real) );
-    d = (real*) malloc( n * sizeof(real) );
-    v = (real*) malloc( n * sizeof(real) );
+    a = (real*) smalloc( n * sizeof(real),
+           "Natural_Cubic_Spline::a" );
+    b = (real*) smalloc( n * sizeof(real),
+           "Natural_Cubic_Spline::b" );
+    c = (real*) smalloc( n * sizeof(real),
+           "Natural_Cubic_Spline::c" );
+    d = (real*) smalloc( n * sizeof(real),
+           "Natural_Cubic_Spline::d" );
+    v = (real*) smalloc( n * sizeof(real),
+           "Natural_Cubic_Spline::v" );
 
     /* build the linear system */
     a[0] = 0.0;
@@ -153,11 +159,16 @@ void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
     real *a, *b, *c, *d, *v;
 
     /* allocate space for the linear system */
-    a = (real*) malloc( n * sizeof(real) );
-    b = (real*) malloc( n * sizeof(real) );
-    c = (real*) malloc( n * sizeof(real) );
-    d = (real*) malloc( n * sizeof(real) );
-    v = (real*) malloc( n * sizeof(real) );
+    a = (real*) smalloc( n * sizeof(real),
+           "Complete_Cubic_Spline::a" );
+    b = (real*) smalloc( n * sizeof(real),
+           "Complete_Cubic_Spline::b" );
+    c = (real*) smalloc( n * sizeof(real),
+           "Complete_Cubic_Spline::c" );
+    d = (real*) smalloc( n * sizeof(real),
+           "Complete_Cubic_Spline::d" );
+    v = (real*) smalloc( n * sizeof(real),
+           "Complete_Cubic_Spline::v" );
 
     /* build the linear system */
     a[0] = 0.0;
@@ -260,19 +271,27 @@ void Make_LR_Lookup_Table( reax_system *system, control_params *control )
 
     num_atom_types = system->reaxprm.num_atom_types;
     dr = control->r_cut / control->tabulate;
-    h = (real*) calloc( (control->tabulate + 1), sizeof(real) );
-    fh = (real*) calloc( (control->tabulate + 1), sizeof(real) );
-    fvdw = (real*) calloc( (control->tabulate + 1), sizeof(real) );
-    fCEvd = (real*) calloc( (control->tabulate + 1), sizeof(real) );
-    fele = (real*) calloc( (control->tabulate + 1), sizeof(real) );
-    fCEclmb = (real*) calloc( (control->tabulate + 1), sizeof(real) );
+    h = (real*) scalloc( (control->tabulate + 1), sizeof(real),
+            "Make_LR_Lookup_Table::h" );
+    fh = (real*) scalloc( (control->tabulate + 1), sizeof(real),
+            "Make_LR_Lookup_Table::fh" );
+    fvdw = (real*) scalloc( (control->tabulate + 1), sizeof(real),
+            "Make_LR_Lookup_Table::fvdw" );
+    fCEvd = (real*) scalloc( (control->tabulate + 1), sizeof(real),
+            "Make_LR_Lookup_Table::fCEvd" );
+    fele = (real*) scalloc( (control->tabulate + 1), sizeof(real),
+            "Make_LR_Lookup_Table::fele" );
+    fCEclmb = (real*) scalloc( (control->tabulate + 1), sizeof(real),
+            "Make_LR_Lookup_Table::fCEclmb" );
 
     /* allocate Long-Range LookUp Table space based on
        number of atom types in the ffield file */
-    LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
+    LR = (LR_lookup_table**) smalloc( num_atom_types * sizeof(LR_lookup_table*),
+           "Make_LR_Lookup_Table::LR" );
     for ( i = 0; i < num_atom_types; ++i )
     {
-        LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
+        LR[i] = (LR_lookup_table*) smalloc( num_atom_types * sizeof(LR_lookup_table),
+                "Make_LR_Lookup_Table::LR[i]");
     }
 
     /* most atom types in ffield file will not exist in the current
@@ -303,17 +322,23 @@ void Make_LR_Lookup_Table( reax_system *system, control_params *control )
                     LR[i][j].dx = dr;
                     LR[i][j].inv_dx = control->tabulate / control->r_cut;
                     LR[i][j].y = (LR_data*)
-                        malloc( LR[i][j].n * sizeof(LR_data) );
+                        smalloc( LR[i][j].n * sizeof(LR_data),
+                              "Make_LR_Lookup_Table::LR[i][j].y" );
                     LR[i][j].H = (cubic_spline_coef*)
-                        malloc( LR[i][j].n * sizeof(cubic_spline_coef) );
+                        smalloc( LR[i][j].n * sizeof(cubic_spline_coef),
+                              "Make_LR_Lookup_Table::LR[i][j].H" );
                     LR[i][j].vdW = (cubic_spline_coef*)
-                        malloc( LR[i][j].n * sizeof(cubic_spline_coef) );
+                        smalloc( LR[i][j].n * sizeof(cubic_spline_coef),
+                              "Make_LR_Lookup_Table::LR[i][j].vdW" );
                     LR[i][j].CEvd = (cubic_spline_coef*)
-                        malloc( LR[i][j].n * sizeof(cubic_spline_coef) );
+                        smalloc( LR[i][j].n * sizeof(cubic_spline_coef),
+                              "Make_LR_Lookup_Table::LR[i][j].CEvd" );
                     LR[i][j].ele = (cubic_spline_coef*)
-                        malloc( LR[i][j].n * sizeof(cubic_spline_coef) );
+                        smalloc( LR[i][j].n * sizeof(cubic_spline_coef),
+                              "Make_LR_Lookup_Table::LR[i][j].ele" );
                     LR[i][j].CEclmb = (cubic_spline_coef*)
-                        malloc( LR[i][j].n * sizeof(cubic_spline_coef) );
+                        smalloc( LR[i][j].n * sizeof(cubic_spline_coef),
+                              "Make_LR_Lookup_Table::LR[i][j].CEclmb" );
 
                     for ( r = 1; r <= control->tabulate; ++r )
                     {
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index 51f24d401b1337a9d8059df252d51864114c7106..023542b9408131135470014fd0770c180687e16a 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -890,8 +890,10 @@ typedef struct
 
     /* charge method storage */
     sparse_matrix *H;
+    sparse_matrix *H_full;
     sparse_matrix *H_sp;
     sparse_matrix *H_spar_patt;
+    sparse_matrix *H_spar_patt_full;
     sparse_matrix *H_app_inv;
     sparse_matrix *L;
     sparse_matrix *U;
diff --git a/sPuReMD/src/restart.c b/sPuReMD/src/restart.c
index 0f7b1c5bdcbacdb134f6ab4acd944fb8f5ef109d..82474a8b96bbf39b0abf24f3e2f89f754f21e98e 100644
--- a/sPuReMD/src/restart.c
+++ b/sPuReMD/src/restart.c
@@ -20,7 +20,9 @@
   ----------------------------------------------------------------------*/
 
 #include "restart.h"
+
 #include "box.h"
+#include "tool_box.h"
 #include "vector.h"
 
 
@@ -101,17 +103,25 @@ void Read_Binary_Restart( char *fname, reax_system *system,
 #endif
 
     /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+    system->atoms = (reax_atom*) scalloc( system->N, sizeof(reax_atom),
+            "Read_Binary_Restart::system->atoms" );
 
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
+    workspace->map_serials = (int*) scalloc( MAX_ATOM_ID, sizeof(int),
+            "Read_Binary_Restart::workspace->map_serials" );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
         workspace->map_serials[i] = -1;
 
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int**) calloc( system->N, sizeof(int*) );
+    workspace->orig_id = (int*) scalloc( system->N, sizeof(int),
+            "Read_Binary_Restart::workspace->orig_id" );
+    workspace->restricted  = (int*) scalloc( system->N, sizeof(int),
+            "Read_Binary_Restart::workspace->restricted" );
+    workspace->restricted_list = (int**) scalloc( system->N, sizeof(int*),
+            "Read_Binary_Restart::workspace->restricted_list" );
     for ( i = 0; i < system->N; ++i )
-        workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
+    {
+        workspace->restricted_list[i] = (int*) scalloc( MAX_RESTRICT, sizeof(int),
+                "Read_Binary_Restart::workspace->restricted_list[i]" );
+    }
 
     for ( i = 0; i < system->N; ++i )
     {
@@ -207,17 +217,27 @@ void Read_ASCII_Restart( char *fname, reax_system *system,
 #endif
 
     /* memory allocations for atoms, atom maps, bond restrictions */
-    system->atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+    system->atoms = (reax_atom*) scalloc( system->N, sizeof(reax_atom),
+            "Read_ASCII_Restart::system->atoms" );
 
-    workspace->map_serials = (int*) calloc( MAX_ATOM_ID, sizeof(int) );
+    workspace->map_serials = (int*) scalloc( MAX_ATOM_ID, sizeof(int),
+            "Read_ASCII_Restart::workspace->map_serials" );
     for ( i = 0; i < MAX_ATOM_ID; ++i )
+    {
         workspace->map_serials[i] = -1;
+    }
 
-    workspace->orig_id = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted  = (int*) calloc( system->N, sizeof(int) );
-    workspace->restricted_list = (int**) calloc( system->N, sizeof(int*) );
+    workspace->orig_id = (int*) scalloc( system->N, sizeof(int),
+            "Read_ASCII_Restart::workspace->orig_id" );
+    workspace->restricted  = (int*) scalloc( system->N, sizeof(int),
+            "Read_ASCII_Restart::workspace->restricted" );
+    workspace->restricted_list = (int**) scalloc( system->N, sizeof(int*),
+            "Read_ASCII_Restart::workspace->restricted_list" );
     for ( i = 0; i < system->N; ++i )
-        workspace->restricted_list[i] = (int*) calloc( MAX_RESTRICT, sizeof(int) );
+    {
+        workspace->restricted_list[i] = (int*) scalloc( MAX_RESTRICT, sizeof(int),
+                "Read_ASCII_Restart::workspace->restricted_list[i]" );
+    }
 
     for ( i = 0; i < system->N; ++i )
     {
diff --git a/sPuReMD/src/spuremd.c b/sPuReMD/src/spuremd.c
index 2e791a306ed29c668573aa52e7bd078f9be181f4..82958abe94f9e1a7432e4752a1fee9b823b1aabc 100644
--- a/sPuReMD/src/spuremd.c
+++ b/sPuReMD/src/spuremd.c
@@ -154,7 +154,8 @@ void static Read_System( char * const geo_file,
 int Setup( char ** args, reax_system * const system, control_params * const control,
         simulation_data * const data )
 {
-    lists = (reax_list*) malloc( sizeof(reax_list) * LIST_N );
+    lists = (reax_list*) smalloc( sizeof(reax_list) * LIST_N,
+           "Setup::lists" );
 
     Read_System( args[0], args[1], args[2], system, control,
             data, &workspace, &out_control );
diff --git a/sPuReMD/src/tool_box.c b/sPuReMD/src/tool_box.c
index c37ccc3032d419e69035402fab82b8da2df16821..6b59d19446c9cbaaed604731d0cafdb3e163779e 100644
--- a/sPuReMD/src/tool_box.c
+++ b/sPuReMD/src/tool_box.c
@@ -352,34 +352,22 @@ char *Get_Atom_Name( reax_system *system, int i )
 }
 
 
-int Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
+void Allocate_Tokenizer_Space( char **line, char **backup, char ***tokens )
 {
     int i;
 
-    if ( (*line = (char*) malloc( sizeof(char) * MAX_LINE )) == NULL )
-    {
-        return FAILURE;
-    }
-
-    if ( (*backup = (char*) malloc( sizeof(char) * MAX_LINE )) == NULL )
-    {
-        return FAILURE;
-    }
-
-    if ( (*tokens = (char**) malloc( sizeof(char*) * MAX_TOKENS )) == NULL )
-    {
-        return FAILURE;
-    }
+    *line = (char*) smalloc( sizeof(char) * MAX_LINE,
+            "Allocate_Tokenizer_Space::*line" );
+    *backup = (char*) smalloc( sizeof(char) * MAX_LINE,
+            "Allocate_Tokenizer_Space::*backup" );
+    *tokens = (char**) smalloc( sizeof(char*) * MAX_TOKENS,
+            "Allocate_Tokenizer_Space::*tokens" );
 
     for ( i = 0; i < MAX_TOKENS; i++ )
     {
-        if ( ((*tokens)[i] = (char*) malloc(sizeof(char) * MAX_TOKEN_LEN)) == NULL )
-        {
-            return FAILURE;
-        }
+        (*tokens)[i] = (char*) smalloc(sizeof(char) * MAX_TOKEN_LEN,
+                "Allocate_Tokenizer_Space::(*tokens)[i]" );
     }
-
-    return SUCCESS;
 }
 
 
diff --git a/sPuReMD/src/tool_box.h b/sPuReMD/src/tool_box.h
index ba103f9aa71281b4ae62ba41979240b5098ecf60..d1b15f64bd2a8adb7ee3a0ee7c40d40113b81f48 100644
--- a/sPuReMD/src/tool_box.h
+++ b/sPuReMD/src/tool_box.h
@@ -72,7 +72,7 @@ char *Get_Element( reax_system*, int );
 
 char *Get_Atom_Name( reax_system*, int );
 
-int Allocate_Tokenizer_Space( char**, char**, char*** );
+void Allocate_Tokenizer_Space( char**, char**, char*** );
 
 void Deallocate_Tokenizer_Space( char **, char **, char *** );