diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c
index b4387244785fb56c1869d1f5937ba0c247ad6cde..c08019ff0d4f6d6b000782a8a59163d6ee81fea6 100644
--- a/sPuReMD/src/GMRES.c
+++ b/sPuReMD/src/GMRES.c
@@ -74,7 +74,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
         #pragma omp barrier
 
 #endif
-        #pragma omp for schedule(guided)
+        #pragma omp for schedule(static)
         for ( i = 0; i < n; ++i )
         {
             si = A->start[i];
@@ -101,7 +101,7 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 #endif
         }
 #ifdef _OPENMP
-        #pragma omp for schedule(guided)
+        #pragma omp for schedule(static)
         for ( i = 0; i < n; ++i )
         {
             for ( j = 0; j < omp_get_num_threads(); ++j )
@@ -120,8 +120,8 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y,
 {
     unsigned int i;
 
-    #pragma omp parallel for schedule(guided) \
-    default(none) private(i)
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i)
     for ( i = 0; i < N; ++i )
     {
         x[i] = y[i] * Hdia_inv[i];
@@ -198,6 +198,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
     static int levels_L = 1, levels_U = 1;
     static unsigned int *row_levels_L = NULL, *level_rows_L = NULL, *level_rows_cnt_L = NULL;
     static unsigned int *row_levels_U = NULL, *level_rows_U = NULL, *level_rows_cnt_U = NULL;
+    static unsigned int *top = NULL;
     unsigned int *row_levels, *level_rows, *level_rows_cnt;
 
     if ( tri == LOWER )
@@ -218,8 +219,17 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
     if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL )
     {
         if ( (row_levels = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
-                || (level_rows = (unsigned int*) malloc(MAX_ROWS_PER_LEVEL * (size_t)LU->n * sizeof(unsigned int))) == NULL
-                || (level_rows_cnt = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL )
+                || (level_rows = (unsigned int*) malloc((size_t)LU->n * sizeof(unsigned int))) == NULL
+                || (level_rows_cnt = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
+        {
+            fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
+            exit( INSUFFICIENT_MEMORY );
+        }
+    }
+
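+    /* top[] holds a running per-level insertion offset, used below when
+     * bucketing rows into level_rows by dependency level */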
+    if ( top == NULL )
+    {
+        if ( (top = (unsigned int*) malloc((size_t)(LU->n + 1) * sizeof(unsigned int))) == NULL )
         {
             fprintf( stderr, "Not enough space for triangular solve via level scheduling. Terminating...\n" );
             exit( INSUFFICIENT_MEMORY );
@@ -229,52 +239,57 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
     /* find levels (row dependencies in substitutions) */
     if ( find_levels )
     {
-        memset( row_levels, 0, LU->n * sizeof( unsigned int) );
-        memset( level_rows_cnt, 0, LU->n * sizeof( unsigned int) );
+        memset( row_levels, 0, LU->n * sizeof(unsigned int) );
+        memset( level_rows_cnt, 0, (LU->n + 1) * sizeof(unsigned int) );
+        memset( top, 0, (LU->n + 1) * sizeof(unsigned int) );
 
         if ( tri == LOWER )
         {
             for ( i = 0; i < LU->n; ++i )
             {
-                local_level = 0;
+                local_level = 1;
                 for ( pj = LU->start[i]; pj < LU->start[i + 1] - 1; ++pj )
                 {
                     local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                 }
 
-                levels = MAX( levels, local_level + 1 );
+                levels = MAX( levels, local_level );
                 row_levels[i] = local_level;
-                level_rows[local_level * MAX_ROWS_PER_LEVEL + level_rows_cnt[local_level]] = i;
                 ++level_rows_cnt[local_level];
-                if ( level_rows_cnt[local_level] >= MAX_ROWS_PER_LEVEL )
-                {
-                    fprintf( stderr, "Not enough space for triangular solve via level scheduling" );
-                    fprintf( stderr, " (MAX_ROWS_PER_LEVEL). Terminating...\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
             }
+
+            printf( "levels(L): %d\n", levels );
+            printf( "NNZ(L): %d\n", LU->start[LU->n] );
         }
         else
         {
             for ( i = LU->n - 1; i >= 0; --i )
             {
-                local_level = 0;
+                local_level = 1;
                 for ( pj = LU->start[i] + 1; pj < LU->start[i + 1]; ++pj )
                 {
                     local_level = MAX( local_level, row_levels[LU->j[pj]] + 1 );
                 }
 
-                levels = MAX( levels, local_level + 1 );
+                levels = MAX( levels, local_level );
                 row_levels[i] = local_level;
-                level_rows[local_level * MAX_ROWS_PER_LEVEL + level_rows_cnt[local_level]] = i;
                 ++level_rows_cnt[local_level];
-                if ( level_rows_cnt[local_level] >= MAX_ROWS_PER_LEVEL )
-                {
-                    fprintf( stderr, "Not enough space for triangular solve via level scheduling" );
-                    fprintf( stderr, " (MAX_ROWS_PER_LEVEL). Terminating...\n" );
-                    exit( INSUFFICIENT_MEMORY );
-                }
             }
+
+            printf( "levels(U): %d\n", levels );
+            printf( "NNZ(U): %d\n", LU->start[LU->n] );
+        }
+
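+        /* counting sort: convert the per-level row counts into cumulative offsets,
+         * so rows of level l occupy level_rows[level_rows_cnt[l - 1] .. level_rows_cnt[l]) */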
+        for ( i = 1; i < levels + 1; ++i )
+        {
+            level_rows_cnt[i] += level_rows_cnt[i - 1];
+            top[i] = level_rows_cnt[i];
+        }
+
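+        /* scatter rows into level_rows in level order, using top[] as the write cursor */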
+        for ( i = 0; i < LU->n; ++i )
+        {
+            level_rows[top[row_levels[i] - 1]] = i;
+            ++top[row_levels[i] - 1];
         }
     }
 
@@ -286,9 +301,9 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
             for ( i = 0; i < levels; ++i )
             {
                 #pragma omp for schedule(static)
-                for ( j = 0; j < level_rows_cnt[i]; ++j )
+                for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
                 {
-                    local_row = level_rows[i * MAX_ROWS_PER_LEVEL + j];
+                    local_row = level_rows[j];
                     x[local_row] = y[local_row];
                     for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj )
                     {
@@ -304,9 +319,9 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real *
             for ( i = 0; i < levels; ++i )
             {
                 #pragma omp for schedule(static)
-                for ( j = 0; j < level_rows_cnt[i]; ++j )
+                for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j )
                 {
-                    local_row = level_rows[i * MAX_ROWS_PER_LEVEL + j];
+                    local_row = level_rows[j];
                     x[local_row] = y[local_row];
                     for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj )
                     {
@@ -411,7 +426,7 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
         #pragma omp barrier
 
         /* precompute and cache, as invariant in loop below */
-        #pragma omp for schedule(guided)
+        #pragma omp for schedule(static)
         for ( i = 0; i < R->n; ++i )
         {
             Dinv_b[i] = Dinv[i] * b[i];
@@ -432,7 +447,7 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
 
             #pragma omp barrier
 
-            #pragma omp for schedule(guided)
+            #pragma omp for schedule(static)
             for ( i = 0; i < R->n; ++i )
             {
                 if (tri == LOWER)
@@ -464,7 +479,7 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
 
             #pragma omp barrier
 
-            #pragma omp for schedule(guided)
+            #pragma omp for schedule(static)
             for ( i = 0; i < R->n; ++i )
             {
 #ifdef _OPENMP
diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c
index 5c5b47791b965846fddded0fa24e82b8c09ccb9c..514330bdb342caa65b3446f10cd463cabca75b43 100644
--- a/sPuReMD/src/QEq.c
+++ b/sPuReMD/src/QEq.c
@@ -198,7 +198,7 @@ static void Calculate_Droptol( const sparse_matrix * const A, real * const dropt
         #pragma omp barrier
 
         /* calculate square of the norm of each row */
-        #pragma omp for schedule(guided)
+        #pragma omp for schedule(static)
         for ( i = 0; i < A->n; ++i )
         {
             for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k )
@@ -226,7 +226,7 @@ static void Calculate_Droptol( const sparse_matrix * const A, real * const dropt
         #pragma omp barrier
 
 #ifdef _OPENMP
-        #pragma omp for schedule(guided)
+        #pragma omp for schedule(static)
         for ( i = 0; i < A->n; ++i )
         {
             droptol[i] = 0.0;
@@ -241,7 +241,7 @@ static void Calculate_Droptol( const sparse_matrix * const A, real * const dropt
 
         /* calculate local droptol for each row */
         //fprintf( stderr, "droptol: " );
-        #pragma omp for schedule(guided)
+        #pragma omp for schedule(static)
         for ( i = 0; i < A->n; ++i )
         {
             //fprintf( stderr, "%f-->", droptol[i] );
@@ -261,8 +261,8 @@ static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const d
 
     fillin = 0;
 
-    #pragma omp parallel for schedule(guided) \
-    default(none) private(i, j, pj, val) reduction(+: fillin)
+    #pragma omp parallel for schedule(static) \
+        default(none) private(i, j, pj, val) reduction(+: fillin)
     for ( i = 0; i < A->n; ++i )
     {
         for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj )
@@ -587,7 +587,7 @@ static real diag_pre_comp( const reax_system * const system, real * const Hdia_i
 
     start = Get_Time( );
 
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) private(i)
     for ( i = 0; i < system->N; ++i )
     {
@@ -811,7 +811,7 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     for ( i = 0; i < sweeps; ++i )
     {
         /* for each nonzero */
-        #pragma omp parallel for schedule(guided) \
+        #pragma omp parallel for schedule(static) \
         default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y)
         for ( j = 0; j < A->start[A->n]; ++j )
         {
@@ -971,7 +971,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
         exit( INSUFFICIENT_MEMORY );
     }
 
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) shared(D, D_inv) private(i)
     for ( i = 0; i < A->n; ++i )
     {
@@ -982,7 +982,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     /* to get convergence, A must have unit diagonal, so apply
      * transformation DAD, where D = D(1./sqrt(D(A))) */
     memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) shared(DAD, D) private(i, pj)
     for ( i = 0; i < A->n; ++i )
     {
@@ -1008,8 +1008,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     memcpy( U->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
 
     /* L has unit diagonal, by convention */
-    #pragma omp parallel for schedule(guided) \
-    default(none) private(i)
+    #pragma omp parallel for schedule(static) default(none) private(i)
     for ( i = 0; i < A->n; ++i )
     {
         L->val[L->start[i + 1] - 1] = 1.0;
@@ -1018,8 +1017,8 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     for ( i = 0; i < sweeps; ++i )
     {
         /* for each nonzero in L */
-        #pragma omp parallel for schedule(guided) \
-        default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
+        #pragma omp parallel for schedule(static) \
+            default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
         for ( j = 0; j < DAD->start[DAD->n]; ++j )
         {
             sum = ZERO;
@@ -1068,7 +1067,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
             }
         }
 
-        #pragma omp parallel for schedule(guided) \
+        #pragma omp parallel for schedule(static) \
         default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum)
         for ( j = 0; j < DAD->start[DAD->n]; ++j )
         {
@@ -1119,7 +1118,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps,
     /* apply inverse transformation:
      * since DAD \approx LU, then
      * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) shared(DAD, D_inv) private(i, pj)
     for ( i = 0; i < DAD->n; ++i )
     {
@@ -1181,7 +1180,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
         exit( INSUFFICIENT_MEMORY );
     }
 
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) shared(D, D_inv) private(i)
     for ( i = 0; i < A->n; ++i )
     {
@@ -1192,7 +1191,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
     /* to get convergence, A must have unit diagonal, so apply
      * transformation DAD, where D = D(1./sqrt(D(A))) */
     memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) );
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) shared(DAD, D) private(i, pj)
     for ( i = 0; i < A->n; ++i )
     {
@@ -1218,7 +1217,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
     memcpy( U_temp->val, DAD->val, sizeof(real) * (DAD->start[DAD->n]) );
 
     /* L has unit diagonal, by convention */
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) private(i) shared(L_temp)
     for ( i = 0; i < A->n; ++i )
     {
@@ -1228,7 +1227,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
     for ( i = 0; i < sweeps; ++i )
     {
         /* for each nonzero in L */
-        #pragma omp parallel for schedule(guided) \
+        #pragma omp parallel for schedule(static) \
         default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
         for ( j = 0; j < DAD->start[DAD->n]; ++j )
         {
@@ -1278,7 +1277,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
             }
         }
 
-        #pragma omp parallel for schedule(guided) \
+        #pragma omp parallel for schedule(static) \
         default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum)
         for ( j = 0; j < DAD->start[DAD->n]; ++j )
         {
@@ -1329,7 +1328,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol,
     /* apply inverse transformation:
      * since DAD \approx LU, then
      * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */
-    #pragma omp parallel for schedule(guided) \
+    #pragma omp parallel for schedule(static) \
     default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj)
     for ( i = 0; i < DAD->n; ++i )
     {
diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c
index 9d2aea4cf107a48d00113ebe1383f615f637f75a..7821462a05ac2c7224a1a86a96db26d72f15e920 100644
--- a/sPuReMD/src/init_md.c
+++ b/sPuReMD/src/init_md.c
@@ -538,7 +538,7 @@ void Init_Out_Controls(reax_system *system, control_params *control,
         strcpy( temp, control->sim_name );
         strcat( temp, ".log" );
         out_control->log = fopen( temp, "w" );
-        fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n",
+        fprintf( out_control->log, "%-6s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
                  "step", "total", "neighbors", "init", "bonded",
                  "nonbonded", "QEq", "QEq Sort", "S iters", "Pre Comp", "Pre App",
                  "S spmv", "S vec ops", "S orthog", "S tsolve" );
diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h
index b225c508a54f14d86c59a64122f52c2f77babbda..9be7cf757d54471195b5afa892a08d2d6bb8ed53 100644
--- a/sPuReMD/src/mytypes.h
+++ b/sPuReMD/src/mytypes.h
@@ -81,17 +81,17 @@
 #define C_ele          332.06371
 //#define K_B         503.398008   // kcal/mol/K
 #define K_B             0.831687   // amu A^2 / ps^2 / K
-#define F_CONV          1e6 / 48.88821291 / 48.88821291   // --> amu A / ps^2
+#define F_CONV          (1e6 / 48.88821291 / 48.88821291)   // --> amu A / ps^2
 #define E_CONV          0.002391   // amu A^2 / ps^2 --> kcal/mol
 #define EV_to_KCALpMOL 14.400000   // ElectronVolt --> KCAL per MOLe
 #define KCALpMOL_to_EV 23.060549   // 23.020000//KCAL per MOLe --> ElectronVolt
 #define ECxA_to_DEBYE   4.803204      // elem. charge * angstrom -> debye conv
 #define CAL_to_JOULES   4.184000      // CALories --> JOULES
-#define JOULES_to_CAL   1/4.184000    // JOULES --> CALories
+#define JOULES_to_CAL   (1/4.184000)    // JOULES --> CALories
 #define AMU_to_GRAM     1.6605e-24
 #define ANG_to_CM       1.0e-8
 #define AVOGNR          6.0221367e23
-#define P_CONV          1.0e-24 * AVOGNR * JOULES_to_CAL
+#define P_CONV          (1.0e-24 * AVOGNR * JOULES_to_CAL)
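+/* note: the conversion macros above are parenthesized so that uses such as
+ * `x / F_CONV` or `x * P_CONV` expand with the intended precedence */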
 
 #define MAX_STR             1024
 #define MAX_LINE            1024
@@ -115,7 +115,6 @@
 
 #define MAX_ITR             10
 #define RESTART             50
-#define MAX_ROWS_PER_LEVEL  10000 /* triangular solve using level scheduling */
 
 #define ZERO           0.000000000000000e+00
 #define ALMOST_ZERO    1e-10
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 647b3701a391e75f8623a08fcfe1e6359e86f3ce..579ba6290a5c9812cd8b1342f96f959f597d6709 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -616,7 +616,7 @@ void Output_Results( reax_system *system, control_params *control,
             f_update = 1;
         else f_update = out_control->energy_update_freq;
 
-        fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.6f%10.6f%10.2f%10.6f%10.6f%10.6f%10.6f%10.6f%10.6f\n",
+        fprintf( out_control->log, "%6d %10.2f %10.2f %10.2f %10.2f %10.2f %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f %10.4f %10.4f %10.4f\n",
                  data->step, t_elapsed / f_update,
                  data->timing.nbrs / f_update,
                  data->timing.init_forces / f_update,
diff --git a/sPuReMD/src/vector.c b/sPuReMD/src/vector.c
index f784ee11e73d4e2ce0b59fd00e67af6d025f903c..1525d94d45428b6eb19e4bf4b5e4ca1e6d97cfee 100644
--- a/sPuReMD/src/vector.c
+++ b/sPuReMD/src/vector.c
@@ -26,7 +26,7 @@ inline int Vector_isZero( const real * const v, const unsigned int k )
 {
     unsigned int i, ret = TRUE;
 
-    #pragma omp parallel for default(none) private(i) reduction(&&: ret) schedule(guided)
+    #pragma omp parallel for default(none) private(i) reduction(&&: ret) schedule(static)
     for ( i = 0; i < k; ++i )
     {
         if ( fabs( v[i] ) > ALMOST_ZERO )
@@ -65,7 +65,7 @@ inline void Vector_Scale( real * const dest, const real c, const real * const v,
 {
     unsigned int i;
 
-    #pragma omp parallel for default(none) private(i) schedule(guided)
+    #pragma omp parallel for default(none) private(i) schedule(static)
     for ( i = 0; i < k; ++i )
     {
         dest[i] = c * v[i];
@@ -78,7 +78,7 @@ inline void Vector_Sum( real * const dest, const real c, const real * const v, c
 {
     unsigned int i;
 
-    #pragma omp parallel for default(none) private(i) schedule(guided)
+    #pragma omp parallel for default(none) private(i) schedule(static)
     for ( i = 0; i < k; ++i )
     {
         dest[i] = c * v[i] + d * y[i];
@@ -90,7 +90,7 @@ inline void Vector_Add( real * const dest, const real c, const real * const v, c
 {
     unsigned int i;
 
-    #pragma omp parallel for default(none) private(i) schedule(guided)
+    #pragma omp parallel for default(none) private(i) schedule(static)
     for ( i = 0; i < k; ++i )
     {
         dest[i] += c * v[i];
@@ -117,7 +117,7 @@ inline real Dot( const real * const v1, const real * const v2, const unsigned in
     real ret = ZERO;
     unsigned int i;
 
-    #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(guided)
+    #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(static)
     for ( i = 0; i < k; ++i )
     {
         ret +=  v1[i] * v2[i];
@@ -132,7 +132,7 @@ inline real Norm( const real * const v1, const unsigned int k )
     real ret = ZERO;
     unsigned int i;
 
-    #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(guided)
+    #pragma omp parallel for default(none) private(i) reduction(+: ret) schedule(static)
     for ( i = 0; i < k; ++i )
     {
         ret +=  SQR( v1[i] );
diff --git a/tools/run_sim.py b/tools/run_sim.py
index 842beacfbf08a38c74065d97b0d638f87fcae19e..8bcf19bd89e14865d51da49dfdaf316e61e8e1c0 100644
--- a/tools/run_sim.py
+++ b/tools/run_sim.py
@@ -115,10 +115,7 @@ class TestCase():
                     print(stderr)
 
             else:
-                #TODO: fix
-                start = 0.
-                stop = 0.
-                self._process_result(fout, stop - start, param_dict)
+                self._process_result(fout, param_dict)
 
         fout.close()
         if path.exists(temp_file):
@@ -126,7 +123,8 @@ class TestCase():
         if path.exists(temp_dir):
             rmdir(temp_dir)
 
-    def _process_result(self, fout, time, param):
+    def _process_result(self, fout, param):
+        time = 0.
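+        # total wall time, parsed from the 'total:' line of the log below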
         qeq = 0.
         iters = 0.
         pre_comp = 0.
@@ -143,25 +141,35 @@ class TestCase():
                 line = line.split()
                 try:
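+                    # column indices follow the sPuReMD log header:
+                    # QEq(6), QEq Sort(7), S iters(8), Pre Comp(9), Pre App(10), S spmv(11)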
                     qeq = qeq + float(line[6])
-                    iters = iters + float(line[7])
-                    pre_comp = pre_comp + float(line[8])
-                    pre_app = pre_app + float(line[9])
-                    spmv = spmv + float(line[10])
+                    iters = iters + float(line[8])
+                    pre_comp = pre_comp + float(line[9])
+                    pre_app = pre_app + float(line[10])
+                    spmv = spmv + float(line[11])
                     cnt = cnt + 1
                 except Exception:
                     pass
+                if line and line[0] == 'total:':
+                    try:
+                        time = float(line[1])
+                    except Exception:
+                        pass
             cnt = cnt - 1
-            qeq = qeq / cnt
-            iters = iters / cnt
-            pre_comp = pre_comp / cnt
-            pre_app = pre_app / cnt
-            spmv = spmv / cnt
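+            # average per-step timings only when at least one data row was parsed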
+            if cnt > 0:
+                qeq = qeq / cnt
+                iters = iters / cnt
+                pre_comp = pre_comp / cnt
+                pre_app = pre_app / cnt
+                spmv = spmv / cnt
 
-        fout.write(self.__result_body_fmt.format(path.basename(self.__geo_file).split('.')[0], 
-            param['nsteps'], param['qeq_solver_type'], param['qeq_solver_q_err'],
- 	    param['pre_comp_type'], param['pre_comp_droptol'], param['pre_comp_sweeps'],
-	    param['pre_app_type'], param['pre_app_jacobi_iters'], pre_comp, pre_app, iters, spmv,
-            qeq, param['threads'], time))
+        if cnt == int(param['nsteps']):
+            fout.write(self.__result_body_fmt.format(path.basename(self.__geo_file).split('.')[0],
+                param['nsteps'], param['qeq_solver_type'], param['qeq_solver_q_err'],
+                param['pre_comp_type'], param['pre_comp_droptol'], param['pre_comp_sweeps'],
+                param['pre_app_type'], param['pre_app_jacobi_iters'], pre_comp, pre_app, iters, spmv,
+                qeq, param['threads'], time))
+        else:
+            print('**WARNING: number of parsed steps does not match nsteps in {0}, results not recorded...'.format(log_file))
         fout.flush()
 
 
@@ -193,7 +201,7 @@ if __name__ == '__main__':
     data_dir = path.join(base_dir, 'data/benchmarks')
 
     header_fmt_str = '{:15}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:5}|{:10}|{:10}|{:10}|{:10}|{:10}|{:3}|{:10}\n'
-    header_str = ['Data Set', 'Steps', 'Q Tol', 'QType', 'PreCT', 'PreCD', 'PreCS', 'PreAT', 'PreAJ', 'Pre Comp',
+    header_str = ['Data Set', 'Steps', 'QType', 'Q Tol', 'PreCT', 'PreCD', 'PreCS', 'PreAT', 'PreAJ', 'Pre Comp',
             'Pre App', 'Iters', 'SpMV', 'QEq', 'Thd', 'Time (s)']
     body_fmt_str = '{:15} {:5} {:5} {:5} {:5} {:5} {:5} {:5} {:5} {:10.3f} {:10.3f} {:10.3f} {:10.3f} {:10.3f} {:3} {:10.3f}\n'