diff --git a/configure.ac b/configure.ac
index 7081dd3468cb9413c44e33fbd444b6d76f62970e..2488af52296535e6ec37ca3c3d687e7f905d78a7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,10 +81,11 @@ AC_ARG_ENABLE([debug],
 	      )
 if test "x$DEBUG" = "xyes"
 then
-	#TODO: fix exporting to subdirs
-	# See: http://stackoverflow.com/questions/34124337/changing-flags-in-configure-ac-vs-caching-with-subprojects
-	CFLAGS="-g3 -O0 -D_GLIBCXX_DEBUG ${CFLAGS}"
+#	#TODO: fix exporting to subdirs
+#	# See: http://stackoverflow.com/questions/34124337/changing-flags-in-configure-ac-vs-caching-with-subprojects
+#	CFLAGS="-g3 -O0 -D_GLIBCXX_DEBUG ${CFLAGS}"
 	export BUILD_DEBUG="true"
+	export DEBUG_FLAGS="-g2 -O0 -D_GLIBCXX_DEBUG"
 fi
 
 # gprof flags.
@@ -93,35 +94,22 @@ AC_ARG_ENABLE([gprof],
 		[enable support for profiling with gprof @<:@default: no@:>@])],
 	[case "${enableval}" in
 		gnu | yes) 
-			gprof_enabled="yes" 
-			gprof_compiler="gnu compiler"
-			gprof_flags="-pg"
+			# GNU
+			export BUILD_GPROF="true"
+			export GPROF_FLAGS="-pg"
 			;;
 		intel)
-			gprof_enabled="yes"
-			gprof_compiler="intel compiler"
-			gprof_flags="-p"
+			# Intel
+			export BUILD_GPROF="true"
+			export GPROF_FLAGS="-p"
 			;;
 		no)
-			gprof_enabled="no" 
-			gprof_compiler="none"
-			gprof_flags=""
 			;;
 		*)
 			AC_MSG_ERROR([bad value ${enableval} for --enable-gprof (only yes, gnu or intel are possible)]) ;;
 	esac],
-	[gprof_enabled="no"
-	 gprof_compiler="none"
-	 gprof_flags=""
-	]	      
+	[]	      
 )
-if test "x$gprof_enabled" = "xyes"
-then
-	#TODO: fix exporting to subdirs
-	CPPFLAGS="${CPPFLAGS} ${gprof_flags}"
-	LDFLAGS="${LDFLAGS} ${gprof_flags}"
-	export BUILD_PROF="yes"
-fi
 
 # Timing measurements.
 AC_ARG_ENABLE([timing],
diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac
index 2f0f762639136ddbbbbb5ce01b0582cd5d801dc3..578c00d4de6ee81ee70318e79c0dc266129d2b19 100644
--- a/sPuReMD/configure.ac
+++ b/sPuReMD/configure.ac
@@ -47,10 +47,16 @@ AC_CHECK_FUNCS([gettimeofday memset])
 # Check for compiler vendor
 AX_COMPILER_VENDOR
 if test "x$ax_cv_c_compiler_vendor" = "xgnu"; then
-	CFLAGS="$CFLAGS -Wall -O3 -funroll-loops -fstrict-aliasing"
+	if test "x$BUILD_DEBUG" = "x"; then
+		CFLAGS="$CFLAGS -Wall -O3 -funroll-loops -fstrict-aliasing"
+	else
+		CFLAGS="$CFLAGS -Wall"
+	fi
 fi
 if test "x$ax_cv_c_compiler_vendor" = "xintel"; then
-	CFLAGS="$CFLAGS -fast"
+	if test "x$BUILD_DEBUG" = "x"; then
+		CFLAGS="$CFLAGS -fast"
+	fi
 fi
 
 # Check for OpenMP support.
@@ -113,6 +119,16 @@ then
 	AC_DEFINE([HAVE_SUPERLU_MT], [1], [Define to 1 if you have SuperLU_MT support enabled.])
 fi
 
+if test "x$BUILD_DEBUG" != "x"
+then
+	CFLAGS="${CFLAGS} ${DEBUG_FLAGS}"
+fi
+
+if test "x$BUILD_GPROF" != "x"
+then
+	CFLAGS="${CFLAGS} ${GPROF_FLAGS}"
+fi
+
 AC_CONFIG_FILES([Makefile])
 
 AC_OUTPUT
diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c
index 5eb6d707eaa533eb0bcf4c284a785fd9fc83771c..026a3ae1a84e4912958737161818aea77080da44 100644
--- a/sPuReMD/src/QEq.c
+++ b/sPuReMD/src/QEq.c
@@ -705,20 +705,21 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol,
 //    fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
 
     /* U = L^T (Cholesky factorization) */
-    for ( i = 1; i <= U->n; ++i )
-    {
-        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
-    }
-    for ( i = 0; i < L->n; ++i )
-    {
-        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
-        {
-            j = L->j[pj];
-            U->j[Utop[j]] = i;
-            U->val[Utop[j]] = L->val[pj];
-            Utop[j]++;
-        }
-    }
+    Transpose( L, U );
+//    for ( i = 1; i <= U->n; ++i )
+//    {
+//        Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1;
+//    }
+//    for ( i = 0; i < L->n; ++i )
+//    {
+//        for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj )
+//        {
+//            j = L->j[pj];
+//            U->j[Utop[j]] = i;
+//            U->val[Utop[j]] = L->val[pj];
+//            Utop[j]++;
+//        }
+//    }
 
 //    fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
 
@@ -1385,22 +1386,24 @@ static void Init_MatVec( const reax_system * const system, const control_params
         time = Get_Time( );
         if ( control->pre_comp_type != DIAG_PC )
         {
+            Sort_Matrix_Rows( workspace->H );
+            if ( control->qeq_domain_sparsify_enabled == TRUE )
+            {
+                Sort_Matrix_Rows( workspace->H_sp );
+            }
+
             if ( control->pre_app_type == TRI_SOLVE_GC_PA )
             {
                 if ( control->qeq_domain_sparsify_enabled == TRUE )
                 {
-                    setup_graph_coloring( workspace->H_sp );
+                    Hptr = setup_graph_coloring( workspace->H_sp );
                 }
                 else
                 {
-                    setup_graph_coloring( workspace->H );
+                    Hptr = setup_graph_coloring( workspace->H );
                 }
-            }
 
-            Sort_Matrix_Rows( workspace->H );
-            if ( control->qeq_domain_sparsify_enabled == TRUE )
-            {
-                Sort_Matrix_Rows( workspace->H_sp );
+                Sort_Matrix_Rows( Hptr );
             }
         }
         data->timing.QEq_sort_mat_rows += Get_Timing_Info( time );
@@ -1425,9 +1428,11 @@ static void Init_MatVec( const reax_system * const system, const control_params
 
         case ICHOLT_PC:
             Calculate_Droptol( Hptr, workspace->droptol, control->pre_comp_droptol );
+
 #if defined(DEBUG_FOCUS)
             fprintf( stderr, "drop tolerances calculated\n" );
 #endif
+
             if ( workspace->L == NULL )
             {
                 fillin = Estimate_LU_Fill( Hptr, workspace->droptol );
@@ -1437,6 +1442,7 @@ static void Init_MatVec( const reax_system * const system, const control_params
                     fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" );
                     exit( INSUFFICIENT_MEMORY );
                 }
+
 #if defined(DEBUG)
                 fprintf( stderr, "fillin = %d\n", fillin );
                 fprintf( stderr, "allocated memory: L = U = %ldMB\n",
diff --git a/sPuReMD/src/grid.c b/sPuReMD/src/grid.c
index bd678981a85f4cbc475ccc93ac1e497d49ac13c0..c45c1a2214d7589b3e4c1a647ebde1b9d3c7e56d 100644
--- a/sPuReMD/src/grid.c
+++ b/sPuReMD/src/grid.c
@@ -484,7 +484,7 @@ void Bin_Atoms( reax_system* system, static_storage *workspace )
 }
 
 
-inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
+static inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 {
     dest->type = src->type;
     rvec_Copy( dest->x, src->x );
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 6ea274d9a07698143a9df17682a06c8826fdceae..1759b41c76969a8ec64047d60c744f6a5771abbb 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -64,6 +64,7 @@ real *y_p = NULL;
 real *x_p = NULL;
 unsigned int *mapping = NULL;
 sparse_matrix *H_full;
+sparse_matrix *H_p;
 /* global to make OpenMP shared (jacobi_iter) */
 real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL;
 
@@ -149,12 +150,9 @@ static void Sparse_MatVec( const sparse_matrix * const A,
 
 /* Transpose A and copy into A^T
  *
- * A: symmetric, lower triangular (half-format), stored in CSR
- * A_t: symmetric, upper triangular (half-format), stored in CSR
- *
- * Assumptions:
- *   A has non-zero diagonals
- *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
+ * A: stored in CSR
+ * A_t: stored in CSR
+ */
 void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t )
 {
     unsigned int i, j, pj, *A_t_top;
@@ -167,7 +165,7 @@ void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t )
 
     memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) );
 
-    /* count nonzeros in each column of A^T */
+    /* count nonzeros in each column of A^T, store one row greater (see next loop) */
     for ( i = 0; i < A->n; ++i )
     {
         for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj )
@@ -200,11 +198,8 @@ void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t )
 
 /* Transpose A in-place
  *
- * A: symmetric, lower triangular (half-format), stored in CSR
- *
- * Assumptions:
- *   A has non-zero diagonals
- *   Each row of A has at least one non-zero (i.e., no rows with all zeros) */
+ * A: stored in CSR
+ */
 void Transpose_I( sparse_matrix * const A )
 {
     sparse_matrix * A_t;
@@ -880,7 +875,7 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
  * H: symmetric, lower triangular portion only, stored in CSR format;
  *  H is permuted in-place
  */
-void setup_graph_coloring( sparse_matrix * const H )
+sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
 {
     if ( color == NULL )
     {
@@ -893,6 +888,7 @@ void setup_graph_coloring( sparse_matrix * const H )
                 (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (permuted_row_col_inv = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (y_p = (real*) malloc(sizeof(real) * H->n)) == NULL ||
+                (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) ||
                 (Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) )
         {
             fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
@@ -904,8 +900,13 @@ void setup_graph_coloring( sparse_matrix * const H )
 
     graph_coloring( H_full, LOWER );
     sort_colors( H_full->n, LOWER );
+    
+    memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) );
+    memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) );
+    memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) );
+    permute_matrix( H_p, LOWER );
 
-    permute_matrix( H, LOWER );
+    return H_p;
 }
 
 
@@ -921,8 +922,8 @@ void setup_graph_coloring( sparse_matrix * const H )
  * Note: Newmann series arises from series expansion of the inverse of
  * the coefficient matrix in the triangular system */
 static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv,
-                         const real * const b, real * const x, const TRIANGULARITY tri,
-                         const unsigned int maxiter )
+        const real * const b, real * const x, const TRIANGULARITY tri, const
+        unsigned int maxiter )
 {
     unsigned int i, k, si = 0, ei = 0, iter;
 
diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h
index c07494edcd737ccc97780d369296a07667556a6f..fe2d644cae630be6414944989b52adae8a6e1d61 100644
--- a/sPuReMD/src/lin_alg.h
+++ b/sPuReMD/src/lin_alg.h
@@ -28,7 +28,7 @@
 void Transpose( const sparse_matrix const *, sparse_matrix const * );
 void Transpose_I( sparse_matrix * const );
 
-void setup_graph_coloring( sparse_matrix * const );
+sparse_matrix * setup_graph_coloring( sparse_matrix * const );
 
 int GMRES( const static_storage * const, const control_params * const,
         simulation_data * const, const sparse_matrix * const,
diff --git a/sPuReMD/src/neighbors.c b/sPuReMD/src/neighbors.c
index e01b6a7934f1ec5e693cb880dd07c6035c49070b..fc318ef2d7d7022fc97df29548017299cd1f5e63 100644
--- a/sPuReMD/src/neighbors.c
+++ b/sPuReMD/src/neighbors.c
@@ -29,7 +29,7 @@
 
 
 
-inline real DistSqr_to_CP( rvec cp, rvec x )
+static inline real DistSqr_to_CP( rvec cp, rvec x )
 {
     int  i;
     real d_sqr = 0;