diff --git a/configure.ac b/configure.ac index 7081dd3468cb9413c44e33fbd444b6d76f62970e..2488af52296535e6ec37ca3c3d687e7f905d78a7 100644 --- a/configure.ac +++ b/configure.ac @@ -81,10 +81,11 @@ AC_ARG_ENABLE([debug], ) if test "x$DEBUG" = "xyes" then - #TODO: fix exporting to subdirs - # See: http://stackoverflow.com/questions/34124337/changing-flags-in-configure-ac-vs-caching-with-subprojects - CFLAGS="-g3 -O0 -D_GLIBCXX_DEBUG ${CFLAGS}" +# #TODO: fix exporting to subdirs +# # See: http://stackoverflow.com/questions/34124337/changing-flags-in-configure-ac-vs-caching-with-subprojects +# CFLAGS="-g3 -O0 -D_GLIBCXX_DEBUG ${CFLAGS}" export BUILD_DEBUG="true" + export DEBUG_FLAGS="-g2 -O0 -D_GLIBCXX_DEBUG" fi # gprof flags. @@ -93,35 +94,22 @@ AC_ARG_ENABLE([gprof], [enable support for profiling with gprof @<:@default: no@:>@])], [case "${enableval}" in gnu | yes) - gprof_enabled="yes" - gprof_compiler="gnu compiler" - gprof_flags="-pg" + # GNU + export BUILD_GPROF="true" + export GPROF_FLAGS="-pg" ;; intel) - gprof_enabled="yes" - gprof_compiler="intel compiler" - gprof_flags="-p" + # Intel + export BUILD_GPROF="true" + export GPROF_FLAGS="-p" ;; no) - gprof_enabled="no" - gprof_compiler="none" - gprof_flags="" ;; *) AC_MSG_ERROR([bad value ${enableval} for --enable-gprof (only yes, gnu or intel are possible)]) ;; esac], - [gprof_enabled="no" - gprof_compiler="none" - gprof_flags="" - ] + [] ) -if test "x$gprof_enabled" = "xyes" -then - #TODO: fix exporting to subdirs - CPPFLAGS="${CPPFLAGS} ${gprof_flags}" - LDFLAGS="${LDFLAGS} ${gprof_flags}" - export BUILD_PROF="yes" -fi # Timing measurements. AC_ARG_ENABLE([timing], diff --git a/sPuReMD/configure.ac b/sPuReMD/configure.ac index 2f0f762639136ddbbbbb5ce01b0582cd5d801dc3..578c00d4de6ee81ee70318e79c0dc266129d2b19 100644 --- a/sPuReMD/configure.ac +++ b/sPuReMD/configure.ac @@ -47,10 +47,16 @@ AC_CHECK_FUNCS([gettimeofday memset]) # Check for compiler vendor AX_COMPILER_VENDOR if test "x$ax_cv_c_compiler_vendor" = "xgnu"; then - CFLAGS="$CFLAGS -Wall -O3 -funroll-loops -fstrict-aliasing" + if test "x$BUILD_DEBUG" = "x"; then + CFLAGS="$CFLAGS -Wall -O3 -funroll-loops -fstrict-aliasing" + else + CFLAGS="$CFLAGS -Wall" + fi fi if test "x$ax_cv_c_compiler_vendor" = "xintel"; then - CFLAGS="$CFLAGS -fast" + if test "x$BUILD_DEBUG" = "x"; then + CFLAGS="$CFLAGS -fast" + fi fi # Check for OpenMP support. @@ -113,6 +119,16 @@ then AC_DEFINE([HAVE_SUPERLU_MT], [1], [Define to 1 if you have SuperLU_MT support enabled.]) fi +if test "x$BUILD_DEBUG" != "x" +then + CFLAGS="${CFLAGS} ${DEBUG_FLAGS}" +fi + +if test "x$BUILD_GPROF" != "x" +then + CFLAGS="${CFLAGS} ${GPROF_FLAGS}" +fi + AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c index 5eb6d707eaa533eb0bcf4c284a785fd9fc83771c..026a3ae1a84e4912958737161818aea77080da44 100644 --- a/sPuReMD/src/QEq.c +++ b/sPuReMD/src/QEq.c @@ -705,20 +705,21 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, // fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); /* U = L^T (Cholesky factorization) */ - for ( i = 1; i <= U->n; ++i ) - { - Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; - } - for ( i = 0; i < L->n; ++i ) - { - for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) - { - j = L->j[pj]; - U->j[Utop[j]] = i; - U->val[Utop[j]] = L->val[pj]; - Utop[j]++; - } - } + Transpose( L, U ); +// for ( i = 1; i <= U->n; ++i ) +// { +// Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; +// } +// for ( i = 0; i < L->n; ++i ) +// { +// for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) +// { +// j = L->j[pj]; +// U->j[Utop[j]] = i; +// U->val[Utop[j]] = L->val[pj]; +// Utop[j]++; +// } +// } // fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); @@ -1385,22 +1386,24 @@ static void Init_MatVec( const reax_system * const system, const control_params time = Get_Time( ); if ( control->pre_comp_type != DIAG_PC ) { + Sort_Matrix_Rows( workspace->H ); + if ( control->qeq_domain_sparsify_enabled == TRUE ) + { + Sort_Matrix_Rows( workspace->H_sp ); + } + if ( control->pre_app_type == TRI_SOLVE_GC_PA ) { if ( control->qeq_domain_sparsify_enabled == TRUE ) { - setup_graph_coloring( workspace->H_sp ); + Hptr = setup_graph_coloring( workspace->H_sp ); } else { - setup_graph_coloring( workspace->H ); + Hptr = setup_graph_coloring( workspace->H ); } - } - Sort_Matrix_Rows( workspace->H ); - if ( control->qeq_domain_sparsify_enabled == TRUE ) - { - Sort_Matrix_Rows( workspace->H_sp ); + Sort_Matrix_Rows( Hptr ); } } data->timing.QEq_sort_mat_rows += Get_Timing_Info( time ); @@ -1425,9 +1428,11 @@ static void Init_MatVec( const reax_system * const system, const control_params case ICHOLT_PC: Calculate_Droptol( Hptr, workspace->droptol, control->pre_comp_droptol ); + #if defined(DEBUG_FOCUS) fprintf( stderr, "drop tolerances calculated\n" ); #endif + if ( workspace->L == NULL ) { fillin = Estimate_LU_Fill( Hptr, workspace->droptol ); @@ -1437,6 +1442,7 @@ static void Init_MatVec( const reax_system * const system, const control_params fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); exit( INSUFFICIENT_MEMORY ); } + #if defined(DEBUG) fprintf( stderr, "fillin = %d\n", fillin ); fprintf( stderr, "allocated memory: L = U = %ldMB\n", diff --git a/sPuReMD/src/grid.c b/sPuReMD/src/grid.c index bd678981a85f4cbc475ccc93ac1e497d49ac13c0..c45c1a2214d7589b3e4c1a647ebde1b9d3c7e56d 100644 --- a/sPuReMD/src/grid.c +++ b/sPuReMD/src/grid.c @@ -484,7 +484,7 @@ void Bin_Atoms( reax_system* system, static_storage *workspace ) } -inline void reax_atom_Copy( reax_atom *dest, reax_atom *src ) +static inline void reax_atom_Copy( reax_atom *dest, reax_atom *src ) { dest->type = src->type; rvec_Copy( dest->x, src->x ); diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index 6ea274d9a07698143a9df17682a06c8826fdceae..1759b41c76969a8ec64047d60c744f6a5771abbb 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -64,6 +64,7 @@ real *y_p = NULL; real *x_p = NULL; unsigned int *mapping = NULL; sparse_matrix *H_full; +sparse_matrix *H_p; /* global to make OpenMP shared (jacobi_iter) */ real *Dinv_b = NULL, *rp = NULL, *rp2 = NULL, *rp3 = NULL; @@ -149,12 +150,9 @@ static void Sparse_MatVec( const sparse_matrix * const A, /* Transpose A and copy into A^T * - * A: symmetric, lower triangular (half-format), stored in CSR - * A_t: symmetric, upper triangular (half-format), stored in CSR - * - * Assumptions: - * A has non-zero diagonals - * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ + * A: stored in CSR + * A_t: stored in CSR + */ void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t ) { unsigned int i, j, pj, *A_t_top; @@ -167,7 +165,7 @@ void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t ) memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) ); - /* count nonzeros in each column of A^T */ + /* count nonzeros in each column of A^T, store one row greater (see next loop) */ for ( i = 0; i < A->n; ++i ) { for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) @@ -200,11 +198,8 @@ void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t ) /* Transpose A in-place * - * A: symmetric, lower triangular (half-format), stored in CSR - * - * Assumptions: - * A has non-zero diagonals - * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ + * A: stored in CSR + */ void Transpose_I( sparse_matrix * const A ) { sparse_matrix * A_t; @@ -880,7 +875,7 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri ) * H: symmetric, lower triangular portion only, stored in CSR format; * H is permuted in-place */ -void setup_graph_coloring( sparse_matrix * const H ) +sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) { if ( color == NULL ) { @@ -893,6 +888,7 @@ void setup_graph_coloring( sparse_matrix * const H ) (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || (permuted_row_col_inv = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || (y_p = (real*) malloc(sizeof(real) * H->n)) == NULL || + (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) || (Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) ) { fprintf( stderr, "not enough memory for graph coloring. terminating.\n" ); @@ -904,8 +900,13 @@ void setup_graph_coloring( sparse_matrix * const H ) graph_coloring( H_full, LOWER ); sort_colors( H_full->n, LOWER ); + + memcpy( H_p->start, H->start, sizeof(int) * (H->n + 1) ); + memcpy( H_p->j, H->j, sizeof(int) * (H->start[H->n]) ); + memcpy( H_p->val, H->val, sizeof(real) * (H->start[H->n]) ); + permute_matrix( H_p, LOWER ); - permute_matrix( H, LOWER ); + return H_p; } @@ -921,8 +922,8 @@ void setup_graph_coloring( sparse_matrix * const H ) * Note: Newmann series arises from series expansion of the inverse of * the coefficient matrix in the triangular system */ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, - const real * const b, real * const x, const TRIANGULARITY tri, - const unsigned int maxiter ) + const real * const b, real * const x, const TRIANGULARITY tri, const + unsigned int maxiter ) { unsigned int i, k, si = 0, ei = 0, iter; diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h index c07494edcd737ccc97780d369296a07667556a6f..fe2d644cae630be6414944989b52adae8a6e1d61 100644 --- a/sPuReMD/src/lin_alg.h +++ b/sPuReMD/src/lin_alg.h @@ -28,7 +28,7 @@ void Transpose( const sparse_matrix const *, sparse_matrix const * ); void Transpose_I( sparse_matrix * const ); -void setup_graph_coloring( sparse_matrix * const ); +sparse_matrix * setup_graph_coloring( sparse_matrix * const ); int GMRES( const static_storage * const, const control_params * const, simulation_data * const, const sparse_matrix * const, diff --git a/sPuReMD/src/neighbors.c b/sPuReMD/src/neighbors.c index e01b6a7934f1ec5e693cb880dd07c6035c49070b..fc318ef2d7d7022fc97df29548017299cd1f5e63 100644 --- a/sPuReMD/src/neighbors.c +++ b/sPuReMD/src/neighbors.c @@ -29,7 +29,7 @@ -inline real DistSqr_to_CP( rvec cp, rvec x ) +static inline real DistSqr_to_CP( rvec cp, rvec x ) { int i; real d_sqr = 0;