diff --git a/sPuReMD/src/GMRES.c b/sPuReMD/src/GMRES.c index 2cb7c042bc187c8713641d00d64666e3423a9151..b4387244785fb56c1869d1f5937ba0c247ad6cde 100644 --- a/sPuReMD/src/GMRES.c +++ b/sPuReMD/src/GMRES.c @@ -115,95 +115,6 @@ static void Sparse_MatVec( const sparse_matrix * const A, } -/* sparse matrix-vector product Gx=b (for Jacobi iteration), - * followed by vector addition of D^{-1}b, - * where G = (I - D^{-1}R) (G not explicitly computed and stored) - * R: strictly triangular matrix (diagonals not used) - * tri: triangularity of A (lower/upper) - * D^{-1} (D_inv): inverse of the diagonal of R - * x: vector - * b: vector (result) - * D^{-1}b (Dinv_b): precomputed vector-vector product */ -static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R, - const TRIANGULARITY tri, const real * const Dinv, - const real * const x, real * const b, const real * const Dinv_b) -{ - int i, k, si = 0, ei = 0; -#ifdef _OPENMP - static real *b_local; - unsigned int tid; -#endif - - Vector_MakeZero( b, R->n ); - - #pragma omp parallel \ - default(none) shared(b_local) private(si, ei, i, k, tid) - { -#ifdef _OPENMP - tid = omp_get_thread_num(); - - #pragma omp master - { - /* keep b_local for program duration to avoid allocate/free - * overhead per Sparse_MatVec call*/ - if ( b_local == NULL ) - { - if ( (b_local = (real*) malloc( omp_get_num_threads() * R->n * sizeof(real))) == NULL ) - { - exit( INSUFFICIENT_MEMORY ); - } - } - - Vector_MakeZero( b_local, omp_get_num_threads() * R->n ); - } - #pragma omp barrier - -#endif - #pragma omp for schedule(guided) - for ( i = 0; i < R->n; ++i ) - { - if (tri == LOWER) - { - si = R->start[i]; - ei = R->start[i + 1] - 1; - } - else if (tri == UPPER) - { - - si = R->start[i] + 1; - ei = R->start[i + 1]; - } - - for ( k = si; k < ei; ++k ) - { -#ifdef _OPENMP - b_local[tid * R->n + i] += R->val[k] * x[R->j[k]]; -#else - b[i] += R->val[k] * x[R->j[k]]; -#endif - } -#ifdef _OPENMP - b_local[tid * R->n + i] *= -Dinv[i]; -#else - b[i] *= -Dinv[i]; -#endif - } -#ifdef _OPENMP - #pragma omp for schedule(guided) - for ( i = 0; i < R->n; ++i ) - { - for ( k = 0; k < omp_get_num_threads(); ++k ) - { - b[i] += b_local[k * R->n + i]; - } - - b[i] += Dinv_b[i]; - } -#endif - } -} - - static void diag_pre_app( const real * const Hdia_inv, const real * const y, real * const x, const int N ) { @@ -335,7 +246,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real * row_levels[i] = local_level; level_rows[local_level * MAX_ROWS_PER_LEVEL + level_rows_cnt[local_level]] = i; ++level_rows_cnt[local_level]; - if( level_rows_cnt[local_level] >= MAX_ROWS_PER_LEVEL ) + if ( level_rows_cnt[local_level] >= MAX_ROWS_PER_LEVEL ) { fprintf( stderr, "Not enough space for triangular solve via level scheduling" ); fprintf( stderr, " (MAX_ROWS_PER_LEVEL). Terminating...\n" ); @@ -357,7 +268,7 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real * row_levels[i] = local_level; level_rows[local_level * MAX_ROWS_PER_LEVEL + level_rows_cnt[local_level]] = i; ++level_rows_cnt[local_level]; - if( level_rows_cnt[local_level] >= MAX_ROWS_PER_LEVEL ) + if ( level_rows_cnt[local_level] >= MAX_ROWS_PER_LEVEL ) { fprintf( stderr, "Not enough space for triangular solve via level scheduling" ); fprintf( stderr, " (MAX_ROWS_PER_LEVEL). Terminating...\n" ); @@ -368,41 +279,42 @@ static void tri_solve_level_sched( const sparse_matrix * const LU, const real * } /* perform substitutions by level */ - if ( tri == LOWER ) + #pragma omp parallel default(none) private(i, j, pj, local_row) shared(levels, level_rows_cnt, level_rows) { - for ( i = 0; i < levels; ++i ) + if ( tri == LOWER ) { - #pragma omp parallel for schedule(guided) \ - default(none) private(j, pj, local_row) shared(stderr, i, level_rows_cnt, level_rows) - for ( j = 0; j < level_rows_cnt[i]; ++j ) + for ( i = 0; i < levels; ++i ) { - local_row = level_rows[i * MAX_ROWS_PER_LEVEL + j]; - x[local_row] = y[local_row]; - for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj ) + #pragma omp for schedule(static) + for ( j = 0; j < level_rows_cnt[i]; ++j ) { - x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + local_row = level_rows[i * MAX_ROWS_PER_LEVEL + j]; + x[local_row] = y[local_row]; + for ( pj = LU->start[local_row]; pj < LU->start[local_row + 1] - 1; ++pj ) + { + x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + } + x[local_row] /= LU->val[pj]; } - x[local_row] /= LU->val[pj]; } } - } - else - { - for ( i = 0; i < levels; ++i ) + else { - #pragma omp parallel for schedule(guided) \ - default(none) private(j, pj, local_row) shared(i, level_rows_cnt, level_rows) - for ( j = 0; j < level_rows_cnt[i]; ++j ) + for ( i = 0; i < levels; ++i ) { - local_row = level_rows[i * MAX_ROWS_PER_LEVEL + j]; - x[local_row] = y[local_row]; - for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj ) + #pragma omp for schedule(static) + for ( j = 0; j < level_rows_cnt[i]; ++j ) { - x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + local_row = level_rows[i * MAX_ROWS_PER_LEVEL + j]; + x[local_row] = y[local_row]; + for ( pj = LU->start[local_row] + 1; pj < LU->start[local_row + 1]; ++pj ) + { + x[local_row] -= LU->val[pj] * x[LU->j[pj]]; + } + x[local_row] /= LU->val[LU->start[local_row]]; } - x[local_row] /= LU->val[LU->start[local_row]]; } } } @@ -510,7 +422,6 @@ static void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, do { // x_{k+1} = G*x_{k} + Dinv*b; - //Sparse_MatVec_Vector_Add( (sparse_matrix*)R, tri, Dinv, rp, rp2, Dinv_b ); #pragma omp master { Vector_MakeZero( rp2, R->n ); @@ -711,19 +622,25 @@ void apply_preconditioner( const static_storage * const workspace, const control /* generalized minimual residual iterative solver for sparse linear systems */ int GMRES( const static_storage * const workspace, const control_params * const control, - const sparse_matrix * const H, const real * const b, real tol, real * const x, - const FILE * const fout, real * const time, real * const spmv_time, const int fresh_pre ) + simulation_data * const data, const sparse_matrix * const H, + const real * const b, real tol, real * const x, + const FILE * const fout, const int fresh_pre ) { int i, j, k, itr, N, si; real cc, tmp1, tmp2, temp, bnorm, time_start; N = H->n; + + time_start = Get_Time( ); bnorm = Norm( b, N ); + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); if ( control->pre_comp_type == DIAG_PC ) { /* apply preconditioner to RHS */ + time_start = Get_Time( ); apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre ); + data->timing.pre_app += Get_Timing_Info( time_start ); } /* GMRES outer-loop */ @@ -732,34 +649,40 @@ int GMRES( const static_storage * const workspace, const control_params * const /* calculate r0 */ time_start = Get_Time( ); Sparse_MatVec( H, x, workspace->b_prm ); - *spmv_time += Get_Timing_Info( time_start ); + data->timing.solver_spmv += Get_Timing_Info( time_start ); if ( control->pre_comp_type == DIAG_PC ) { time_start = Get_Time( ); apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, fresh_pre ); - *time += Get_Timing_Info( time_start ); + data->timing.pre_app += Get_Timing_Info( time_start ); } if ( control->pre_comp_type == DIAG_PC ) { + time_start = Get_Time( ); Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } else { + time_start = Get_Time( ); Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } if ( control->pre_comp_type != DIAG_PC ) { time_start = Get_Time( ); apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0], - itr == 0 ? fresh_pre : 0 ); - *time += Get_Timing_Info( time_start ); + itr == 0 ? fresh_pre : 0 ); + data->timing.pre_app += Get_Timing_Info( time_start ); } + time_start = Get_Time( ); workspace->g[0] = Norm( workspace->v[0], N ); Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N ); + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); /* GMRES inner-loop */ @@ -768,25 +691,28 @@ int GMRES( const static_storage * const workspace, const control_params * const /* matvec */ time_start = Get_Time( ); Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); - *spmv_time += Get_Timing_Info( time_start ); + data->timing.solver_spmv += Get_Timing_Info( time_start ); time_start = Get_Time( ); apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], 0 ); - *time += Get_Timing_Info( time_start ); + data->timing.pre_app += Get_Timing_Info( time_start ); if ( control->pre_comp_type == DIAG_PC ) { /* apply modified Gram-Schmidt to orthogonalize the new residual */ + time_start = Get_Time( ); for ( i = 0; i <= j; i++ ) { workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N ); Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); } + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } else { //TODO: investigate correctness of not explicitly orthogonalizing first few vectors /* apply modified Gram-Schmidt to orthogonalize the new residual */ + time_start = Get_Time( ); for ( i = 0; i < j - 1; i++ ) { workspace->h[i][j] = 0; @@ -797,15 +723,19 @@ int GMRES( const static_storage * const workspace, const control_params * const workspace->h[i][j] = Dot( workspace->v[i], workspace->v[j + 1], N ); Vector_Add( workspace->v[j + 1], -workspace->h[i][j], workspace->v[i], N ); } + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); } + time_start = Get_Time( ); workspace->h[j + 1][j] = Norm( workspace->v[j + 1], N ); Vector_Scale( workspace->v[j + 1], - 1. / workspace->h[j + 1][j], workspace->v[j + 1], N ); + 1. / workspace->h[j + 1][j], workspace->v[j + 1], N ); + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); #if defined(DEBUG) fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); #endif + time_start = Get_Time( ); if ( control->pre_comp_type == DIAG_PC ) { /* Givens rotations on the upper-Hessenberg matrix to make it U */ @@ -855,6 +785,7 @@ int GMRES( const static_storage * const workspace, const control_params * const tmp2 = -workspace->hs[j] * workspace->g[j]; workspace->g[j] = tmp1; workspace->g[j + 1] = tmp2; + data->timing.solver_orthog += Get_Timing_Info( time_start ); //fprintf( stderr, "h: " ); //for( i = 0; i <= j+1; ++i ) @@ -865,6 +796,7 @@ int GMRES( const static_storage * const workspace, const control_params * const /* TODO: solve using Jacobi iteration? */ /* solve Hy = g: H is now upper-triangular, do back-substitution */ + time_start = Get_Time( ); for ( i = j - 1; i >= 0; i-- ) { temp = workspace->g[i]; @@ -875,8 +807,10 @@ int GMRES( const static_storage * const workspace, const control_params * const workspace->y[i] = temp / workspace->h[i][i]; } + data->timing.solver_tri_solve += Get_Timing_Info( time_start ); /* update x = x_0 + Vy */ + time_start = Get_Time( ); Vector_MakeZero( workspace->p, N ); for ( i = 0; i < j; i++ ) { @@ -884,6 +818,7 @@ int GMRES( const static_storage * const workspace, const control_params * const } Vector_Add( x, 1., workspace->p, N ); + data->timing.solver_vector_ops += Get_Timing_Info( time_start ); /* stopping condition */ if ( fabs(workspace->g[j]) / bnorm <= tol ) @@ -902,7 +837,7 @@ int GMRES( const static_storage * const workspace, const control_params * const // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", // itr, j, fabs( workspace->g[j] ) / bnorm ); - // data->timing.matvec += itr * RESTART + j; + // data->timing.solver_iters += itr * RESTART + j; if ( itr >= MAX_ITR ) { @@ -916,8 +851,9 @@ int GMRES( const static_storage * const workspace, const control_params * const int GMRES_HouseHolder( const static_storage * const workspace, const control_params * const control, - const sparse_matrix * const H, const real * const b, real tol, real * const x, - const FILE * const fout, real * const time, real * const spmv_time, const int fresh_pre ) + simulation_data * const data, const sparse_matrix * const H, + const real * const b, real tol, real * const x, + const FILE * const fout, const int fresh_pre ) { int i, j, k, itr, N; real cc, tmp1, tmp2, temp, bnorm; diff --git a/sPuReMD/src/GMRES.h b/sPuReMD/src/GMRES.h index 4c2d44ad901761a9433b90db28857b0aacee7790..77fa3e6abb5d1578d5d65dbebea935f00e64b801 100644 --- a/sPuReMD/src/GMRES.h +++ b/sPuReMD/src/GMRES.h @@ -25,12 +25,14 @@ #include "mytypes.h" int GMRES( const static_storage * const, const control_params * const, - const sparse_matrix * const, const real * const, real, real * const, - const FILE * const, real * const, real * const, const int ); + simulation_data * const, const sparse_matrix * const, + const real * const, real, real * const, + const FILE * const, const int ); int GMRES_HouseHolder( const static_storage * const, const control_params * const, - const sparse_matrix * const, const real * const, real, real * const, - const FILE * const, real * const, real * const, const int ); + simulation_data * const, const sparse_matrix * const, + const real * const, real, real * const, + const FILE * const, const int ); int CG( static_storage*, sparse_matrix*, real*, real, real*, FILE* ); diff --git a/sPuReMD/src/QEq.c b/sPuReMD/src/QEq.c index 5a757f9ac2008f65d715931c36d0df4c1eb0c947..5c5b47791b965846fddded0fa24e82b8c09ccb9c 100644 --- a/sPuReMD/src/QEq.c +++ b/sPuReMD/src/QEq.c @@ -42,45 +42,49 @@ static void Sort_Matrix_Rows( sparse_matrix * const A ) int i, j, k, si, ei, *temp_j; real *temp_val; - if ( ( temp_j = (int*) malloc( A->n * sizeof(int)) ) == NULL - || ( temp_val = (real*) malloc( A->n * sizeof(real)) ) == NULL ) + #pragma omp parallel default(none) private(i, j, k, si, ei, temp_j, temp_val) shared(stderr) { - fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); - } + if ( ( temp_j = (int*) malloc( A->n * sizeof(int)) ) == NULL + || ( temp_val = (real*) malloc( A->n * sizeof(real)) ) == NULL ) + { + fprintf( stderr, "Not enough space for matrix row sort. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); + } - /* sort each row of A using column indices */ - for ( i = 0; i < A->n; ++i ) - { - si = A->start[i]; - ei = A->start[i + 1]; - memcpy( temp_j, A->j + si, sizeof(int) * (ei - si) ); - memcpy( temp_val, A->val + si, sizeof(real) * (ei - si) ); + /* sort each row of A using column indices */ + #pragma omp for + for ( i = 0; i < A->n; ++i ) + { + si = A->start[i]; + ei = A->start[i + 1]; + memcpy( temp_j, A->j + si, sizeof(int) * (ei - si) ); + memcpy( temp_val, A->val + si, sizeof(real) * (ei - si) ); - //TODO: consider implementing single custom one-pass sort instead of using qsort + manual sort - /* polymorphic sort in standard C library using column indices */ - qsort( temp_j, ei - si, sizeof(int), compare_matrix_entry ); + //TODO: consider implementing single custom one-pass sort instead of using qsort + manual sort + /* polymorphic sort in standard C library using column indices */ + qsort( temp_j, ei - si, sizeof(int), compare_matrix_entry ); - /* manually sort vals */ - for ( j = 0; j < (ei - si); ++j ) - { - for ( k = 0; k < (ei - si); ++k ) + /* manually sort vals */ + for ( j = 0; j < (ei - si); ++j ) { - if ( A->j[si + j] == temp_j[k] ) + for ( k = 0; k < (ei - si); ++k ) { - A->val[si + k] = temp_val[j]; - break; - } + if ( A->j[si + j] == temp_j[k] ) + { + A->val[si + k] = temp_val[j]; + break; + } + } } + + /* copy sorted column indices */ + memcpy( A->j + si, temp_j, sizeof(int) * (ei - si) ); } - - /* copy sorted column indices */ - memcpy( A->j + si, temp_j, sizeof(int) * (ei - si) ); - } - free( temp_val ); - free( temp_j ); + free( temp_val ); + free( temp_j ); + } } @@ -91,8 +95,8 @@ static void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t ) if ( (A_t_top = (unsigned int*) calloc( A->n + 1, sizeof(unsigned int))) == NULL ) { - fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); + fprintf( stderr, "Not enough space for matrix tranpose. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); } memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) ); @@ -102,7 +106,7 @@ static void Transpose( const sparse_matrix const *A, sparse_matrix const *A_t ) { for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) { - ++A_t->start[A->j[pj] + 1]; + ++A_t->start[A->j[pj] + 1]; } } @@ -150,7 +154,7 @@ static void Transpose_I( sparse_matrix * const A ) static void Calculate_Droptol( const sparse_matrix * const A, real * const droptol, - const real dtol ) + const real dtol ) { int i, j, k; real val; @@ -175,8 +179,8 @@ static void Calculate_Droptol( const sparse_matrix * const A, real * const dropt fprintf( stderr, "Not enough space for droptol. Terminating...\n" ); exit( INSUFFICIENT_MEMORY ); } - } - } + } + } #pragma omp barrier #endif @@ -258,7 +262,7 @@ static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const d fillin = 0; #pragma omp parallel for schedule(guided) \ - default(none) private(i, j, pj, val) reduction(+: fillin) + default(none) private(i, j, pj, val) reduction(+: fillin) for ( i = 0; i < A->n; ++i ) { for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) @@ -279,7 +283,7 @@ static int Estimate_LU_Fill( const sparse_matrix * const A, const real * const d #if defined(HAVE_SUPERLU_MT) static real SuperLU_Factorize( const sparse_matrix * const A, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, pj, count, *Ltop, *Utop, r; sparse_matrix *A_t; @@ -300,7 +304,7 @@ static real SuperLU_Factorize( const sparse_matrix * const A, int_t *perm_c; /* column permutation vector */ int_t *perm_r; /* row permutations from partial pivoting */ void *work; - int_t info, lwork; + int_t info, lwork; int_t permc_spec, panel_size, relax; Gstat_t Gstat; flops_t flopcnt; @@ -309,7 +313,7 @@ static real SuperLU_Factorize( const sparse_matrix * const A, #ifdef _OPENMP //TODO: set as global parameter and use #pragma omp parallel \ - default(none) shared(nprocs) + default(none) shared(nprocs) { #pragma omp master { @@ -350,24 +354,24 @@ static real SuperLU_Factorize( const sparse_matrix * const A, } if ( !(superlumt_options.etree = intMalloc(A->n)) ) { - SUPERLU_ABORT("Malloc fails for etree[]."); + SUPERLU_ABORT("Malloc fails for etree[]."); } if ( !(superlumt_options.colcnt_h = intMalloc(A->n)) ) { - SUPERLU_ABORT("Malloc fails for colcnt_h[]."); + SUPERLU_ABORT("Malloc fails for colcnt_h[]."); } if ( !(superlumt_options.part_super_h = intMalloc(A->n)) ) { - SUPERLU_ABORT("Malloc fails for part_super__h[]."); + SUPERLU_ABORT("Malloc fails for part_super__h[]."); } if ( ( (a = (real*) malloc( (2 * A->start[A->n] - A->n) * sizeof(real))) == NULL ) - || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL ) - || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) - || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) - || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) ) + || ( (asub = (int_t*) malloc( (2 * A->start[A->n] - A->n) * sizeof(int_t))) == NULL ) + || ( (xa = (int_t*) malloc( (A->n + 1) * sizeof(int_t))) == NULL ) + || ( (Ltop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) + || ( (Utop = (unsigned int*) malloc( (A->n + 1) * sizeof(unsigned int))) == NULL ) ) { - fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" ); - exit( INSUFFICIENT_MEMORY ); + fprintf( stderr, "Not enough space for SuperLU factorization. Terminating...\n" ); + exit( INSUFFICIENT_MEMORY ); } if ( Allocate_Matrix( &A_t, A->n, A->m ) == 0 ) { @@ -379,16 +383,16 @@ static real SuperLU_Factorize( const sparse_matrix * const A, Transpose( A, A_t ); count = 0; - for( i = 0; i < A->n; ++i ) + for ( i = 0; i < A->n; ++i ) { xa[i] = count; - for( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) { a[count] = A->entries[pj].val; asub[count] = A->entries[pj].j; ++count; } - for( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) + for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) { a[count] = A_t->entries[pj].val; asub[count] = A_t->entries[pj].j; @@ -398,24 +402,24 @@ static real SuperLU_Factorize( const sparse_matrix * const A, xa[i] = count; dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, - &at, &atsub, &xat ); + &at, &atsub, &xat ); - for( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) fprintf( stderr, "%6d", asub[i] ); fprintf( stderr, "\n" ); - for( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) fprintf( stderr, "%6.1f", a[i] ); fprintf( stderr, "\n" ); - for( i = 0; i <= A->n; ++i ) + for ( i = 0; i <= A->n; ++i ) fprintf( stderr, "%6d", xa[i] ); fprintf( stderr, "\n" ); - for( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) fprintf( stderr, "%6d", atsub[i] ); fprintf( stderr, "\n" ); - for( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) + for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) fprintf( stderr, "%6.1f", at[i] ); fprintf( stderr, "\n" ); - for( i = 0; i <= A->n; ++i ) + for ( i = 0; i <= A->n; ++i ) fprintf( stderr, "%6d", xat[i] ); fprintf( stderr, "\n" ); @@ -432,18 +436,18 @@ static real SuperLU_Factorize( const sparse_matrix * const A, A_S_store->colptr = xat; /* ------------------------------------------------------------ - Allocate storage and initialize statistics variables. + Allocate storage and initialize statistics variables. ------------------------------------------------------------*/ StatAlloc( A->n, nprocs, panel_size, relax, &Gstat ); StatInit( A->n, nprocs, &Gstat ); /* ------------------------------------------------------------ Get column permutation vector perm_c[], according to permc_spec: - permc_spec = 0: natural ordering + permc_spec = 0: natural ordering permc_spec = 1: minimum degree ordering on structure of A'*A permc_spec = 2: minimum degree ordering on structure of A'+A permc_spec = 3: approximate minimum degree for unsymmetric matrices - ------------------------------------------------------------*/ + ------------------------------------------------------------*/ permc_spec = 0; get_perm_c( permc_spec, &A_S, perm_c ); @@ -453,10 +457,10 @@ static real SuperLU_Factorize( const sparse_matrix * const A, Apply perm_c to the columns of original A to form AC. ------------------------------------------------------------*/ pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, - u, usepr, drop_tol, perm_c, perm_r, - work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); + u, usepr, drop_tol, perm_c, perm_r, + work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); - for( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) + for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); fprintf( stderr, "\n" ); @@ -467,7 +471,7 @@ static real SuperLU_Factorize( const sparse_matrix * const A, pdgstrf( &superlumt_options, &AC_S, perm_r, &L_S, &U_S, &Gstat, &info ); fprintf( stderr, "INFO: %d\n", info ); - + flopcnt = 0; for (i = 0; i < nprocs; ++i) { @@ -490,15 +494,15 @@ static real SuperLU_Factorize( const sparse_matrix * const A, memset( L->start, 0, (A->n + 1) * sizeof(int) ); memset( U->start, 0, (A->n + 1) * sizeof(int) ); - for( i = 0; i < 2 * L_S_store->nnz; ++i ) + for ( i = 0; i < 2 * L_S_store->nnz; ++i ) fprintf( stderr, "%6.1f", ((real*)(L_S_store->nzval))[i] ); fprintf( stderr, "\n" ); - for( i = 0; i < 2 * U_S_store->nnz; ++i ) + for ( i = 0; i < 2 * U_S_store->nnz; ++i ) fprintf( stderr, "%6.1f", ((real*)(U_S_store->nzval))[i] ); fprintf( stderr, "\n" ); printf( "No of supernodes in factor L = " IFMT "\n", L_S_store->nsuper ); - for( i = 0; i < A->n; ++i ) + for ( i = 0; i < A->n; ++i ) { fprintf( stderr, "nzval_col_beg[%5d] = %d\n", i, L_S_store->nzval_colbeg[i] ); fprintf( stderr, "nzval_col_end[%5d] = %d\n", i, L_S_store->nzval_colend[i] ); @@ -510,20 +514,20 @@ static real SuperLU_Factorize( const sparse_matrix * const A, // } fprintf( stderr, "col_beg[%5d] = %d\n", i, U_S_store->colbeg[i] ); fprintf( stderr, "col_end[%5d] = %d\n", i, U_S_store->colend[i] ); - for( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) { ++Utop[U_S_store->rowind[pj] + 1]; fprintf( stderr, "Utop[%5d] = %d\n", U_S_store->rowind[pj] + 1, Utop[U_S_store->rowind[pj] + 1] ); } } - for( i = 1; i <= A->n; ++i ) + for ( i = 1; i <= A->n; ++i ) { // Ltop[i] = L->start[i] = Ltop[i] + Ltop[i - 1]; Utop[i] = U->start[i] = Utop[i] + Utop[i - 1]; // fprintf( stderr, "Utop[%5d] = %d\n", i, Utop[i] ); // fprintf( stderr, "U->start[%5d] = %d\n", i, U->start[i] ); } - for( i = 0; i < A->n; ++i ) + for ( i = 0; i < A->n; ++i ) { // for( pj = 0; pj < L_S_store->nzval_colend[i] - L_S_store->nzval_colbeg[i]; ++pj ) // { @@ -532,7 +536,7 @@ static real SuperLU_Factorize( const sparse_matrix * const A, // L->entries[Ltop[r]].val = ((real*)L_S_store->nzval)[L_S_store->nzval_colbeg[i] + pj]; // ++Ltop[r]; // } - for( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) + for ( pj = U_S_store->colbeg[i]; pj < U_S_store->colend[i]; ++pj ) { r = U_S_store->rowind[pj]; U->entries[Utop[r]].j = i; @@ -584,7 +588,7 @@ static real diag_pre_comp( const reax_system * const system, real * const Hdia_i start = Get_Time( ); #pragma omp parallel for schedule(guided) \ - default(none) private(i) + default(none) private(i) for ( i = 0; i < system->N; ++i ) { Hdia_inv[i] = 1.0 / system->reaxprm.sbp[system->atoms[i].type].eta; @@ -596,7 +600,7 @@ static real diag_pre_comp( const reax_system * const system, real * const Hdia_i /* Incomplete Cholesky factorization with dual thresholding */ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { int *tmp_j; real *tmp_val; @@ -605,8 +609,8 @@ static real ICHOLT( const sparse_matrix * const A, const real * const droptol, int *Utop; start = Get_Time( ); - - if( ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL || + + if ( ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL || ( tmp_j = (int*) malloc(A->n * sizeof(int)) ) == NULL || ( tmp_val = (real*) malloc(A->n * sizeof(real)) ) == NULL ) { @@ -762,7 +766,7 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, exit( INSUFFICIENT_MEMORY ); } - if( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL || ( Utop = (int*) malloc((A->n + 1) * sizeof(int)) ) == NULL ) { @@ -808,7 +812,7 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero */ #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) + default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) for ( j = 0; j < A->start[A->n]; ++j ) { sum = ZERO; @@ -861,7 +865,7 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n"); #if defined(DEBUG_FOCUS) fprintf( stderr, "A(%5d,%5d) = %10.3f\n", - k - 1, A->entries[j].j, A->entries[j].val ); + k - 1, A->entries[j].j, A->entries[j].val ); fprintf( stderr, "sum = %10.3f\n", sum); #endif exit(NUMERIC_BREAKDOWN); @@ -936,17 +940,17 @@ static real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, /* Fine-grained (parallel) incomplete LU factorization - * + * * Reference: * Edmond Chow and Aftab Patel * Fine-Grained Parallel Incomplete LU Factorization * SIAM J. Sci. Comp. - * + * * A: symmetric, half-stored (lower triangular), CSR format * sweeps: number of loops over non-zeros for computation * L / U: factorized triangular matrices (A \approx LU), CSR format */ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, j, k, pj, x, y, ei_x, ei_y; real *D, *D_inv, sum, start; @@ -960,7 +964,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, exit( INSUFFICIENT_MEMORY ); } - if( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) { fprintf( stderr, "not enough memory for ILU_PAR preconditioning matrices. terminating.\n" ); @@ -968,7 +972,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #pragma omp parallel for schedule(guided) \ - default(none) shared(D, D_inv) private(i) + default(none) shared(D, D_inv) private(i) for ( i = 0; i < A->n; ++i ) { D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); @@ -979,7 +983,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, * transformation DAD, where D = D(1./sqrt(D(A))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, D) private(i, pj) + default(none) shared(DAD, D) private(i, pj) for ( i = 0; i < A->n; ++i ) { /* non-diagonals */ @@ -1005,7 +1009,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, /* L has unit diagonal, by convention */ #pragma omp parallel for schedule(guided) \ - default(none) private(i) + default(none) private(i) for ( i = 0; i < A->n; ++i ) { L->val[L->start[i + 1] - 1] = 1.0; @@ -1015,7 +1019,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero in L */ #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) + default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) for ( j = 0; j < DAD->start[DAD->n]; ++j ) { sum = ZERO; @@ -1058,14 +1062,14 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, } } - if( j != ei_x - 1 ) + if ( j != ei_x - 1 ) { L->val[j] = ( DAD->val[j] - sum ) / U->val[ei_y - 1]; } } #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) + default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) for ( j = 0; j < DAD->start[DAD->n]; ++j ) { sum = ZERO; @@ -1116,7 +1120,7 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, * since DAD \approx LU, then * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, D_inv) private(i, pj) + default(none) shared(DAD, D_inv) private(i, pj) for ( i = 0; i < DAD->n; ++i ) { for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) @@ -1143,18 +1147,18 @@ static real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, /* Fine-grained (parallel) incomplete LU factorization with thresholding - * + * * Reference: * Edmond Chow and Aftab Patel * Fine-Grained Parallel Incomplete LU Factorization * SIAM J. Sci. Comp. - * + * * A: symmetric, half-stored (lower triangular), CSR format * droptol: row-wise tolerances used for dropping * sweeps: number of loops over non-zeros for computation * L / U: factorized triangular matrices (A \approx LU), CSR format */ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, - const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) + const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop; real *D, *D_inv, sum, start; @@ -1170,7 +1174,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, exit( INSUFFICIENT_MEMORY ); } - if( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || + if ( ( D = (real*) malloc(A->n * sizeof(real)) ) == NULL || ( D_inv = (real*) malloc(A->n * sizeof(real)) ) == NULL ) { fprintf( stderr, "not enough memory for ILUT_PAR preconditioning matrices. terminating.\n" ); @@ -1178,7 +1182,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, } #pragma omp parallel for schedule(guided) \ - default(none) shared(D, D_inv) private(i) + default(none) shared(D, D_inv) private(i) for ( i = 0; i < A->n; ++i ) { D_inv[i] = SQRT( A->val[A->start[i + 1] - 1] ); @@ -1189,7 +1193,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, * transformation DAD, where D = D(1./sqrt(D(A))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, D) private(i, pj) + default(none) shared(DAD, D) private(i, pj) for ( i = 0; i < A->n; ++i ) { /* non-diagonals */ @@ -1215,7 +1219,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, /* L has unit diagonal, by convention */ #pragma omp parallel for schedule(guided) \ - default(none) private(i) shared(L_temp) + default(none) private(i) shared(L_temp) for ( i = 0; i < A->n; ++i ) { L_temp->val[L_temp->start[i + 1] - 1] = 1.0; @@ -1225,7 +1229,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, { /* for each nonzero in L */ #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) + default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) for ( j = 0; j < DAD->start[DAD->n]; ++j ) { sum = ZERO; @@ -1268,14 +1272,14 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, } } - if( j != ei_x - 1 ) + if ( j != ei_x - 1 ) { L_temp->val[j] = ( DAD->val[j] - sum ) / U_temp->val[ei_y - 1]; } } #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) + default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) for ( j = 0; j < DAD->start[DAD->n]; ++j ) { sum = ZERO; @@ -1326,7 +1330,7 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, * since DAD \approx LU, then * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ #pragma omp parallel for schedule(guided) \ - default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) + default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) for ( i = 0; i < DAD->n; ++i ) { for ( pj = DAD->start[i]; pj < DAD->start[i + 1]; ++pj ) @@ -1397,11 +1401,11 @@ static real ILUT_PAR( const sparse_matrix * const A, const real * droptol, static void Init_MatVec( const reax_system * const system, const control_params * const control, - simulation_data * const data, static_storage * const workspace, - const list * const far_nbrs ) + simulation_data * const data, static_storage * const workspace, + const list * const far_nbrs ) { int i, fillin; - real s_tmp, t_tmp; + real s_tmp, t_tmp, time; sparse_matrix *H_test, *L_test, *U_test; // char fname[100]; @@ -1409,63 +1413,69 @@ static void Init_MatVec( const reax_system * const system, const control_params ((data->step - data->prev_steps) % control->pre_comp_refactor == 0 || workspace->L == NULL)) { //Print_Linear_System( system, control, workspace, data->step ); - Sort_Matrix_Rows( workspace->H ); - Sort_Matrix_Rows( workspace->H_sp ); + + time = Get_Time( ); + if ( control->pre_comp_type != DIAG_PC ) + { + Sort_Matrix_Rows( workspace->H ); + Sort_Matrix_Rows( workspace->H_sp ); + } + data->timing.QEq_sort_mat_rows += Get_Timing_Info( time ); #if defined(DEBUG) fprintf( stderr, "H matrix sorted\n" ); #endif - switch ( control->pre_comp_type ) - { - case DIAG_PC: - if ( workspace->Hdia_inv == NULL ) + switch ( control->pre_comp_type ) + { + case DIAG_PC: + if ( workspace->Hdia_inv == NULL ) + { + if ( ( workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ) ) == NULL ) { - if ( ( workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ) ) == NULL ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } - data->timing.pre_comp += diag_pre_comp( system, workspace->Hdia_inv ); - break; - case ICHOLT_PC: - Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol ); + } + data->timing.pre_comp += diag_pre_comp( system, workspace->Hdia_inv ); + break; + case ICHOLT_PC: + Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); + fprintf( stderr, "drop tolerances calculated\n" ); #endif - if ( workspace->L == NULL ) + if ( workspace->L == NULL ) + { + fillin = Estimate_LU_Fill( workspace->H, workspace->droptol ); + if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || + Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ) { - fillin = Estimate_LU_Fill( workspace->H, workspace->droptol ); - if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || - Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } #if defined(DEBUG) - fprintf( stderr, "fillin = %d\n", fillin ); - fprintf( stderr, "allocated memory: L = U = %ldMB\n", - fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) ); + fprintf( stderr, "fillin = %d\n", fillin ); + fprintf( stderr, "allocated memory: L = U = %ldMB\n", + fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) ); #endif - } + } - data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U ); + data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U ); // data->timing.pre_comp += ICHOLT( workspace->H_sp, workspace->droptol, workspace->L, workspace->U ); - break; - case ILU_PAR_PC: - if ( workspace->L == NULL ) + break; + case ILU_PAR_PC: + if ( workspace->L == NULL ) + { + /* factors have sparsity pattern as H */ + if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || + Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) { - /* factors have sparsity pattern as H */ - if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || - Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } + } // data->timing.pre_comp += ICHOL_PAR( workspace->H, control->pre_comp_sweeps, workspace->L, workspace->U ); - data->timing.pre_comp += ILU_PAR( workspace->H, control->pre_comp_sweeps, workspace->L, workspace->U ); + data->timing.pre_comp += ILU_PAR( workspace->H, control->pre_comp_sweeps, workspace->L, workspace->U ); // Print_Sparse_Matrix2( workspace->H, "H.out" ); // Print_Sparse_Matrix2( workspace->L, "L.out" ); @@ -1496,7 +1506,7 @@ static void Init_MatVec( const reax_system * const system, const control_params // H_test->entries[4].val = -43.; // H_test->entries[5].j = 2; // H_test->entries[5].val = 98.; -// +// //// data->timing.pre_comp += ICHOLT( H_test, workspace->droptol, L_test, U_test ); //// Print_Sparse_Matrix2( L_test, "L_ICHOLT.out" ); //// Print_Sparse_Matrix2( U_test, "U_ICHOLT.out" ); @@ -1505,50 +1515,50 @@ static void Init_MatVec( const reax_system * const system, const control_params // Print_Sparse_Matrix2( L_test, "L_SLU.out" ); // Print_Sparse_Matrix2( U_test, "U_SLU.out" ); // exit( 0 ); - break; - case ILUT_PAR_PC: - Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol ); + break; + case ILUT_PAR_PC: + Calculate_Droptol( workspace->H, workspace->droptol, control->pre_comp_droptol ); #if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); + fprintf( stderr, "drop tolerances calculated\n" ); #endif - if ( workspace->L == NULL ) + if ( workspace->L == NULL ) + { + /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */ + if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || + Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) { - /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */ - if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || - Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } + } - data->timing.pre_comp += ILUT_PAR( workspace->H, workspace->droptol, control->pre_comp_sweeps, - workspace->L, workspace->U ); - break; - case ILU_SUPERLU_MT_PC: - if ( workspace->L == NULL ) + data->timing.pre_comp += ILUT_PAR( workspace->H, workspace->droptol, control->pre_comp_sweeps, + workspace->L, workspace->U ); + break; + case ILU_SUPERLU_MT_PC: + if ( workspace->L == NULL ) + { + /* factors have sparsity pattern as H */ + if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || + Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) { - /* factors have sparsity pattern as H */ - if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || - Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) - { - fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } + fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); } + } #if defined(HAVE_SUPERLU_MT) - data->timing.pre_comp += SuperLU_Factorize( workspace->H, workspace->L, workspace->U ); + data->timing.pre_comp += SuperLU_Factorize( workspace->H, workspace->L, workspace->U ); #else - fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" ); - exit( INVALID_INPUT ); + fprintf( stderr, "SuperLU MT support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); #endif - break; - default: - fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - } + break; + default: + fprintf( stderr, "Unrecognized preconditioner computation method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } #if defined(DEBUG) fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); @@ -1631,7 +1641,7 @@ void QEq( reax_system * const system, control_params * const control, simulation static_storage * const workspace, const list * const far_nbrs, const output_controls * const out_control ) { - int matvecs; + int iters; Init_MatVec( system, control, data, workspace, far_nbrs ); @@ -1640,51 +1650,49 @@ void QEq( reax_system * const system, control_params * const control, simulation switch ( control->qeq_solver_type ) { - case GMRES_S: - matvecs = GMRES( workspace, control, workspace->H, workspace->b_s, control->qeq_solver_q_err, - workspace->s[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv), - (data->step - data->prev_steps) % control->pre_comp_refactor == 0 ); - matvecs += GMRES( workspace, control, workspace->H, workspace->b_t, control->qeq_solver_q_err, - workspace->t[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv), 0 ); - break; - case GMRES_H_S: - matvecs = GMRES_HouseHolder( workspace, control, workspace->H, workspace->b_s, control->qeq_solver_q_err, - workspace->s[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv), - (data->step - data->prev_steps) % control->pre_comp_refactor == 0 ); - matvecs += GMRES_HouseHolder( workspace, control, workspace->H, workspace->b_t, control->qeq_solver_q_err, - workspace->t[0], out_control->log, &(data->timing.pre_app), &(data->timing.spmv), 0 ); - break; - case CG_S: - matvecs = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, + case GMRES_S: + iters = GMRES( workspace, control, data, workspace->H, workspace->b_s, control->qeq_solver_q_err, + workspace->s[0], out_control->log, (data->step - data->prev_steps) % control->pre_comp_refactor == 0 ); + iters += GMRES( workspace, control, data, workspace->H, workspace->b_t, control->qeq_solver_q_err, + workspace->t[0], out_control->log, 0 ); + break; + case GMRES_H_S: + iters = GMRES_HouseHolder( workspace, control, data, workspace->H, workspace->b_s, control->qeq_solver_q_err, + workspace->s[0], out_control->log, (data->step - data->prev_steps) % control->pre_comp_refactor == 0 ); + iters += GMRES_HouseHolder( workspace, control, data, workspace->H, workspace->b_t, control->qeq_solver_q_err, + workspace->t[0], out_control->log, 0 ); + break; + case CG_S: + iters = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, workspace->s[0], out_control->log ) + 1; - matvecs += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, - workspace->t[0], out_control->log ) + 1; -// matvecs = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, + iters += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, + workspace->t[0], out_control->log ) + 1; +// iters = CG( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, // workspace->L, workspace->U, workspace->s[0], control->pre_app_type, -// control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1; -// matvecs += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, +// control->pre_app_jacobi_iters, out_control->log ) + 1; +// iters += CG( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, // workspace->L, workspace->U, workspace->t[0], control->pre_app_type, -// control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1; - break; - case SDM_S: - matvecs = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, - workspace->s[0], out_control->log ) + 1; - matvecs += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, - workspace->t[0], out_control->log ) + 1; -// matvecs = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, +// control->pre_app_jacobi_iters, out_control->log ) + 1; + break; + case SDM_S: + iters = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, + workspace->s[0], out_control->log ) + 1; + iters += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, + workspace->t[0], out_control->log ) + 1; +// iters = SDM( workspace, workspace->H, workspace->b_s, control->qeq_solver_q_err, // workspace->L, workspace->U, workspace->s[0], control->pre_app_type, -// control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1; -// matvecs += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, +// control->pre_app_jacobi_iters, out_control->log ) + 1; +// iters += SDM( workspace, workspace->H, workspace->b_t, control->qeq_solver_q_err, // workspace->L, workspace->U, workspace->t[0], control->pre_app_type, -// control->pre_app_jacobi_iters, out_control->log, &(data->timing.pre_app), &(data->timing.spmv) ) + 1; - break; - default: - fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); - exit( INVALID_INPUT ); - break; +// control->pre_app_jacobi_iters, out_control->log ) + 1; + break; + default: + fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; } - data->timing.matvecs += matvecs; + data->timing.solver_iters += iters; #if defined(DEBUG_FOCUS) fprintf( stderr, "linsolve-" ); #endif diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c index 6b768a74a40ddd23f36de24d27a6182bc5609c9b..9d2aea4cf107a48d00113ebe1383f615f637f75a 100644 --- a/sPuReMD/src/init_md.c +++ b/sPuReMD/src/init_md.c @@ -209,11 +209,15 @@ void Init_Simulation_Data( reax_system *system, control_params *control, data->timing.init_forces = 0; data->timing.bonded = 0; data->timing.nonb = 0; - data->timing.QEq = 0; - data->timing.matvecs = 0; + data->timing.QEq = ZERO; + data->timing.QEq_sort_mat_rows = ZERO; data->timing.pre_comp = ZERO; data->timing.pre_app = ZERO; - data->timing.spmv = ZERO; + data->timing.solver_iters = 0; + data->timing.solver_spmv = ZERO; + data->timing.solver_vector_ops = ZERO; + data->timing.solver_orthog = ZERO; + data->timing.solver_tri_solve = ZERO; } @@ -534,9 +538,10 @@ void Init_Out_Controls(reax_system *system, control_params *control, strcpy( temp, control->sim_name ); strcat( temp, ".log" ); out_control->log = fopen( temp, "w" ); - fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n", + fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n", "step", "total", "neighbors", "init", "bonded", - "nonbonded", "QEq", "matvec", "pre comp", "pre app", "spmv" ); + "nonbonded", "QEq", "QEq Sort", "S iters", "Pre Comp", "Pre App", + "S spmv", "S vec ops", "S orthog", "S tsolve" ); } /* Init pressure file */ diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index ff89296ff426a823c8aeceb78086bea0dda4d0bb..b225c508a54f14d86c59a64122f52c2f77babbda 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -578,10 +578,14 @@ typedef struct real bonded; real nonb; real QEq; - int matvecs; + real QEq_sort_mat_rows; real pre_comp; real pre_app; - real spmv; + int solver_iters; + real solver_spmv; + real solver_vector_ops; + real solver_orthog; + real solver_tri_solve; } reax_timing; diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c index 7ba057f8937f957b01998b11a1e0cb7072dab964..647b3701a391e75f8623a08fcfe1e6359e86f3ce 100644 --- a/sPuReMD/src/print_utils.c +++ b/sPuReMD/src/print_utils.c @@ -103,7 +103,6 @@ void Print_Bond_Orders( reax_system *system, control_params *control, } - void Print_Bond_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, output_controls *out_control ) @@ -120,7 +119,6 @@ void Print_Bond_Forces( reax_system *system, control_params *control, } - void Print_LonePair_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, output_controls *out_control ) @@ -139,7 +137,6 @@ void Print_LonePair_Forces( reax_system *system, control_params *control, } - void Print_OverUnderCoor_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, @@ -176,7 +173,6 @@ void Print_OverUnderCoor_Forces( reax_system *system, control_params *control, } - void Print_Three_Body_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, output_controls *out_control ) @@ -232,7 +228,6 @@ void Print_Three_Body_Forces( reax_system *system, control_params *control, } - void Print_Hydrogen_Bond_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, @@ -252,7 +247,6 @@ void Print_Hydrogen_Bond_Forces( reax_system *system, control_params *control, } - void Print_Four_Body_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, output_controls *out_control ) @@ -287,7 +281,6 @@ void Print_Four_Body_Forces( reax_system *system, control_params *control, } - void Print_vdW_Coulomb_Forces( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists, output_controls *out_control ) @@ -410,6 +403,7 @@ void Print_Near_Neighbors( reax_system *system, control_params *control, fclose( fout ); } + /* near nbrs contain both i-j, j-i nbrhood info */ void Print_Near_Neighbors2( reax_system *system, control_params *control, static_storage *workspace, list **lists ) @@ -444,6 +438,7 @@ void Print_Near_Neighbors2( reax_system *system, control_params *control, fclose( fout ); } + /* far nbrs contain only i-j nbrhood info, no j-i. */ void Print_Far_Neighbors( reax_system *system, control_params *control, static_storage *workspace, list **lists ) @@ -483,11 +478,13 @@ void Print_Far_Neighbors( reax_system *system, control_params *control, fclose( fout ); } + int fn_qsort_intcmp( const void *a, const void *b ) { return ( *(int *)a - * (int *)b); } + void Print_Far_Neighbors2( reax_system *system, control_params *control, static_storage *workspace, list **lists ) { @@ -619,28 +616,36 @@ void Output_Results( reax_system *system, control_params *control, f_update = 1; else f_update = out_control->energy_update_freq; - fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.6f%10.6f%10.6f\n", + fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.6f%10.6f%10.2f%10.6f%10.6f%10.6f%10.6f%10.6f%10.6f\n", data->step, t_elapsed / f_update, data->timing.nbrs / f_update, data->timing.init_forces / f_update, data->timing.bonded / f_update, data->timing.nonb / f_update, data->timing.QEq / f_update, - (double)data->timing.matvecs / f_update, + data->timing.QEq_sort_mat_rows / f_update, + (double)data->timing.solver_iters / f_update, data->timing.pre_comp / f_update, data->timing.pre_app / f_update, - data->timing.spmv / f_update ); + data->timing.solver_spmv / f_update, + data->timing.solver_vector_ops / f_update, + data->timing.solver_orthog / f_update, + data->timing.solver_tri_solve / f_update ); data->timing.total = Get_Time( ); data->timing.nbrs = 0; data->timing.init_forces = 0; data->timing.bonded = 0; data->timing.nonb = 0; - data->timing.QEq = 0; - data->timing.matvecs = 0; + data->timing.QEq = ZERO; + data->timing.QEq_sort_mat_rows = ZERO; data->timing.pre_comp = ZERO; data->timing.pre_app = ZERO; - data->timing.spmv = ZERO; + data->timing.solver_iters = 0; + data->timing.solver_spmv = ZERO; + data->timing.solver_vector_ops = ZERO; + data->timing.solver_orthog = ZERO; + data->timing.solver_tri_solve = ZERO; fflush( out_control->out ); fflush( out_control->pot ); @@ -775,7 +780,6 @@ void Print_Linear_System( reax_system *system, control_params *control, } - void Print_Charges( reax_system *system, control_params *control, static_storage *workspace, int step ) { @@ -795,7 +799,6 @@ void Print_Charges( reax_system *system, control_params *control, } - void Print_Soln( static_storage *workspace, real *x, real *b_prm, real *b, int N ) {