diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c index 7a3900c0c8d34361776062e5fb62624ff8796c08..61c5c8a702cacfc14123761756e81ab8717227aa 100644 --- a/sPuReMD/src/charges.c +++ b/sPuReMD/src/charges.c @@ -205,7 +205,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, case SAI_PC: #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) data->timing.cm_solver_pre_comp += - Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full, &workspace->H_app_inv ); #else fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); @@ -564,7 +564,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system, case SAI_PC: #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) data->timing.cm_solver_pre_comp += - Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full, &workspace->H_app_inv ); #else fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); @@ -682,7 +682,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, case SAI_PC: #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) data->timing.cm_solver_pre_comp += - Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full, &workspace->H_app_inv ); #else fprintf( stderr, "LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); @@ -854,8 +854,9 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, break; case SAI_PC: - Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, - &workspace->H_spar_patt ); + setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt, + &workspace->H_spar_patt_full, &workspace->H_app_inv, + control->cm_solver_pre_comp_sai_thres ); break; default: @@ -1003,8 +1004,9 @@ static void Setup_Preconditioner_EE( const reax_system * const system, break; case SAI_PC: - Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, - &workspace->H_spar_patt ); + setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt, + &workspace->H_spar_patt_full, &workspace->H_app_inv, + control->cm_solver_pre_comp_sai_thres ); break; default: @@ -1154,8 +1156,9 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, break; case SAI_PC: - Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, - &workspace->H_spar_patt ); + setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt, + &workspace->H_spar_patt_full, &workspace->H_app_inv, + control->cm_solver_pre_comp_sai_thres ); break; default: diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c index 3e814309302ed35c32290995854a8c19f9d460dd..93fed761a7f4233da33ee03fefce7e3320947c14 100644 --- a/sPuReMD/src/init_md.c +++ b/sPuReMD/src/init_md.c @@ -1081,7 +1081,9 @@ void Finalize_Workspace( reax_system *system, control_params *control, } if ( control->cm_solver_pre_comp_type == SAI_PC ) { + Deallocate_Matrix( workspace->H_full ); Deallocate_Matrix( workspace->H_spar_patt ); + Deallocate_Matrix( workspace->H_spar_patt_full ); Deallocate_Matrix( workspace->H_app_inv ); } diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index 32ec674117d8cfb2b4d6c868f20d198683f5cc42..110c43a3f747ac15e20bb0dc7958a9354963dbfd 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -149,7 +149,7 @@ void Sort_Matrix_Rows( sparse_matrix * const A ) #endif { temp = (sparse_matrix_entry *) smalloc( A->n * sizeof(sparse_matrix_entry), - "Sort_Matrix_Rows::temp" ); + "Sort_Matrix_Rows::temp" ); /* sort each row of A using column indices */ #ifdef _OPENMP @@ -256,8 +256,9 @@ static void compute_full_sparse_matrix( const sparse_matrix * const A, * Assumptions: * A has non-zero diagonals * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ -void Setup_Sparsity_Pattern( const sparse_matrix * const A, - const real filter, sparse_matrix ** A_spar_patt ) +void setup_sparse_approx_inverse( const sparse_matrix * const A, sparse_matrix ** A_full, + sparse_matrix ** A_spar_patt, sparse_matrix **A_spar_patt_full, + sparse_matrix ** A_app_inv, const real filter ) { int i, pj, size; real min, max, threshold, val; @@ -283,7 +284,7 @@ void Setup_Sparsity_Pattern( const sparse_matrix * const A, } } - // find min and max element of the matrix + /* find min and max elements of matrix */ for ( i = 0; i < A->n; ++i ) { for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) @@ -309,25 +310,8 @@ void Setup_Sparsity_Pattern( const sparse_matrix * const A, } threshold = min + (max - min) * (1.0 - filter); - // calculate the nnz of the sparsity pattern - // for ( size = 0, i = 0; i < A->n; ++i ) - // { - // for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) - // { - // if ( threshold <= A->val[pj] ) - // size++; - // } - // } - // - // if ( Allocate_Matrix( A_spar_patt, A->n, size ) == NULL ) - // { - // fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); - // exit( INSUFFICIENT_MEMORY ); - // } - - //(*A_spar_patt)->start[(*A_spar_patt)->n] = size; - // fill the sparsity pattern + /* fill sparsity pattern */ for ( size = 0, i = 0; i < A->n; ++i ) { (*A_spar_patt)->start[i] = size; @@ -343,6 +327,17 @@ void Setup_Sparsity_Pattern( const sparse_matrix * const A, } } (*A_spar_patt)->start[A->n] = size; + + compute_full_sparse_matrix( A, A_full ); + compute_full_sparse_matrix( *A_spar_patt, A_spar_patt_full ); + + /* A_app_inv has the same sparsity pattern + * as A_spar_patt_full (omit non-zero values) */ + if ( Allocate_Matrix( A_app_inv, (*A_spar_patt_full)->n, (*A_spar_patt_full)->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } } void Calculate_Droptol( const sparse_matrix * const A, @@ -369,7 +364,7 @@ void Calculate_Droptol( const sparse_matrix * const A, if ( droptol_local == NULL ) { droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real), - "Calculate_Droptol::droptol_local" ); + "Calculate_Droptol::droptol_local" ); } } @@ -564,15 +559,15 @@ real SuperLU_Factorize( const sparse_matrix * const A, SUPERLU_ABORT("Malloc fails for part_super__h[]."); } a = (real*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(real), - "SuperLU_Factorize::a" ); + "SuperLU_Factorize::a" ); asub = (int_t*) smalloc( (2 * A->start[A->n] - A->n) * sizeof(int_t), - "SuperLU_Factorize::asub" ); + "SuperLU_Factorize::asub" ); xa = (int_t*) smalloc( (A->n + 1) * sizeof(int_t), - "SuperLU_Factorize::xa" ); + "SuperLU_Factorize::xa" ); Ltop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int), - "SuperLU_Factorize::Ltop" ); + "SuperLU_Factorize::Ltop" ); Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int), - "SuperLU_Factorize::Utop" ); + "SuperLU_Factorize::Utop" ); if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) { fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); @@ -820,11 +815,11 @@ real ICHOLT( const sparse_matrix * const A, const real * const droptol, start = Get_Time( ); Utop = (unsigned int*) smalloc( (A->n + 1) * sizeof(unsigned int), - "ICHOLT::Utop" ); + "ICHOLT::Utop" ); tmp_j = (int*) smalloc( A->n * sizeof(int), - "ICHOLT::Utop" ); + "ICHOLT::Utop" ); tmp_val = (real*) smalloc( A->n * sizeof(real), - "ICHOLT::Utop" ); + "ICHOLT::Utop" ); // clear variables Ltop = 0; @@ -1606,7 +1601,13 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) -real Sparse_Approx_Inverse( const sparse_matrix * const A, +/* Compute M^{1} \approx A which minimizes + * + * A: symmetric, sparse matrix, stored in full CSR format + * A_spar_patt: sparse matrix used as template sparsity pattern + * for approximating the inverse, stored in full CSR format + * A_app_inv: approximate inverse to A, stored in full CSR format (result) */ +real sparse_approx_inverse( const sparse_matrix * const A, const sparse_matrix * const A_spar_patt, sparse_matrix ** A_app_inv ) { @@ -1616,28 +1617,17 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A, int *pos_x, *pos_y; real start; real *e_j, *dense_matrix; - sparse_matrix *A_full = NULL, *A_spar_patt_full = NULL; char *X, *Y; start = Get_Time( ); - // Get A and A_spar_patt full matrices - compute_full_sparse_matrix( A, &A_full ); - compute_full_sparse_matrix( A_spar_patt, &A_spar_patt_full ); - - // A_app_inv will be the same as A_spar_patt_full except the val array - if ( Allocate_Matrix( A_app_inv, A_spar_patt_full->n, A_spar_patt_full->m ) == FAILURE ) - { - fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt_full->start[A_spar_patt_full->n]; + (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n]; #ifdef _OPENMP #pragma omp parallel default(none) \ - shared(A_full, A_spar_patt_full) private(i, k, pj, j_temp, identity_pos, \ - N, M, d_i, d_j, m, n, nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) + private(i, k, pj, j_temp, identity_pos, N, M, d_i, d_j, m, n, \ + nrhs, lda, ldb, info, X, Y, pos_x, pos_y, e_j, dense_matrix) \ + shared(A_app_inv, stderr) #endif { X = (char *) smalloc( sizeof(char) * A->n, "Sparse_Approx_Inverse::X" ); @@ -1656,16 +1646,16 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A, #ifdef _OPENMP #pragma omp for schedule(static) #endif - for ( i = 0; i < A_spar_patt_full->n; ++i ) + for ( i = 0; i < A_spar_patt->n; ++i ) { N = 0; M = 0; // find column indices of nonzeros (which will be the columns indices of the dense matrix) - for ( pj = A_spar_patt_full->start[i]; pj < A_spar_patt_full->start[i + 1]; ++pj ) + for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->start[i + 1]; ++pj ) { - j_temp = A_spar_patt_full->j[pj]; + j_temp = A_spar_patt->j[pj]; Y[j_temp] = 1; pos_y[j_temp] = N; @@ -1673,16 +1663,16 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A, // for each of those indices // search through the row of full A of that index - for ( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k ) + for ( k = A->start[j_temp]; k < A->start[j_temp + 1]; ++k ) { // and accumulate the nonzero column indices to serve as the row indices of the dense matrix - X[A_full->j[k]] = 1; + X[A->j[k]] = 1; } } // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix identity_pos = M; - for ( k = 0; k < A_full->n; k++) + for ( k = 0; k < A->n; k++) { if ( X[k] != 0 ) { @@ -1708,12 +1698,12 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A, dense_matrix[d_i * N + d_j] = 0.0; } // change the value if any of the column indices is seen - for ( d_j = A_full->start[pos_x[d_i]]; - d_j < A_full->start[pos_x[d_i + 1]]; ++d_j ) + for ( d_j = A->start[pos_x[d_i]]; + d_j < A->start[pos_x[d_i + 1]]; ++d_j ) { - if ( Y[A_full->j[d_j]] == 1 ) + if ( Y[A->j[d_j]] == 1 ) { - dense_matrix[d_i * N + pos_y[A_full->j[d_j]]] = A_full->val[d_j]; + dense_matrix[d_i * N + pos_y[A->j[d_j]]] = A->val[d_j]; } } @@ -1753,11 +1743,11 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A, // print_matrix( "Least squares solution", n, nrhs, b, ldb ); // accumulate the resulting vector to build A_app_inv - (*A_app_inv)->start[i] = A_spar_patt_full->start[i]; - for ( k = A_spar_patt_full->start[i]; k < A_spar_patt_full->start[i + 1]; ++k) + (*A_app_inv)->start[i] = A_spar_patt->start[i]; + for ( k = A_spar_patt->start[i]; k < A_spar_patt->start[i + 1]; ++k) { - (*A_app_inv)->j[k] = A_spar_patt_full->j[k]; - (*A_app_inv)->val[k] = e_j[k - A_spar_patt_full->start[i]]; + (*A_app_inv)->j[k] = A_spar_patt->j[k]; + (*A_app_inv)->val[k] = e_j[k - A_spar_patt->start[i]]; } //empty variables that will be used next iteration @@ -1778,15 +1768,12 @@ real Sparse_Approx_Inverse( const sparse_matrix * const A, sfree( X, "Sparse_Approx_Inverse::X" ); } - Deallocate_Matrix( A_full ); - Deallocate_Matrix( A_spar_patt_full ); - return Get_Timing_Info( start ); } #endif -/* sparse matrix-vector product Ax=b +/* sparse matrix-vector product Ax = b * where: * A: lower triangular matrix, stored in CSR format * x: vector @@ -1813,7 +1800,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, if ( b_local == NULL ) { b_local = (real*) smalloc( omp_get_num_threads() * n * sizeof(real), - "Sparse_MatVec::b_local" ); + "Sparse_MatVec::b_local" ); } } @@ -1873,7 +1860,7 @@ static void Sparse_MatVec_full( const sparse_matrix * const A, Vector_MakeZero( b, A->n ); #ifdef _OPENMP - #pragma omp for schedule(static) default(none) private(i, j) + #pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -1895,7 +1882,7 @@ void Transpose( const sparse_matrix * const A, sparse_matrix * const A_t ) unsigned int i, j, pj, *A_t_top; A_t_top = (unsigned int*) scalloc( A->n + 1, sizeof(unsigned int), - "Transpose::A_t_top" ); + "Transpose::A_t_top" ); memset( A_t->start, 0, (A->n + 1) * sizeof(unsigned int) ); @@ -2073,17 +2060,17 @@ void tri_solve_level_sched( const sparse_matrix * const LU, if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL ) { row_levels = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int), - "tri_solve_level_sched::row_levels" ); + "tri_solve_level_sched::row_levels" ); level_rows = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int), - "tri_solve_level_sched::level_rows" ); + "tri_solve_level_sched::level_rows" ); level_rows_cnt = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int), - "tri_solve_level_sched::level_rows_cnt" ); + "tri_solve_level_sched::level_rows_cnt" ); } if ( top == NULL ) { top = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int), - "tri_solve_level_sched::top" ); + "tri_solve_level_sched::top" ); } /* find levels (row dependencies in substitutions) */ @@ -2320,9 +2307,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } fb_color = (int*) smalloc( sizeof(int) * MAX_COLOR, - "graph_coloring::fb_color" ); + "graph_coloring::fb_color" ); conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n, - "graph_coloring::fb_color" ); + "graph_coloring::fb_color" ); #ifdef _OPENMP #pragma omp barrier @@ -2693,23 +2680,23 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */ color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n, - "setup_graph_coloring::color" ); + "setup_graph_coloring::color" ); to_color = (unsigned int*) smalloc( sizeof(unsigned int) * H->n, - "setup_graph_coloring::to_color" ); + "setup_graph_coloring::to_color" ); conflict = (unsigned int*) smalloc( sizeof(unsigned int) * H->n, - "setup_graph_coloring::conflict" ); + "setup_graph_coloring::conflict" ); conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (num_thread + 1), - "setup_graph_coloring::conflict_cnt" ); + "setup_graph_coloring::conflict_cnt" ); recolor = (unsigned int*) smalloc( sizeof(unsigned int) * H->n, - "setup_graph_coloring::recolor" ); + "setup_graph_coloring::recolor" ); color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (H->n + 1), - "setup_graph_coloring::color_top" ); + "setup_graph_coloring::color_top" ); permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * H->n, - "setup_graph_coloring::premuted_row_col" ); + "setup_graph_coloring::premuted_row_col" ); permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * H->n, - "setup_graph_coloring::premuted_row_col_inv" ); + "setup_graph_coloring::premuted_row_col_inv" ); y_p = (real*) smalloc( sizeof(real) * H->n, - "setup_graph_coloring::y_p" ); + "setup_graph_coloring::y_p" ); if ( (Allocate_Matrix( &H_p, H->n, H->m ) == FAILURE ) || (Allocate_Matrix( &H_full, H->n, 2 * H->m - H->n ) == FAILURE ) ) { @@ -2942,7 +2929,7 @@ static void apply_preconditioner( const static_storage * const workspace, const if ( Dinv_L == NULL ) { Dinv_L = (real*) smalloc( sizeof(real) * workspace->L->n, - "apply_preconditioner::Dinv_L" ); + "apply_preconditioner::Dinv_L" ); } } @@ -2968,7 +2955,7 @@ static void apply_preconditioner( const static_storage * const workspace, const if ( Dinv_U == NULL ) { Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n, - "apply_preconditioner::Dinv_U" ); + "apply_preconditioner::Dinv_U" ); } } @@ -3019,8 +3006,8 @@ int GMRES( const static_storage * const workspace, const control_params * const #ifdef _OPENMP #pragma omp parallel default(none) \ private(i, j, k, itr, bnorm, ret_temp, t_start) \ - reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops) \ - shared(N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) + shared(N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) \ + reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops) #endif { j = 0; @@ -3278,7 +3265,7 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( g_itr >= control->cm_solver_max_iters ) { - fprintf( stderr, "GMRES convergence failed\n" ); + fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr ); return g_itr * (control->cm_solver_restart + 1) + g_j + 1; } @@ -3291,191 +3278,226 @@ int GMRES_HouseHolder( const static_storage * const workspace, const sparse_matrix * const H, const real * const b, real tol, real * const x, const int fresh_pre ) { - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; + int i, j, k, itr, N, g_j, g_itr; + real cc, tmp1, tmp2, temp, bnorm, ret_temp; real v[10000], z[control->cm_solver_restart + 2][10000], w[control->cm_solver_restart + 2]; real u[control->cm_solver_restart + 2][10000]; + real t_start, t_ortho, t_pa, t_spmv, t_ts, t_vops; j = 0; N = H->n; - bnorm = Norm( b, N ); - /* apply the diagonal pre-conditioner to rhs */ - for ( i = 0; i < N; ++i ) +#ifdef _OPENMP + #pragma omp parallel default(none) \ + private(i, j, k, itr, bnorm, ret_temp, t_start) \ + shared(v, z, w, u, tol, N, cc, tmp1, tmp2, temp, g_itr, g_j, stderr) \ + reduction(+: t_ortho, t_pa, t_spmv, t_ts, t_vops) +#endif { - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; - } + t_start = Get_Time( ); + bnorm = Norm( b, N ); + t_vops += Get_Timing_Info( t_start ); - // memset( x, 0, sizeof(real) * N ); + // memset( x, 0, sizeof(real) * N ); - /* GMRES outer-loop */ - for ( itr = 0; itr < control->cm_solver_max_iters; ++itr ) - { - /* compute z = r0 */ - Sparse_MatVec( H, x, workspace->b_prm ); - for ( i = 0; i < N; ++i ) + /* GMRES outer-loop */ + for ( itr = 0; itr < control->cm_solver_max_iters; ++itr ) { - workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ - } - Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + /* compute z = r0 */ + t_start = Get_Time( ); + Sparse_MatVec( H, x, workspace->b_prm ); + t_spmv += Get_Timing_Info( t_start ); - Vector_MakeZero( w, control->cm_solver_restart + 1 ); - w[0] = Norm( z[0], N ); + t_start = Get_Time( ); + Vector_Sum( workspace->b_prc, 1., workspace->b, -1., workspace->b_prm, N ); + t_vops += Get_Timing_Info( t_start ); - Vector_Copy( u[0], z[0], N ); - u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; - Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); + t_start = Get_Time( ); + apply_preconditioner( workspace, control, workspace->b_prc, z[0], fresh_pre ); + t_pa += Get_Timing_Info( t_start ); - w[0] *= ( u[0][0] < 0.0 ? 1 : -1 ); - // fprintf( stderr, "\n\n%12.6f\n", w[0] ); + t_start = Get_Time( ); + Vector_MakeZero( w, control->cm_solver_restart + 1 ); + w[0] = Norm( z[0], N ); - /* GMRES inner-loop */ - for ( j = 0; j < control->cm_solver_restart && FABS( w[j] ) / bnorm > tol; j++ ) - { - /* compute v_j */ - Vector_Scale( z[j], -2 * u[j][j], u[j], N ); - z[j][j] += 1.; /* due to e_j */ + Vector_Copy( u[0], z[0], N ); + u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; + Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); - for ( i = j - 1; i >= 0; --i ) + w[0] *= ( u[0][0] < 0.0 ? 1 : -1 ); + t_vops += Get_Timing_Info( t_start ); + + /* GMRES inner-loop */ + for ( j = 0; j < control->cm_solver_restart && FABS( w[j] ) / bnorm > tol; j++ ) { - Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i ); - } + /* compute v_j */ + t_start = Get_Time( ); + Vector_Scale( z[j], -2 * u[j][j], u[j], N ); + z[j][j] += 1.; /* due to e_j */ - /* matvec */ - Sparse_MatVec( H, z[j], v ); + for ( i = j - 1; i >= 0; --i ) + { + Vector_Add( z[j] + i, -2 * Dot( u[i] + i, z[j] + i, N - i ), u[i] + i, N - i ); + } + t_vops += Get_Timing_Info( t_start ); - for ( k = 0; k < N; ++k ) - { - v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */ - } + /* matvec */ + t_start = Get_Time( ); + Sparse_MatVec( H, z[j], workspace->b_prc ); + t_spmv += Get_Timing_Info( t_start ); - for ( i = 0; i <= j; ++i ) - { - Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i ); - } + t_start = Get_Time( ); + apply_preconditioner( workspace, control, workspace->b_prc, v, fresh_pre ); + t_pa += Get_Timing_Info( t_start ); - if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) ) - { - /* compute the HouseHolder unit vector u_j+1 */ + t_start = Get_Time( ); for ( i = 0; i <= j; ++i ) { - u[j + 1][i] = 0; + Vector_Add( v + i, -2 * Dot( u[i] + i, v + i, N - i ), u[i] + i, N - i ); } - Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ); + if ( !Vector_isZero( v + (j + 1), N - (j + 1) ) ) + { + /* compute the HouseHolder unit vector u_j+1 */ + Vector_MakeZero( u[j + 1], j + 1 ); + Vector_Copy( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ); + ret_temp = Norm( v + (j + 1), N - (j + 1) ); +#ifdef _OPENMP + #pragma omp single +#endif + u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * ret_temp; - u[j + 1][j + 1] += ( v[j + 1] < 0.0 ? -1 : 1 ) * Norm( v + (j + 1), N - (j + 1) ); +#ifdef _OPENMP + #pragma omp barrier +#endif - Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N ); + Vector_Scale( u[j + 1], 1 / Norm( u[j + 1], N ), u[j + 1], N ); - /* overwrite v with P_m+1 * v */ - v[j + 1] -= 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1]; - Vector_MakeZero( v + (j + 2), N - (j + 2) ); - // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N ); - } + /* overwrite v with P_m+1 * v */ + ret_temp = 2 * Dot( u[j + 1] + (j + 1), v + (j + 1), N - (j + 1) ) * u[j + 1][j + 1]; +#ifdef _OPENMP + #pragma omp single +#endif + v[j + 1] -= ret_temp; +#ifdef _OPENMP + #pragma omp barrier +#endif - /* prev Givens rots on the upper-Hessenberg matrix to make it U */ - for ( i = 0; i < j; i++ ) - { - tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1]; - tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1]; + Vector_MakeZero( v + (j + 2), N - (j + 2) ); +// Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N ); + } + t_vops += Get_Timing_Info( t_start ); - v[i] = tmp1; - v[i + 1] = tmp2; - } + /* prev Givens rots on the upper-Hessenberg matrix to make it U */ + t_start = Get_Time( ); +#ifdef _OPENMP + #pragma omp single +#endif + { + for ( i = 0; i < j; i++ ) + { + tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i + 1]; + tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i + 1]; - /* apply the new Givens rotation to H and right-hand side */ - if ( FABS(v[j + 1]) >= ALMOST_ZERO ) - { - cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) ); - workspace->hc[j] = v[j] / cc; - workspace->hs[j] = v[j + 1] / cc; + v[i] = tmp1; + v[i + 1] = tmp2; + } - tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1]; - tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1]; + /* apply the new Givens rotation to H and right-hand side */ + if ( FABS(v[j + 1]) >= ALMOST_ZERO ) + { + cc = SQRT( SQR( v[j] ) + SQR( v[j + 1] ) ); + workspace->hc[j] = v[j] / cc; + workspace->hs[j] = v[j + 1] / cc; - v[j] = tmp1; - v[j + 1] = tmp2; + tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j + 1]; + tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j + 1]; - /* Givens rotations to rhs */ - tmp1 = workspace->hc[j] * w[j]; - tmp2 = -workspace->hs[j] * w[j]; - w[j] = tmp1; - w[j + 1] = tmp2; - } + v[j] = tmp1; + v[j + 1] = tmp2; - /* extend R */ - for ( i = 0; i <= j; ++i ) - { - workspace->h[i][j] = v[i]; + /* Givens rotations to rhs */ + tmp1 = workspace->hc[j] * w[j]; + tmp2 = -workspace->hs[j] * w[j]; + w[j] = tmp1; + w[j + 1] = tmp2; + } + + /* extend R */ + for ( i = 0; i <= j; ++i ) + { + workspace->h[i][j] = v[i]; + } + } + t_ortho += Get_Timing_Info( t_start ); } - // fprintf( stderr, "h:" ); - // for( i = 0; i <= j+1 ; ++i ) - // fprintf( stderr, "%.6f ", h[i][j] ); - // fprintf( stderr, "\n" ); - // fprintf( stderr, "%12.6f\n", w[j+1] ); - } + /* solve Hy = w. + H is now upper-triangular, do back-substitution */ + t_start = Get_Time( ); +#ifdef _OPENMP + #pragma omp single +#endif + { + for ( i = j - 1; i >= 0; i-- ) + { + temp = w[i]; + for ( k = j - 1; k > i; k-- ) + { + temp -= workspace->h[i][k] * workspace->y[k]; + } + workspace->y[i] = temp / workspace->h[i][i]; + } + } + t_ts += Get_Timing_Info( t_start ); - /* solve Hy = w. - H is now upper-triangular, do back-substitution */ - for ( i = j - 1; i >= 0; i-- ) - { - temp = w[i]; - for ( k = j - 1; k > i; k-- ) + t_start = Get_Time( ); + for ( i = j - 1; i >= 0; i-- ) { - temp -= workspace->h[i][k] * workspace->y[k]; + Vector_Add( x, workspace->y[i], z[i], N ); } + t_vops += Get_Timing_Info( t_start ); - workspace->y[i] = temp / workspace->h[i][i]; - } - - // fprintf( stderr, "y: " ); - // for( i = 0; i < control->cm_solver_restart+1; ++i ) - // fprintf( stderr, "%8.3f ", workspace->y[i] ); - - - /* update x = x_0 + Vy */ - // memset( z, 0, sizeof(real) * N ); - // for( i = j-1; i >= 0; i-- ) - // { - // Vector_Copy( v, z, N ); - // v[i] += workspace->y[i]; - // - // Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N ); - // } - // - // fprintf( stderr, "\nz: " ); - // for( k = 0; k < N; ++k ) - // fprintf( stderr, "%6.2f ", z[k] ); - - // fprintf( stderr, "\nx_bef: " ); - // for( i = 0; i < N; ++i ) - // fprintf( stderr, "%6.2f ", x[i] ); - - // Vector_Add( x, 1, z, N ); - for ( i = j - 1; i >= 0; i-- ) - { - Vector_Add( x, workspace->y[i], z[i], N ); + /* stopping condition */ + if ( FABS( w[j] ) / bnorm <= tol ) + { + break; + } } - /* stopping condition */ - if ( FABS( w[j] ) / bnorm <= tol ) +#ifdef _OPENMP + #pragma omp single +#endif { - break; + g_j = j; + g_itr = itr; } } - if ( itr >= control->cm_solver_max_iters ) +#ifdef _OPENMP + data->timing.cm_solver_orthog += t_ortho / control->num_threads; + data->timing.cm_solver_pre_app += t_pa / control->num_threads; + data->timing.cm_solver_spmv += t_spmv / control->num_threads; + data->timing.cm_solver_tri_solve += t_ts / control->num_threads; + data->timing.cm_solver_vector_ops += t_vops / control->num_threads; +#else + data->timing.cm_solver_orthog += t_ortho; + data->timing.cm_solver_pre_app += t_pa; + data->timing.cm_solver_spmv += t_spmv; + data->timing.cm_solver_tri_solve += t_ts; + data->timing.cm_solver_vector_ops += t_vops; +#endif + + if ( g_itr >= control->cm_solver_max_iters ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return itr * (control->cm_solver_restart + 1) + j + 1; + fprintf( stderr, "[WARNING] GMRES convergence failed (%d outer iters)\n", g_itr ); + return g_itr * (control->cm_solver_restart + 1) + j + 1; } - return itr * (control->cm_solver_restart + 1) + j + 1; + return g_itr * (control->cm_solver_restart + 1) + g_j + 1; } diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h index 9b1593f0ace23b5988a218a88533e87d9cf5560f..814701fe06c81a4fcd33530f9a16f70c12abdf5d 100644 --- a/sPuReMD/src/lin_alg.h +++ b/sPuReMD/src/lin_alg.h @@ -32,8 +32,8 @@ typedef enum void Sort_Matrix_Rows( sparse_matrix * const ); -void Setup_Sparsity_Pattern( const sparse_matrix * const, - const real, sparse_matrix ** ); +void setup_sparse_approx_inverse( const sparse_matrix * const, sparse_matrix **, + sparse_matrix **, sparse_matrix **, sparse_matrix **, const real ); int Estimate_LU_Fill( const sparse_matrix * const, const real * const ); @@ -62,7 +62,7 @@ real ILUT_PAR( const sparse_matrix * const, const real *, const unsigned int, sparse_matrix * const, sparse_matrix * const ); #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) -real Sparse_Approx_Inverse( const sparse_matrix * const, const sparse_matrix * const, +real sparse_approx_inverse( const sparse_matrix * const, const sparse_matrix * const, sparse_matrix ** ); #endif diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index 51f24d401b1337a9d8059df252d51864114c7106..023542b9408131135470014fd0770c180687e16a 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -890,8 +890,10 @@ typedef struct /* charge method storage */ sparse_matrix *H; + sparse_matrix *H_full; sparse_matrix *H_sp; sparse_matrix *H_spar_patt; + sparse_matrix *H_spar_patt_full; sparse_matrix *H_app_inv; sparse_matrix *L; sparse_matrix *U;