diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c index 72ad33318d9d32fe63b7a75e5296f89f26a903c8..5c2d3bed963b38ea951da040cefe96b8dc9b14de 100644 --- a/sPuReMD/src/charges.c +++ b/sPuReMD/src/charges.c @@ -771,10 +771,6 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, } data->timing.cm_sort_mat_rows += Get_Timing_Info( time ); -#if defined(DEBUG) - fprintf( stderr, "H matrix sorted\n" ); -#endif - switch ( control->cm_solver_pre_comp_type ) { case NONE_PC: @@ -791,18 +787,8 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, case ICHOLT_PC: Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); -#endif - fillin = Estimate_LU_Fill( Hptr, workspace->droptol ); -#if defined(DEBUG) - fprintf( stderr, "fillin = %d\n", fillin ); - fprintf( stderr, "allocated memory: L = U = %ldMB\n", - fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) ); -#endif - if ( workspace->L == NULL ) { if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE || @@ -839,10 +825,6 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, case ILUT_PAR_PC: Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); -#endif - if ( workspace->L == NULL ) { /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */ @@ -919,11 +901,12 @@ static void Setup_Preconditioner_EE( const reax_system * const system, } data->timing.cm_sort_mat_rows += Get_Timing_Info( time ); - Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0; - -#if defined(DEBUG) - fprintf( stderr, "H matrix sorted\n" ); -#endif + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILU_PAR_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0; + } switch ( control->cm_solver_pre_comp_type ) { @@ -941,18 +924,8 @@ static void Setup_Preconditioner_EE( const reax_system * const system, case ICHOLT_PC: Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); -#endif - fillin = Estimate_LU_Fill( Hptr, workspace->droptol ); -#if defined(DEBUG) - fprintf( stderr, "fillin = %d\n", fillin ); - fprintf( stderr, "allocated memory: L = U = %ldMB\n", - fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) ); -#endif - if ( workspace->L == NULL ) { if ( Allocate_Matrix( &(workspace->L), system->N_cm, fillin + system->N_cm ) == FAILURE || @@ -989,10 +962,6 @@ static void Setup_Preconditioner_EE( const reax_system * const system, case ILUT_PAR_PC: Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); -#endif - if ( workspace->L == NULL ) { /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */ @@ -1038,7 +1007,12 @@ static void Setup_Preconditioner_EE( const reax_system * const system, break; } - Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0; + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILU_PAR_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0; + } } @@ -1071,12 +1045,13 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, } data->timing.cm_sort_mat_rows += Get_Timing_Info( time ); - Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0; - Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0; - -#if defined(DEBUG) - fprintf( stderr, "H matrix sorted\n" ); -#endif + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILU_PAR_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + Hptr->val[Hptr->start[system->N + 1] - 1] = 1.0; + Hptr->val[Hptr->start[system->N_cm] - 1] = 1.0; + } switch ( control->cm_solver_pre_comp_type ) { @@ -1094,18 +1069,8 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, case ICHOLT_PC: Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); -#endif - fillin = Estimate_LU_Fill( Hptr, workspace->droptol ); -#if defined(DEBUG) - fprintf( stderr, "fillin = %d\n", fillin ); - fprintf( stderr, "allocated memory: L = U = %ldMB\n", - fillin * (sizeof(real) + sizeof(unsigned int)) / (1024 * 1024) ); -#endif - if ( workspace->L == NULL ) { if ( Allocate_Matrix( &(workspace->L), Hptr->n, fillin ) == FAILURE || @@ -1141,10 +1106,6 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, case ILUT_PAR_PC: Calculate_Droptol( Hptr, workspace->droptol, control->cm_solver_pre_comp_droptol ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "drop tolerances calculated\n" ); -#endif - if ( workspace->L == NULL ) { /* TODO: safest storage estimate is ILU(0) (same as lower triangular portion of H), could improve later */ @@ -1190,8 +1151,13 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, break; } - Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0; - Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0; + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILU_PAR_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0; + Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0; + } } @@ -1315,10 +1281,6 @@ static void QEq( reax_system * const system, control_params * const control, data->timing.cm_solver_iters += iters; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "linsolve-" ); -#endif - Calculate_Charges_QEq( system, workspace ); #if defined(DEBUG_FOCUS) @@ -1395,10 +1357,6 @@ static void EE( reax_system * const system, control_params * const control, data->timing.cm_solver_iters += iters; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "linsolve-" ); -#endif - Calculate_Charges_EE( system, workspace ); // if( data->step == control->nsteps ) @@ -1469,10 +1427,6 @@ static void ACKS2( reax_system * const system, control_params * const control, data->timing.cm_solver_iters += iters; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "linsolve-" ); -#endif - Calculate_Charges_EE( system, workspace ); } diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c index 7e8f0922400b532fb5b8e84fb8e9b2741cea6c03..c809655fa4b9796f09122388657b05229f5f96ad 100644 --- a/sPuReMD/src/init_md.c +++ b/sPuReMD/src/init_md.c @@ -501,27 +501,66 @@ void Init_Workspace( reax_system *system, control_params *control, break; } + /* SpMV related */ +#ifdef _OPENMP + workspace->b_local = (real*) smalloc( control->num_threads * system->N_cm * sizeof(real), + "Init_Workspace::b_local" ); +#endif + + /* level scheduling related */ + workspace->levels_L = 1; + workspace->levels_U = 1; + if ( control->cm_solver_pre_app_type == TRI_SOLVE_LEVEL_SCHED_PA || + control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA ) + { + workspace->row_levels_L = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int), + "Init_Workspace::row_levels_L" ); + workspace->level_rows_L = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int), + "Init_Workspace::level_rows_L" ); + workspace->level_rows_cnt_L = (unsigned int*) smalloc( (system->N_cm + 1) * sizeof(unsigned int), + "Init_Workspace::level_rows_cnt_L" ); + workspace->row_levels_U = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int), + "Init_Workspace::row_levels_U" ); + workspace->level_rows_U = (unsigned int*) smalloc( system->N_cm * sizeof(unsigned int), + "Init_Workspace::level_rows_U" ); + workspace->level_rows_cnt_U = (unsigned int*) smalloc( (system->N_cm + 1) * sizeof(unsigned int), + "Init_Workspace::level_rows_cnt_U" ); + workspace->top = (unsigned int*) smalloc( (system->N_cm + 1) * sizeof(unsigned int), + "Init_Workspace::top" ); + } + else + { + workspace->row_levels_L = NULL; + workspace->level_rows_L = NULL; + workspace->level_rows_cnt_L = NULL; + workspace->row_levels_U = NULL; + workspace->level_rows_U = NULL; + workspace->level_rows_cnt_U = NULL; + workspace->top = NULL; + } + /* graph coloring related */ + workspace->recolor_cnt = 0; if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA ) { workspace->color = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm, - "setup_graph_coloring::color" ); + "Init_Workspace::color" ); workspace->to_color = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm, - "setup_graph_coloring::to_color" ); + "Init_Workspace::to_color" ); workspace->conflict = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm, - "setup_graph_coloring::conflict" ); + "setup_graph_coloring::conflict" ); workspace->conflict_cnt = (unsigned int*) smalloc( sizeof(unsigned int) * (control->num_threads + 1), - "setup_graph_coloring::conflict_cnt" ); + "Init_Workspace::conflict_cnt" ); workspace->recolor = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm, - "setup_graph_coloring::recolor" ); + "Init_Workspace::recolor" ); workspace->color_top = (unsigned int*) smalloc( sizeof(unsigned int) * (system->N_cm + 1), - "setup_graph_coloring::color_top" ); + "Init_Workspace::color_top" ); workspace->permuted_row_col = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm, - "setup_graph_coloring::premuted_row_col" ); + "Init_Workspace::premuted_row_col" ); workspace->permuted_row_col_inv = (unsigned int*) smalloc( sizeof(unsigned int) * system->N_cm, - "setup_graph_coloring::premuted_row_col_inv" ); + "Init_Workspace::premuted_row_col_inv" ); workspace->y_p = (real*) smalloc( sizeof(real) * system->N_cm, - "setup_graph_coloring::y_p" ); + "Init_Workspace::y_p" ); workspace->x_p = (real*) smalloc( sizeof(real) * system->N_cm, "Init_Workspace::x_p" ); } @@ -531,9 +570,7 @@ void Init_Workspace( reax_system *system, control_params *control, workspace->to_color = NULL; workspace->conflict = NULL; workspace->conflict_cnt = NULL; - workspace->temp_ptr = NULL; workspace->recolor = NULL; - workspace->recolor_cnt = 0; workspace->color_top = NULL; workspace->permuted_row_col = NULL; workspace->permuted_row_col_inv = NULL; @@ -541,6 +578,29 @@ void Init_Workspace( reax_system *system, control_params *control, workspace->x_p = NULL; } + /* Jacobi iteration related */ + if ( control->cm_solver_pre_app_type == JACOBI_ITER_PA ) + { + workspace->Dinv_L = (real*) smalloc( sizeof(real) * system->N_cm, + "Init_Workspace::Dinv_L" ); + workspace->Dinv_U = (real*) smalloc( sizeof(real) * system->N_cm, + "Init_Workspace::Dinv_U" ); + workspace->Dinv_b = (real*) smalloc( sizeof(real) * system->N_cm, + "Init_Workspace::Dinv_b" ); + workspace->rp = (real*) smalloc( sizeof(real) * system->N_cm, + "Init_Workspace::rp" ); + workspace->rp2 = (real*) smalloc( sizeof(real) * system->N_cm, + "Init_Workspace::rp2" ); + } + else + { + workspace->Dinv_L = NULL; + workspace->Dinv_U = NULL; + workspace->Dinv_b = NULL; + workspace->rp = NULL; + workspace->rp2 = NULL; + } + /* integrator storage */ workspace->a = (rvec *) smalloc( system->N * sizeof( rvec ), "Init_Workspace::workspace->a" ); @@ -993,7 +1053,7 @@ void Initialize( reax_system *system, control_params *control, #endif #ifdef _OPENMP - #pragma omp parallel default(shared) + #pragma omp parallel default(none) shared(control) { #pragma omp single control->num_threads = omp_get_num_threads( ); @@ -1211,6 +1271,24 @@ void Finalize_Workspace( reax_system *system, control_params *control, break; } + /* SpMV related */ +#ifdef _OPENMP + sfree( workspace->b_local, "Finalize_Workspace::b_local" ); +#endif + + /* level scheduling related */ + if ( control->cm_solver_pre_app_type == TRI_SOLVE_LEVEL_SCHED_PA || + control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA ) + { + sfree( workspace->row_levels_L, "Finalize_Workspace::row_levels_L" ); + sfree( workspace->level_rows_L, "Finalize_Workspace::level_rows_L" ); + sfree( workspace->level_rows_cnt_L, "Finalize_Workspace::level_rows_cnt_L" ); + sfree( workspace->row_levels_U, "Finalize_Workspace::row_levels_U" ); + sfree( workspace->level_rows_U, "Finalize_Workspace::level_rows_U" ); + sfree( workspace->level_rows_cnt_U, "Finalize_Workspace::level_rows_cnt_U" ); + sfree( workspace->top, "Finalize_Workspace::top" ); + } + /* graph coloring related */ if ( control->cm_solver_pre_app_type == TRI_SOLVE_GC_PA ) { @@ -1226,6 +1304,16 @@ void Finalize_Workspace( reax_system *system, control_params *control, sfree( workspace->x_p, "Finalize_Workspace::workspace->x_p" ); } + /* Jacobi iteration related */ + if ( control->cm_solver_pre_app_type == JACOBI_ITER_PA ) + { + sfree( workspace->Dinv_L, "Finalize_Workspace::Dinv_L" ); + sfree( workspace->Dinv_U, "Finalize_Workspace::Dinv_U" ); + sfree( workspace->Dinv_b, "Finalize_Workspace::Dinv_b" ); + sfree( workspace->rp, "Finalize_Workspace::rp" ); + sfree( workspace->rp2, "Finalize_Workspace::rp2" ); + } + /* integrator storage */ sfree( workspace->a, "Finalize_Workspace::workspace->a" ); sfree( workspace->f_old, "Finalize_Workspace::workspace->f_old" ); diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index 5ddead00bce98056396a449517ff507b719055de..69e862bb777e9f9245b6c45f8bb30d460b1a23c7 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -50,34 +50,6 @@ enum preconditioner_type }; -/* global to make OpenMP shared (Sparse_MatVec) */ -#ifdef _OPENMP -static real *b_local = NULL; -#endif -/* global to make OpenMP shared (apply_preconditioner) */ -static real *Dinv_L = NULL; -static real *Dinv_U = NULL; -/* global to make OpenMP shared (tri_solve_level_sched) */ -static int levels = 1; -static int levels_L = 1; -static int levels_U = 1; -static unsigned int *row_levels_L = NULL; -static unsigned int *level_rows_L = NULL; -static unsigned int *level_rows_cnt_L = NULL; -static unsigned int *row_levels_U = NULL; -static unsigned int *level_rows_U = NULL; -static unsigned int *level_rows_cnt_U = NULL; -static unsigned int *row_levels; -static unsigned int *level_rows; -static unsigned int *level_rows_cnt; -static unsigned int *top = NULL; -/* global to make OpenMP shared (jacobi_iter) */ -static real *Dinv_b = NULL; -static real *rp = NULL; -static real *rp2 = NULL; -static real *rp3 = NULL; - - #if defined(TEST_MAT) static sparse_matrix * create_test_mat( void ) { @@ -415,8 +387,6 @@ void Calculate_Droptol( const sparse_matrix * const A, #pragma omp master { - /* keep b_local for program duration to avoid allocate/free - * overhead per Sparse_MatVec call*/ if ( droptol_local == NULL ) { droptol_local = (real*) smalloc( omp_get_num_threads() * A->n * sizeof(real), @@ -1842,12 +1812,13 @@ real sparse_approx_inverse( const sparse_matrix * const A, /* sparse matrix-vector product Ax = b - * where: - * A: lower triangular matrix, stored in CSR format - * x: vector - * b: vector (result) */ -static void Sparse_MatVec( const sparse_matrix * const A, - const real * const x, real * const b ) + * + * workspace: storage container for workspace structures + * A: lower triangular matrix, stored in CSR format + * x: vector + * b (output): vector */ +static void Sparse_MatVec( const static_storage * const workspace, + const sparse_matrix * const A, const real * const x, real * const b ) { int i, j, k, n, si, ei; real H; @@ -1861,18 +1832,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, #ifdef _OPENMP tid = omp_get_thread_num( ); - #pragma omp single - { - /* keep b_local for program duration to avoid allocate/free - * overhead per Sparse_MatVec call*/ - if ( b_local == NULL ) - { - b_local = (real*) smalloc( omp_get_num_threads() * n * sizeof(real), - "Sparse_MatVec::b_local" ); - } - } - - Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n ); + Vector_MakeZero( workspace->b_local, omp_get_num_threads() * n ); #pragma omp for schedule(static) #endif @@ -1886,8 +1846,8 @@ static void Sparse_MatVec( const sparse_matrix * const A, j = A->j[k]; H = A->val[k]; #ifdef _OPENMP - b_local[tid * n + j] += H * x[i]; - b_local[tid * n + i] += H * x[j]; + workspace->b_local[tid * n + j] += H * x[i]; + workspace->b_local[tid * n + i] += H * x[j]; #else b[j] += H * x[i]; b[i] += H * x[j]; @@ -1896,7 +1856,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, // the diagonal entry is the last one in #ifdef _OPENMP - b_local[tid * n + i] += A->val[k] * x[i]; + workspace->b_local[tid * n + i] += A->val[k] * x[i]; #else b[i] += A->val[k] * x[i]; #endif @@ -1908,7 +1868,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, { for ( j = 0; j < omp_get_num_threads(); ++j ) { - b[i] += b_local[j * n + i]; + b[i] += workspace->b_local[j * n + i]; } } #endif @@ -2090,6 +2050,7 @@ void tri_solve( const sparse_matrix * const LU, const real * const y, /* Solve triangular system LU*x = y using level scheduling * + * workspace: storage container for workspace structures * LU: lower/upper triangular, stored in CSR * y: constants in linear system (RHS) * x: solution @@ -2100,53 +2061,38 @@ void tri_solve( const sparse_matrix * const LU, const real * const y, * Assumptions: * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ -void tri_solve_level_sched( const sparse_matrix * const LU, - const real * const y, real * const x, const int N, - const TRIANGULARITY tri, int find_levels ) +void tri_solve_level_sched( static_storage * workspace, + const sparse_matrix * const LU, + const real * const y, real * const x, const int N, + const TRIANGULARITY tri, int find_levels ) { int i, j, pj, local_row, local_level; + unsigned int *row_levels, *level_rows, *level_rows_cnt; + int levels; + + if ( tri == LOWER ) + { + row_levels = workspace->row_levels_L; + level_rows = workspace->level_rows_L; + level_rows_cnt = workspace->level_rows_cnt_L; + } + else + { + row_levels = workspace->row_levels_U; + level_rows = workspace->level_rows_U; + level_rows_cnt = workspace->level_rows_cnt_U; + } #ifdef _OPENMP #pragma omp single #endif { - if ( tri == LOWER ) - { - row_levels = row_levels_L; - level_rows = level_rows_L; - level_rows_cnt = level_rows_cnt_L; - levels = levels_L; - } - else - { - row_levels = row_levels_U; - level_rows = level_rows_U; - level_rows_cnt = level_rows_cnt_U; - levels = levels_U; - } - - if ( row_levels == NULL || level_rows == NULL || level_rows_cnt == NULL ) - { - row_levels = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int), - "tri_solve_level_sched::row_levels" ); - level_rows = (unsigned int*) smalloc( (size_t)N * sizeof(unsigned int), - "tri_solve_level_sched::level_rows" ); - level_rows_cnt = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int), - "tri_solve_level_sched::level_rows_cnt" ); - } - - if ( top == NULL ) - { - top = (unsigned int*) smalloc( (size_t)(N + 1) * sizeof(unsigned int), - "tri_solve_level_sched::top" ); - } - /* find levels (row dependencies in substitutions) */ if ( find_levels == TRUE ) { memset( row_levels, 0, N * sizeof(unsigned int) ); memset( level_rows_cnt, 0, N * sizeof(unsigned int) ); - memset( top, 0, N * sizeof(unsigned int) ); + memset( workspace->top, 0, N * sizeof(unsigned int) ); levels = 1; if ( tri == LOWER ) @@ -2164,6 +2110,8 @@ void tri_solve_level_sched( const sparse_matrix * const LU, ++level_rows_cnt[local_level]; } + workspace->levels_L = levels; + //#if defined(DEBUG) fprintf(stderr, "levels(L): %d\n", levels); fprintf(stderr, "NNZ(L): %d\n", LU->start[N]); @@ -2184,6 +2132,8 @@ void tri_solve_level_sched( const sparse_matrix * const LU, ++level_rows_cnt[local_level]; } + workspace->levels_U = levels; + //#if defined(DEBUG) fprintf(stderr, "levels(U): %d\n", levels); fprintf(stderr, "NNZ(U): %d\n", LU->start[N]); @@ -2193,17 +2143,26 @@ void tri_solve_level_sched( const sparse_matrix * const LU, for ( i = 1; i < levels + 1; ++i ) { level_rows_cnt[i] += level_rows_cnt[i - 1]; - top[i] = level_rows_cnt[i]; + workspace->top[i] = level_rows_cnt[i]; } for ( i = 0; i < N; ++i ) { - level_rows[top[row_levels[i] - 1]] = i; - ++top[row_levels[i] - 1]; + level_rows[workspace->top[row_levels[i] - 1]] = i; + ++workspace->top[row_levels[i] - 1]; } } } + if ( tri == LOWER ) + { + levels = workspace->levels_L; + } + else + { + levels = workspace->levels_U; + } + /* perform substitutions by level */ if ( tri == LOWER ) { @@ -2243,27 +2202,6 @@ void tri_solve_level_sched( const sparse_matrix * const LU, } } } - -#ifdef _OPENMP - #pragma omp single -#endif - { - /* save level info for re-use if performing repeated triangular solves via preconditioning */ - if ( tri == LOWER ) - { - row_levels_L = row_levels; - level_rows_L = level_rows; - level_rows_cnt_L = level_rows_cnt; - levels_L = levels; - } - else - { - row_levels_U = row_levels; - level_rows_U = level_rows; - level_rows_cnt_U = level_rows_cnt; - levels_U = levels; - } - } } @@ -2283,7 +2221,8 @@ void tri_solve_level_sched( const sparse_matrix * const LU, * and Massively Threaded Architectures * Parallel Computing, 2012 */ -void graph_coloring( const control_params * const control, static_storage * const workspace, +void graph_coloring( const control_params * const control, + static_storage * workspace, const sparse_matrix * const A, const TRIANGULARITY tri ) { #ifdef _OPENMP @@ -2293,18 +2232,28 @@ void graph_coloring( const control_params * const control, static_storage * cons int i, pj, v; unsigned int temp, recolor_cnt_local, *conflict_local; int tid, *fb_color; + unsigned int *p_to_color, *p_conflict, *p_temp; #ifdef _OPENMP - tid = omp_get_thread_num(); + tid = omp_get_thread_num( ); #else tid = 0; #endif + p_to_color = workspace->to_color; + p_conflict = workspace->conflict; + +#ifdef _OPENMP + #pragma omp for schedule(static) +#endif + for ( i = 0; i < A->n; ++i ) + { + workspace->color[i] = 0; + } #ifdef _OPENMP #pragma omp single #endif { - memset( workspace->color, 0, sizeof(unsigned int) * A->n ); workspace->recolor_cnt = A->n; } @@ -2317,7 +2266,7 @@ void graph_coloring( const control_params * const control, static_storage * cons #endif for ( i = 0; i < A->n; ++i ) { - workspace->to_color[i] = i; + p_to_color[i] = i; } } else @@ -2327,14 +2276,14 @@ void graph_coloring( const control_params * const control, static_storage * cons #endif for ( i = 0; i < A->n; ++i ) { - workspace->to_color[i] = A->n - 1 - i; + p_to_color[i] = A->n - 1 - i; } } fb_color = (int*) smalloc( sizeof(int) * A->n, - "graph_coloring::fb_color" ); + "graph_coloring::fb_color" ); conflict_local = (unsigned int*) smalloc( sizeof(unsigned int) * A->n, - "graph_coloring::fb_color" ); + "graph_coloring::fb_color" ); while ( workspace->recolor_cnt > 0 ) { @@ -2346,7 +2295,7 @@ void graph_coloring( const control_params * const control, static_storage * cons #endif for ( i = 0; i < workspace->recolor_cnt; ++i ) { - v = workspace->to_color[i]; + v = p_to_color[i]; /* colors of adjacent vertices are forbidden */ for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj ) @@ -2371,6 +2320,8 @@ void graph_coloring( const control_params * const control, static_storage * cons recolor_cnt_local = 0; #ifdef _OPENMP + #pragma omp barrier + #pragma omp single #endif { @@ -2382,7 +2333,7 @@ void graph_coloring( const control_params * const control, static_storage * cons #endif for ( i = 0; i < temp; ++i ) { - v = workspace->to_color[i]; + v = p_to_color[i]; /* search for color conflicts with adjacent vertices */ for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj ) @@ -2416,20 +2367,16 @@ void graph_coloring( const control_params * const control, static_storage * cons /* copy thread-local conflicts into shared buffer */ for ( i = 0; i < recolor_cnt_local; ++i ) { - workspace->conflict[workspace->conflict_cnt[tid] + i] = conflict_local[i]; + p_conflict[workspace->conflict_cnt[tid] + i] = conflict_local[i]; workspace->color[conflict_local[i]] = 0; } #ifdef _OPENMP #pragma omp barrier - - #pragma omp single #endif - { - workspace->temp_ptr = workspace->to_color; - workspace->to_color = workspace->conflict; - workspace->conflict = workspace->temp_ptr; - } + p_temp = p_to_color; + p_to_color = p_conflict; + p_conflict = p_temp; } sfree( conflict_local, "graph_coloring::conflict_local" ); @@ -2444,7 +2391,7 @@ void graph_coloring( const control_params * const control, static_storage * cons * n: number of entries in coloring * tri: coloring to triangular factor to use (lower/upper) */ -void sort_rows_by_colors( static_storage * const workspace, +void sort_rows_by_colors( const static_storage * const workspace, const unsigned int n, const TRIANGULARITY tri ) { unsigned int i; @@ -2488,7 +2435,7 @@ void sort_rows_by_colors( static_storage * const workspace, * invert_map: if TRUE, use Q^T, otherwise use Q * tri: coloring to triangular factor to use (lower/upper) */ -static void permute_vector( static_storage * workspace, +static void permute_vector( const static_storage * const workspace, real * const x, const unsigned int n, const int invert_map, const TRIANGULARITY tri ) { @@ -2529,7 +2476,7 @@ static void permute_vector( static_storage * workspace, * LU: matrix to permute, stored in CSR format * tri: triangularity of LU (lower/upper) */ -void permute_matrix( static_storage * const workspace, +void permute_matrix( const static_storage * const workspace, sparse_matrix * const LU, const TRIANGULARITY tri ) { int i, pj, nr, nc; @@ -2669,7 +2616,7 @@ void permute_matrix( static_storage * const workspace, * H_p (output): permuted copy of H based on coloring, lower half stored, CSR format */ void setup_graph_coloring( const control_params * const control, - static_storage * const workspace, const sparse_matrix * const H, + const static_storage * const workspace, const sparse_matrix * const H, sparse_matrix ** H_full, sparse_matrix ** H_p ) { if ( *H_p == NULL ) @@ -2692,7 +2639,7 @@ void setup_graph_coloring( const control_params * const control, compute_full_sparse_matrix( H, H_full ); - graph_coloring( control, workspace, *H_full, LOWER ); + graph_coloring( control, (static_storage *) workspace, *H_full, LOWER ); sort_rows_by_colors( workspace, (*H_full)->n, LOWER ); memcpy( (*H_p)->start, H->start, sizeof(int) * (H->n + 1) ); @@ -2712,36 +2659,31 @@ void setup_graph_coloring( const control_params * const control, * triangular factors in iterative linear solvers * * Note: Newmann series arises from series expansion of the inverse of - * the coefficient matrix in the triangular system */ -void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, - const real * const b, real * const x, const TRIANGULARITY tri, const - unsigned int maxiter ) + * the coefficient matrix in the triangular system + * + * workspace: storage container for workspace structures + * R: + * Dinv: + * b: + * x (output): + * tri: + * maxiter: + * */ +void jacobi_iter( const static_storage * const workspace, + const sparse_matrix * const R, const real * const Dinv, + const real * const b, real * const x, const TRIANGULARITY tri, + const unsigned int maxiter ) { unsigned int i, k, si, ei, iter; + real *p1, *p2, *p3; si = 0; ei = 0; iter = 0; + p1 = workspace->rp; + p2 = workspace->rp2; -#ifdef _OPENMP - #pragma omp single -#endif - { - if ( Dinv_b == NULL ) - { - Dinv_b = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::Dinv_b" ); - } - if ( rp == NULL ) - { - rp = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp" ); - } - if ( rp2 == NULL ) - { - rp2 = (real*) smalloc( sizeof(real) * R->n, "jacobi_iter::rp2" ); - } - } - - Vector_MakeZero( rp, R->n ); + Vector_MakeZero( p1, R->n ); /* precompute and cache, as invariant in loop below */ #ifdef _OPENMP @@ -2749,12 +2691,12 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, #endif for ( i = 0; i < R->n; ++i ) { - Dinv_b[i] = Dinv[i] * b[i]; + workspace->Dinv_b[i] = Dinv[i] * b[i]; } do { - // x_{k+1} = G*x_{k} + Dinv*b; + /* x_{k+1} = G*x_{k} + Dinv*b */ #ifdef _OPENMP #pragma omp for schedule(guided) #endif @@ -2772,31 +2714,26 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, ei = R->start[i + 1]; } - rp2[i] = 0.; + p2[i] = 0.; for ( k = si; k < ei; ++k ) { - rp2[i] += R->val[k] * rp[R->j[k]]; + p2[i] += R->val[k] * p1[R->j[k]]; } - rp2[i] *= -Dinv[i]; - rp2[i] += Dinv_b[i]; + p2[i] *= -Dinv[i]; + p2[i] += workspace->Dinv_b[i]; } -#ifdef _OPENMP - #pragma omp single -#endif - { - rp3 = rp; - rp = rp2; - rp2 = rp3; - } + p3 = p1; + p1 = p2; + p2 = p3; ++iter; } while ( iter < maxiter ); - Vector_Copy( x, rp, R->n ); + Vector_Copy( x, p1, R->n ); } @@ -2864,7 +2801,8 @@ static void apply_preconditioner( const static_storage * const workspace, const case ICHOLT_PC: case ILU_PAR_PC: case ILUT_PAR_PC: - tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); + tri_solve_level_sched( (static_storage *) workspace, + workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); break; case SAI_PC: Sparse_MatVec_full( workspace->H_app_inv, y, x ); @@ -2887,14 +2825,16 @@ static void apply_preconditioner( const static_storage * const workspace, const case ILU_PAR_PC: case ILUT_PAR_PC: #ifdef _OPENMP - #pragma omp single + #pragma omp for schedule(static) #endif + for ( i = 0; i < workspace->H->n; ++i ) { - memcpy( workspace->y_p, y, sizeof(real) * workspace->H->n ); + workspace->y_p[i] = y[i]; } - permute_vector( (static_storage *) workspace, workspace->y_p, workspace->H->n, FALSE, LOWER ); - tri_solve_level_sched( workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre ); + permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER ); + tri_solve_level_sched( (static_storage *) workspace, + workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre ); break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); @@ -2913,18 +2853,7 @@ static void apply_preconditioner( const static_storage * const workspace, const case ICHOLT_PC: case ILU_PAR_PC: case ILUT_PAR_PC: -#ifdef _OPENMP - #pragma omp single -#endif - { - if ( Dinv_L == NULL ) - { - Dinv_L = (real*) smalloc( sizeof(real) * workspace->L->n, - "apply_preconditioner::Dinv_L" ); - } - } - - /* construct D^{-1}_L */ + /* construct D^{-1}_L */ if ( fresh_pre == TRUE ) { #ifdef _OPENMP @@ -2933,11 +2862,12 @@ static void apply_preconditioner( const static_storage * const workspace, const for ( i = 0; i < workspace->L->n; ++i ) { si = workspace->L->start[i + 1] - 1; - Dinv_L[i] = 1.0 / workspace->L->val[si]; + workspace->Dinv_L[i] = 1.0 / workspace->L->val[si]; } } - jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); + jacobi_iter( workspace, workspace->L, workspace->Dinv_L, + y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); @@ -2990,7 +2920,8 @@ static void apply_preconditioner( const static_storage * const workspace, const case ICHOLT_PC: case ILU_PAR_PC: case ILUT_PAR_PC: - tri_solve_level_sched( workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); + tri_solve_level_sched( (static_storage *) workspace, + workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); @@ -3009,8 +2940,9 @@ static void apply_preconditioner( const static_storage * const workspace, const case ICHOLT_PC: case ILU_PAR_PC: case ILUT_PAR_PC: - tri_solve_level_sched( workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); - permute_vector( (static_storage *) workspace, x, workspace->H->n, TRUE, UPPER ); + tri_solve_level_sched( (static_storage *) workspace, + workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); + permute_vector( workspace, x, workspace->H->n, TRUE, UPPER ); break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); @@ -3029,18 +2961,7 @@ static void apply_preconditioner( const static_storage * const workspace, const case ICHOLT_PC: case ILU_PAR_PC: case ILUT_PAR_PC: -#ifdef _OPENMP - #pragma omp single -#endif - { - if ( Dinv_U == NULL ) - { - Dinv_U = (real*) smalloc( sizeof(real) * workspace->U->n, - "apply_preconditioner::Dinv_U" ); - } - } - - /* construct D^{-1}_U */ + /* construct D^{-1}_U */ if ( fresh_pre == TRUE ) { #ifdef _OPENMP @@ -3049,11 +2970,12 @@ static void apply_preconditioner( const static_storage * const workspace, const for ( i = 0; i < workspace->U->n; ++i ) { si = workspace->U->start[i]; - Dinv_U[i] = 1. / workspace->U->val[si]; + workspace->Dinv_U[i] = 1.0 / workspace->U->val[si]; } } - jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); + jacobi_iter( workspace, workspace->U, workspace->Dinv_U, + y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); @@ -3115,7 +3037,7 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* calculate r0 */ t_start = Get_Time( ); - Sparse_MatVec( H, x, workspace->b_prm ); + Sparse_MatVec( workspace, H, x, workspace->b_prm ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3145,7 +3067,7 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* matvec */ t_start = Get_Time( ); - Sparse_MatVec( H, workspace->v[j], workspace->b_prc ); + Sparse_MatVec( workspace, H, workspace->v[j], workspace->b_prc ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3323,7 +3245,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, { /* compute z = r0 */ t_start = Get_Time( ); - Sparse_MatVec( H, x, workspace->b_prm ); + Sparse_MatVec( workspace, H, x, workspace->b_prm ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3364,7 +3286,7 @@ int GMRES_HouseHolder( const static_storage * const workspace, /* matvec */ t_start = Get_Time( ); - Sparse_MatVec( H, z[j], workspace->b_prc ); + Sparse_MatVec( workspace, H, z[j], workspace->b_prc ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3553,7 +3475,7 @@ int CG( const static_storage * const workspace, const control_params * const con t_vops += Get_Timing_Info( t_start ); t_start = Get_Time( ); - Sparse_MatVec( H, x, d ); + Sparse_MatVec( workspace, H, x, d ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3574,7 +3496,7 @@ int CG( const static_storage * const workspace, const control_params * const con for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { t_start = Get_Time( ); - Sparse_MatVec( H, p, d ); + Sparse_MatVec( workspace, H, p, d ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3649,7 +3571,7 @@ int SDM( const static_storage * const workspace, const control_params * const co t_vops += Get_Timing_Info( t_start ); t_start = Get_Time( ); - Sparse_MatVec( H, x, workspace->q ); + Sparse_MatVec( workspace, H, x, workspace->q ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); @@ -3668,7 +3590,7 @@ int SDM( const static_storage * const workspace, const control_params * const co for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / b_norm > tol; ++i ) { t_start = Get_Time( ); - Sparse_MatVec( H, workspace->d, workspace->q ); + Sparse_MatVec( workspace, H, workspace->d, workspace->q ); t_spmv += Get_Timing_Info( t_start ); t_start = Get_Time( ); diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h index 164aa05a91a7d6517dfb04ff7a716f96e22faf21..ab0237c14281eb31ddabcd41087b00a475d0ccd0 100644 --- a/sPuReMD/src/lin_alg.h +++ b/sPuReMD/src/lin_alg.h @@ -73,16 +73,18 @@ void Transpose_I( sparse_matrix * const ); void tri_solve( const sparse_matrix * const, const real * const, real * const, const int, const TRIANGULARITY ); -void tri_solve_level_sched( const sparse_matrix * const, +void tri_solve_level_sched( static_storage *, + const sparse_matrix * const, const real * const, real * const, const int, const TRIANGULARITY, int ); -void jacobi_iter( const sparse_matrix * const, const real * const, +void jacobi_iter( const static_storage * const, + const sparse_matrix * const, const real * const, const real * const, real * const, const TRIANGULARITY, const unsigned int ); void setup_graph_coloring( const control_params * const, - static_storage * const, const sparse_matrix * const, + const static_storage * const, const sparse_matrix * const, sparse_matrix **, sparse_matrix ** ); int GMRES( const static_storage * const, const control_params * const, diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index 24ed7fd32bf7d7f9b71082cfc10ea2574716f8c9..61de424a6363a79a6ce51f993ebade2ca7d7abbd 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -977,13 +977,29 @@ typedef struct real *q; real *p; + /* SpMV related storage */ +#ifdef _OPENMP + real *b_local; +#endif + + /* Level scheduling related storage for applying, e.g. ICHOLT and ILU(T), + * preconditioners */ + int levels_L; + int levels_U; + unsigned int *row_levels_L; + unsigned int *level_rows_L; + unsigned int *level_rows_cnt_L; + unsigned int *row_levels_U; + unsigned int *level_rows_U; + unsigned int *level_rows_cnt_U; + unsigned int *top; + /* Graph coloring related storage for applying, e.g. ICHOLT and ILU(T), * preconditioners */ unsigned int *color; unsigned int *to_color; unsigned int *conflict; unsigned int *conflict_cnt; - unsigned int *temp_ptr; unsigned int *recolor; unsigned int recolor_cnt; unsigned int *color_top; @@ -992,6 +1008,14 @@ typedef struct real *y_p; real *x_p; + /* Jacobi iteration related storage for applying, e.g. ICHOLT and ILU(T), + * preconditioners */ + real *Dinv_L; + real *Dinv_U; + real *Dinv_b; + real *rp; + real *rp2; + int num_H; int *hbond_index; // for hydrogen bonds