From 3bbf2ba329cb69e1643139e5fe871fee84615ddc Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Wed, 18 May 2016 11:01:41 -0700 Subject: [PATCH] Refactor shared-memory parallelism. Add performance timing code for SpMV. --- puremd_rc_1003/sPuReMD/GMRES.c | 159 ++++++++++++++++++--------- puremd_rc_1003/sPuReMD/GMRES.h | 6 +- puremd_rc_1003/sPuReMD/init_md.c | 5 +- puremd_rc_1003/sPuReMD/mytypes.h | 7 +- puremd_rc_1003/sPuReMD/print_utils.c | 6 +- puremd_rc_1003/sPuReMD/vector.c | 11 -- puremd_rc_1003/sPuReMD/vector.h | 1 - 7 files changed, 118 insertions(+), 77 deletions(-) diff --git a/puremd_rc_1003/sPuReMD/GMRES.c b/puremd_rc_1003/sPuReMD/GMRES.c index 9103ba94..8f8d0d20 100644 --- a/puremd_rc_1003/sPuReMD/GMRES.c +++ b/puremd_rc_1003/sPuReMD/GMRES.c @@ -38,7 +38,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, int i, j, k, n, si, ei; real H; #ifdef _OPENMP - static real **b_local; + static real *b_local; unsigned int tid; #endif @@ -46,7 +46,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, Vector_MakeZero( b, n ); #pragma omp parallel \ - default(none) shared(n, b_local) private(si, ei, H, i, j, k, tid) + default(none) shared(n, b, b_local) private(si, ei, H, i, j, k, tid) { #ifdef _OPENMP tid = omp_get_thread_num(); @@ -57,19 +57,16 @@ static void Sparse_MatVec( const sparse_matrix * const A, * overhead per Sparse_MatVec call*/ if ( b_local == NULL ) { - b_local = (real**) malloc( omp_get_num_threads() * sizeof(real*)); - for ( i = 0; i < omp_get_num_threads(); ++i ) - { - if ( (b_local[i] = (real*) malloc( n * sizeof(real))) == NULL ) - { - exit( INSUFFICIENT_SPACE ); - } - } + if ( (b_local = (real*) malloc( omp_get_num_threads() * n * sizeof(real))) == NULL ) + { + exit( INSUFFICIENT_SPACE ); + } } + + Vector_MakeZero( b_local, omp_get_num_threads() * n ); } #pragma omp barrier - Vector_MakeZero( b_local[tid], n ); #endif #pragma omp for schedule(guided) for ( i = 0; i < n; ++i ) @@ -82,8 +79,8 @@ static void Sparse_MatVec( const sparse_matrix * const A, j = A->entries[k].j; H = A->entries[k].val; #ifdef _OPENMP - b_local[tid][j] += H * x[i]; - b_local[tid][i] += H * x[j]; + b_local[tid * n + j] += H * x[i]; + b_local[tid * n + i] += H * x[j]; #else b[j] += H * x[i]; b[i] += H * x[j]; @@ -92,20 +89,18 @@ static void Sparse_MatVec( const sparse_matrix * const A, // the diagonal entry is the last one in #ifdef _OPENMP - b_local[tid][i] += A->entries[k].val * x[i]; + b_local[tid * n + i] += A->entries[k].val * x[i]; #else b[i] += A->entries[k].val * x[i]; #endif } #ifdef _OPENMP - #pragma omp master + #pragma omp for schedule(guided) + for ( i = 0; i < n; ++i ) { - for ( i = 0; i < omp_get_num_threads(); ++i ) + for ( j = 0; j < omp_get_num_threads(); ++j ) { - for ( j = 0; j < n; ++j ) - { - b[j] += b_local[i][j]; - } + b[i] += b_local[j * n + i]; } } #endif @@ -115,30 +110,48 @@ static void Sparse_MatVec( const sparse_matrix * const A, /* sparse matrix-vector product Gx=b (for Jacobi iteration), + * followed by vector addition of D^{-1}b, * where G = (I - D^{-1}R) (G not explicitly computed and stored) * R: strictly triangular matrix (diagonals not used) * tri: triangularity of A (lower/upper) - * D_inv: inverse of the diagonal of R + * D^{-1} (D_inv): inverse of the diagonal of R * x: vector - * b: vector (result) */ -static void Sparse_MatVec2( const sparse_matrix * const R, + * b: vector (result) + * D^{-1}b (Dinv_b): precomputed vector-vector product */ +static void Sparse_MatVec_Vector_Add( const sparse_matrix * const R, const TRIANGULARITY tri, const real * const Dinv, - const real * const x, real * const b) + const real * const x, real * const b, const real * const Dinv_b) { int i, k, si = 0, ei = 0; #ifdef _OPENMP - real *b_local; + static real *b_local; + unsigned int tid; #endif Vector_MakeZero( b, R->n ); - #pragma omp parallel default(none) private(b_local, i, k) firstprivate(si, ei) + #pragma omp parallel \ + default(none) shared(b, b_local) private(si, ei, i, k, tid) { #ifdef _OPENMP - if ( (b_local = (real*) calloc(R->n, sizeof(real))) == NULL ) + tid = omp_get_thread_num(); + + #pragma omp master { - exit( INSUFFICIENT_SPACE ); - } + /* keep b_local for program duration to avoid allocate/free + * overhead per Sparse_MatVec call*/ + if ( b_local == NULL ) + { + if ( (b_local = (real*) malloc( omp_get_num_threads() * R->n * sizeof(real))) == NULL ) + { + exit( INSUFFICIENT_SPACE ); + } + } + + Vector_MakeZero( b_local, omp_get_num_threads() * R->n ); + } + #pragma omp barrier + #endif #pragma omp for schedule(guided) for ( i = 0; i < R->n; ++i ) @@ -158,27 +171,28 @@ static void Sparse_MatVec2( const sparse_matrix * const R, for ( k = si; k < ei; ++k ) { #ifdef _OPENMP - b_local[i] += R->entries[k].val * x[R->entries[k].j]; + b_local[tid * R->n + i] += R->entries[k].val * x[R->entries[k].j]; #else b[i] += R->entries[k].val * x[R->entries[k].j]; #endif } #ifdef _OPENMP - b_local[i] *= -Dinv[i]; + b_local[tid * R->n + i] *= -Dinv[i]; #else b[i] *= -Dinv[i]; #endif } #ifdef _OPENMP - #pragma omp critical(redux) + #pragma omp for schedule(guided) + for ( i = 0; i < R->n; ++i ) { - for ( i = 0; i < R->n; ++i ) + for ( k = 0; k < omp_get_num_threads(); ++k ) { - b[i] += b_local[i]; + b[i] += b_local[k * R->n + i]; } - } - free(b_local); + b[i] += Dinv_b[i]; + } #endif } } @@ -237,23 +251,40 @@ void Backward_Subs( sparse_matrix *U, real *y, real *x ) * D = diagonal matrix, diagonals from R * * Note: used during the backsolves when applying preconditioners with - * triangular factors to iterative linear solvers + * triangular factors in iterative linear solvers * * Note: Newmann series arises from series expansion of the inverse of - * the coefficient matrix in the triangular system. */ -static void Jacobi_Iter( const sparse_matrix * const G, const TRIANGULARITY tri, + * the coefficient matrix in the triangular system */ +static void Jacobi_Iter( const sparse_matrix * const R, const TRIANGULARITY tri, const real * const Dinv, const unsigned int n, const real * const b, real * const x, const unsigned int maxiter ) { unsigned int i, iter = 0; - real *Dinv_b, *rp, *rp2, *rp3; + static real *Dinv_b, *rp, *rp2, *rp3; - if ( (Dinv_b = (real*) malloc(sizeof(real) * n)) == NULL - || (rp = (real*) malloc(sizeof(real) * n)) == NULL - || (rp2 = (real*) malloc(sizeof(real) * n)) == NULL ) + if ( Dinv_b == NULL ) { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit(INSUFFICIENT_SPACE); + if ( (Dinv_b = (real*) malloc(sizeof(real) * n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit(INSUFFICIENT_SPACE); + } + } + if ( rp == NULL ) + { + if ( (rp = (real*) malloc(sizeof(real) * n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit(INSUFFICIENT_SPACE); + } + } + if ( rp2 == NULL ) + { + if ( (rp2 = (real*) malloc(sizeof(real) * n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit(INSUFFICIENT_SPACE); + } } Vector_MakeZero( rp, n ); @@ -266,10 +297,8 @@ static void Jacobi_Iter( const sparse_matrix * const G, const TRIANGULARITY tri, do { - // Issue: G = I - D^{-1}R, NOT R // x_{k+1} = G*x_{k} + Dinv*b; - Sparse_MatVec2( (sparse_matrix*)G, tri, Dinv, rp, rp2 ); - Vector_Add2( rp2, Dinv_b, n ); + Sparse_MatVec_Vector_Add( (sparse_matrix*)R, tri, Dinv, rp, rp2, Dinv_b ); rp3 = rp; rp = rp2; rp2 = rp3; @@ -278,17 +307,13 @@ static void Jacobi_Iter( const sparse_matrix * const G, const TRIANGULARITY tri, while ( iter < maxiter ); Vector_Copy( x, rp, n ); - - free(rp2); - free(rp); - free(Dinv_b); } /* generalized minimual residual iterative solver for sparse linear systems, * diagonal preconditioner */ int GMRES( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, real *time ) + real *b, real tol, real *x, FILE *fout, real *time, real *spmv_time ) { int i, j, k, itr, N; real cc, tmp1, tmp2, temp, bnorm; @@ -308,7 +333,11 @@ int GMRES( static_storage *workspace, sparse_matrix *H, for ( itr = 0; itr < MAX_ITR; ++itr ) { /* calculate r0 */ + gettimeofday( &start, NULL ); Sparse_MatVec( H, x, workspace->b_prm ); + gettimeofday( &stop, NULL ); + *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0) + - (start.tv_sec + start.tv_usec / 1000000.0); for ( i = 0; i < N; ++i ) workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ @@ -321,7 +350,11 @@ int GMRES( static_storage *workspace, sparse_matrix *H, for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { /* matvec */ + gettimeofday( &start, NULL ); Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); + gettimeofday( &stop, NULL ); + *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0) + - (start.tv_sec + start.tv_usec / 1000000.0); /*pre-conditioner*/ gettimeofday( &start, NULL ); for ( k = 0; k < N; ++k ) @@ -615,7 +648,7 @@ int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, * with preconditioner using factors LU \approx H * and forward / backward substitution */ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, real *time ) + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, real *time, real *spmv_time ) { int i, j, k, itr, N; real cc, tmp1, tmp2, temp, bnorm; @@ -628,7 +661,11 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, for ( itr = 0; itr < MAX_ITR; ++itr ) { /* calculate r0 */ + gettimeofday( &start, NULL ); Sparse_MatVec( H, x, workspace->b_prm ); + gettimeofday( &stop, NULL ); + *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0) + - (start.tv_sec + start.tv_usec / 1000000.0); Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); gettimeofday( &start, NULL ); Forward_Subs( L, workspace->v[0], workspace->v[0] ); @@ -644,7 +681,11 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { /* matvec */ + gettimeofday( &start, NULL ); Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); + gettimeofday( &stop, NULL ); + *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0) + - (start.tv_sec + start.tv_usec / 1000000.0); gettimeofday( &start, NULL ); Forward_Subs( L, workspace->v[j + 1], workspace->v[j + 1] ); Backward_Subs( U, workspace->v[j + 1], workspace->v[j + 1] ); @@ -750,7 +791,7 @@ int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, * with preconditioner using factors LU \approx H * and Jacobi iteration for approximate factor application */ int PGMRES_Jacobi( static_storage *workspace, sparse_matrix *H, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, real *time ) + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, real *time, real *spmv_time ) { int i, j, k, itr, N, si; real cc, tmp1, tmp2, temp, bnorm; @@ -787,7 +828,11 @@ int PGMRES_Jacobi( static_storage *workspace, sparse_matrix *H, real *b, real to for ( itr = 0; itr < MAX_ITR; ++itr ) { /* calculate r0 */ + gettimeofday( &start, NULL ); Sparse_MatVec( H, x, workspace->b_prm ); + gettimeofday( &stop, NULL ); + *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0) + - (start.tv_sec + start.tv_usec / 1000000.0); Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); // TODO: add parameters to config file gettimeofday( &start, NULL ); @@ -804,7 +849,11 @@ int PGMRES_Jacobi( static_storage *workspace, sparse_matrix *H, real *b, real to for ( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { /* matvec */ + gettimeofday( &start, NULL ); Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); + gettimeofday( &stop, NULL ); + *spmv_time += (stop.tv_sec + stop.tv_usec / 1000000.0) + - (start.tv_sec + start.tv_usec / 1000000.0); // TODO: add parameters to config file gettimeofday( &start, NULL ); Jacobi_Iter( L, LOWER, Dinv_L, N, workspace->v[j + 1], workspace->v[j + 1], 50 ); diff --git a/puremd_rc_1003/sPuReMD/GMRES.h b/puremd_rc_1003/sPuReMD/GMRES.h index 2a48525b..79618029 100644 --- a/puremd_rc_1003/sPuReMD/GMRES.h +++ b/puremd_rc_1003/sPuReMD/GMRES.h @@ -33,16 +33,16 @@ typedef enum } TRIANGULARITY; int GMRES( static_storage*, sparse_matrix*, - real*, real, real*, FILE*, real* ); + real*, real, real*, FILE*, real*, real* ); int GMRES_HouseHolder( static_storage*, sparse_matrix*, real*, real, real*, FILE* ); int PGMRES( static_storage*, sparse_matrix*, real*, real, - sparse_matrix*, sparse_matrix*, real*, FILE*, real* ); + sparse_matrix*, sparse_matrix*, real*, FILE*, real*, real* ); int PGMRES_Jacobi( static_storage*, sparse_matrix*, real*, real, - sparse_matrix*, sparse_matrix*, real*, FILE*, real* ); + sparse_matrix*, sparse_matrix*, real*, FILE*, real*, real* ); int PCG( static_storage*, sparse_matrix*, real*, real, sparse_matrix*, sparse_matrix*, real*, FILE* ); diff --git a/puremd_rc_1003/sPuReMD/init_md.c b/puremd_rc_1003/sPuReMD/init_md.c index 56610f76..aaa1a61f 100644 --- a/puremd_rc_1003/sPuReMD/init_md.c +++ b/puremd_rc_1003/sPuReMD/init_md.c @@ -213,6 +213,7 @@ void Init_Simulation_Data( reax_system *system, control_params *control, data->timing.matvecs = 0; data->timing.pre_comp = ZERO; data->timing.pre_app = ZERO; + data->timing.spmv = ZERO; } @@ -490,9 +491,9 @@ void Init_Out_Controls(reax_system *system, control_params *control, strcpy( temp, control->sim_name ); strcat( temp, ".log" ); out_control->log = fopen( temp, "w" ); - fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n", + fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s%10s%10s%10s\n", "step", "total", "neighbors", "init", "bonded", - "nonbonded", "QEq", "matvec", "pre comp", "pre app" ); + "nonbonded", "QEq", "matvec", "pre comp", "pre app", "spmv" ); } /* Init pressure file */ diff --git a/puremd_rc_1003/sPuReMD/mytypes.h b/puremd_rc_1003/sPuReMD/mytypes.h index fefe6e6f..32d95d6e 100644 --- a/puremd_rc_1003/sPuReMD/mytypes.h +++ b/puremd_rc_1003/sPuReMD/mytypes.h @@ -505,9 +505,10 @@ typedef struct real bonded; real nonb; real QEq; - int matvecs; - real pre_comp; - real pre_app; + int matvecs; + real pre_comp; + real pre_app; + real spmv; } reax_timing; diff --git a/puremd_rc_1003/sPuReMD/print_utils.c b/puremd_rc_1003/sPuReMD/print_utils.c index 4f8f25a4..710ee568 100644 --- a/puremd_rc_1003/sPuReMD/print_utils.c +++ b/puremd_rc_1003/sPuReMD/print_utils.c @@ -631,7 +631,7 @@ void Output_Results( reax_system *system, control_params *control, f_update = 1; else f_update = out_control->energy_update_freq; - fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.6f%10.6f\n", + fprintf( out_control->log, "%6d%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.2f%10.6f%10.6f%10.6f\n", data->step, t_elapsed / f_update, data->timing.nbrs / f_update, data->timing.init_forces / f_update, @@ -640,7 +640,8 @@ void Output_Results( reax_system *system, control_params *control, data->timing.QEq / f_update, (double)data->timing.matvecs / f_update, data->timing.pre_comp / f_update, - data->timing.pre_app / f_update ); + data->timing.pre_app / f_update, + data->timing.spmv / f_update ); data->timing.total = Get_Time( ); data->timing.nbrs = 0; @@ -651,6 +652,7 @@ void Output_Results( reax_system *system, control_params *control, data->timing.matvecs = 0; data->timing.pre_comp = ZERO; data->timing.pre_app = ZERO; + data->timing.spmv = ZERO; fflush( out_control->out ); fflush( out_control->pot ); diff --git a/puremd_rc_1003/sPuReMD/vector.c b/puremd_rc_1003/sPuReMD/vector.c index b5eef5c6..bdbe19da 100644 --- a/puremd_rc_1003/sPuReMD/vector.c +++ b/puremd_rc_1003/sPuReMD/vector.c @@ -94,17 +94,6 @@ inline void Vector_Add( real * const dest, const real c, const real * const v, c } -inline void Vector_Add2( real * const dest, const real * const v, const unsigned int k ) -{ - unsigned int i; - - for ( i = 0; i < k; ++i ) - { - dest[i] += v[i]; - } -} - - void Vector_Print( FILE * const fout, const char * const vname, const real * const v, const unsigned int k ) { diff --git a/puremd_rc_1003/sPuReMD/vector.h b/puremd_rc_1003/sPuReMD/vector.h index 68fd4892..98ba7dd8 100644 --- a/puremd_rc_1003/sPuReMD/vector.h +++ b/puremd_rc_1003/sPuReMD/vector.h @@ -30,7 +30,6 @@ void Vector_Copy( real * const, const real * const, const unsigned int ); void Vector_Scale( real * const, const real, const real * const, const unsigned int ); void Vector_Sum( real * const, const real, const real * const, const real, const real * const, const unsigned int ); void Vector_Add( real * const, const real, const real * const, const unsigned int ); -void Vector_Add2( real * const, const real * const, const unsigned int ); void Vector_Print( FILE * const, const char * const, const real * const, const unsigned int ); real Dot( const real * const, const real * const, const unsigned int ); real Norm( const real * const, const unsigned int ); -- GitLab