diff --git a/sPuReMD/src/analyze.c b/sPuReMD/src/analyze.c index 4e2cbe2cfa6f151fa1014f4544c08d476298d0e9..9fcd54f757bdb0682cb3a327bd9d8770124c9354 100644 --- a/sPuReMD/src/analyze.c +++ b/sPuReMD/src/analyze.c @@ -191,7 +191,8 @@ void Get_Molecule( int atom, molecule *m, int *mark, reax_system *system, } -void Print_Molecule( reax_system *system, molecule *m, int mode, char *s ) +void Print_Molecule( reax_system *system, molecule *m, int mode, char *s, + size_t size ) { int j, atom; @@ -202,7 +203,7 @@ void Print_Molecule( reax_system *system, molecule *m, int mode, char *s ) /* print molecule summary */ for ( j = 0; j < MAX_ATOM_TYPES; ++j ) if ( m->mtypes[j] ) - sprintf( s, "%s%s%d", s, system->reaxprm.sbp[j].name, m->mtypes[j] ); + snprintf( s, size, "%s%s%d", s, system->reaxprm.sbp[j].name, m->mtypes[j] ); } else if ( mode == 2 ) { @@ -210,7 +211,7 @@ void Print_Molecule( reax_system *system, molecule *m, int mode, char *s ) for ( j = 0; j < m->atom_count; ++j ) { atom = m->atom_list[j]; - sprintf( s, "%s%s(%d)", + snprintf( s, size, "%s%s(%d)", s, system->reaxprm.sbp[ system->atoms[atom].type ].name, atom ); } } @@ -288,7 +289,8 @@ void Analyze_Molecules( reax_system *system, control_params *control, fprintf( fout, "old molecules: " ); for ( i = 0; i < num_old; ++i ) { - Print_Molecule( system, &old_molecules[i], 1, &s[0] ); + Print_Molecule( system, &old_molecules[i], 1, &s[0], + MAX_MOLECULE_SIZE * 10 ); fprintf( fout, "%s\t", s ); } fprintf( fout, "\n" ); @@ -296,7 +298,8 @@ void Analyze_Molecules( reax_system *system, control_params *control, fprintf( fout, "new molecules: " ); for ( i = 0; i < num_new; ++i ) { - Print_Molecule( system, &new_molecules[i], 1, &s[0] ); + Print_Molecule( system, &new_molecules[i], 1, &s[0], + MAX_MOLECULE_SIZE * 10 ); fprintf( fout, "%s\t", s ); } fprintf( fout, "\n" ); @@ -528,15 +531,15 @@ void Visit_Bonds( int atom, int *mark, int *type, reax_system *system, void Analyze_Fragments( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - reax_list **lists, FILE *fout, int ignore ) + simulation_data *data, static_storage *workspace, + reax_list **lists, FILE *fout, int ignore ) { - int atom, i, flag; - int *mark = workspace->mark; - int num_fragments, num_fragment_types; + int atom, i, flag; + int *mark = workspace->mark; + int num_fragments, num_fragment_types; char fragment[MAX_ATOM_TYPES]; char fragments[MAX_FRAGMENT_TYPES][MAX_ATOM_TYPES]; - int fragment_count[MAX_FRAGMENT_TYPES]; + int fragment_count[MAX_FRAGMENT_TYPES]; molecule m; reax_list *new_bonds = (*lists) + BONDS; //reax_list *old_bonds = (*lists) + OLD_BONDS; @@ -554,7 +557,7 @@ void Analyze_Fragments( reax_system *system, control_params *control, memset( m.mtypes, 0, MAX_ATOM_TYPES * sizeof(int) ); Visit_Bonds( atom, mark, m.mtypes, system, control, new_bonds, ignore ); ++num_fragments; - Print_Molecule( system, &m, 1, fragment ); + Print_Molecule( system, &m, 1, fragment, MAX_FRAGMENT_TYPES ); /* check if a similar fragment already exists */ flag = 0; diff --git a/sPuReMD/src/charges.c b/sPuReMD/src/charges.c index 9276c82d5007d707c84dcd10906f94e58c5f4ae9..497da262d05d1235c29650b7424b4704383cfa96 100644 --- a/sPuReMD/src/charges.c +++ b/sPuReMD/src/charges.c @@ -203,9 +203,14 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, break; case SAI_PC: - //TODO: call function +#if defined(HAVE_LAPACK) data->timing.cm_solver_pre_comp += - SAI_PAR( Hptr, HSPptr, H_AppInv ); + Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + &workspace->H_app_inv ); +#else + fprintf( stderr, "LAPACK support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); +#endif break; default: @@ -215,18 +220,22 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, } #if defined(DEBUG) +#define SIZE (1000) + char fname[SIZE]; + if ( control->cm_solver_pre_comp_type != NONE_PC && control->cm_solver_pre_comp_type != DIAG_PC ) { fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) - sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname, NULL ); - sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname, NULL ); #endif } +#undef SIZE #endif } @@ -455,22 +464,26 @@ static void Compute_Preconditioner_QEq( const reax_system * const system, // } // //#if defined(DEBUG) +//#define SIZE (1000) +// char fname[SIZE]; +// // if ( control->cm_solver_pre_comp_type != DIAG_PC ) // { // fprintf( stderr, "condest = %f\n", condest(workspace->L) ); // //#if defined(DEBUG_FOCUS) -// sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); +// snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); // Print_Sparse_Matrix2( workspace->L, fname, NULL ); -// sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); +// snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); // Print_Sparse_Matrix2( workspace->U, fname, NULL ); // // fprintf( stderr, "icholt-" ); -// sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); +// snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); // Print_Sparse_Matrix2( workspace->L, fname, NULL ); // Print_Sparse_Matrix( U ); //#endif // } +//#undef SIZE //#endif //} @@ -553,9 +566,14 @@ static void Compute_Preconditioner_EE( const reax_system * const system, break; case SAI_PC: - //TODO: call function +#if defined(HAVE_LAPACK) data->timing.cm_solver_pre_comp += - SAI_PAR( Hptr, HSPptr, H_AppInv ); + Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + &workspace->H_app_inv ); +#else + fprintf( stderr, "LAPACK support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); +#endif break; default: @@ -567,18 +585,22 @@ static void Compute_Preconditioner_EE( const reax_system * const system, Hptr->val[Hptr->start[system->N + 1] - 1] = 0.0; #if defined(DEBUG) +#define SIZE (1000) + char fname[SIZE]; + if ( control->cm_solver_pre_comp_type != NONE_PC && control->cm_solver_pre_comp_type != DIAG_PC ) { fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) - sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname, NULL ); - sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname, NULL ); #endif } +#undef SIZE #endif } @@ -662,9 +684,14 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, break; case SAI_PC: - //TODO: call function +#if defined(HAVE_LAPACK) data->timing.cm_solver_pre_comp += - SAI_PAR( Hptr, HSPptr, H_AppInv ); + Sparse_Approx_Inverse( Hptr, workspace->H_spar_patt, + &workspace->H_app_inv ); +#else + fprintf( stderr, "LAPACK support disabled. Re-compile before enabling. Terminating...\n" ); + exit( INVALID_INPUT ); +#endif break; default: @@ -677,18 +704,22 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system, Hptr->val[Hptr->start[system->N_cm] - 1] = 0.0; #if defined(DEBUG) +#define SIZE (1000) + char fname[SIZE]; + if ( control->cm_solver_pre_comp_type != NONE_PC || control->cm_solver_pre_comp_type != DIAG_PC ) { fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) - sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname, NULL ); - sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); + snprintf( fname, SIZE + 10, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname, NULL ); #endif } +#undef SIZE #endif } @@ -830,8 +861,8 @@ static void Setup_Preconditioner_QEq( const reax_system * const system, break; case SAI_PC: - //TODO: call setup function - Setup_Sparsity_Pattern( Hptr, workspace->saifilter, HSPptr ); + Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, + workspace->H_spar_patt ); break; default: @@ -982,8 +1013,8 @@ static void Setup_Preconditioner_EE( const reax_system * const system, break; case SAI_PC: - //TODO: call setup function - Setup_Sparsity_Pattern( Hptr, workspace->saifilter, HSPptr ); + Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, + workspace->H_spar_patt ); break; default: @@ -1136,8 +1167,8 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system, break; case SAI_PC: - //TODO: call setup function - Setup_Sparsity_Pattern( Hptr, workspace->saifilter, HSPptr ); + Setup_Sparsity_Pattern( Hptr, control->cm_solver_pre_comp_sai_thres, + workspace->H_spar_patt ); break; default: @@ -1437,17 +1468,18 @@ void Compute_Charges( reax_system * const system, control_params * const control const reax_list * const far_nbrs, const output_controls * const out_control ) { #if defined(DEBUG_FOCUS) - char fname[200]; +#define SIZE (200) + char fname[SIZE]; FILE * fp; if ( data->step >= 100 ) { - sprintf( fname, "s_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "s_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->s[0], system->N_cm ); fclose( fp ); - sprintf( fname, "t_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "t_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->t[0], system->N_cm ); fclose( fp ); @@ -1477,19 +1509,20 @@ void Compute_Charges( reax_system * const system, control_params * const control #if defined(DEBUG_FOCUS) if ( data->step >= 100 ) { - sprintf( fname, "H_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "H_%d_%s.out", data->step, control->sim_name ); Print_Sparse_Matrix2( workspace->H, fname, NULL ); // Print_Sparse_Matrix_Binary( workspace->H, fname ); - sprintf( fname, "b_s_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "b_s_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->b_s, system->N_cm ); fclose( fp ); - sprintf( fname, "b_t_%d_%s.out", data->step, control->sim_name ); + snprintf( fname, SIZE + 11, "b_t_%d_%s.out", data->step, control->sim_name ); fp = fopen( fname, "w" ); Vector_Print( fp, NULL, workspace->b_t, system->N_cm ); fclose( fp ); } +#undef SIZE #endif } diff --git a/sPuReMD/src/control.c b/sPuReMD/src/control.c index ead90358b818f6428750e90f646534fb8f825d70..4f5616ab0d983290d219c57d92544a752d1dcba2 100644 --- a/sPuReMD/src/control.c +++ b/sPuReMD/src/control.c @@ -79,6 +79,7 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control, control->cm_domain_sparsity = 1.0; control->cm_solver_pre_comp_type = ICHOLT_PC; control->cm_solver_pre_comp_sweeps = 3; + control->cm_solver_pre_comp_sai_thres = 0.1; control->cm_solver_pre_comp_refactor = 100; control->cm_solver_pre_comp_droptol = 0.01; control->cm_solver_pre_app_type = TRI_SOLVE_PA; @@ -305,6 +306,11 @@ char Read_Control_File( FILE* fp, reax_system *system, control_params* control, ival = atoi( tmp[1] ); control->cm_solver_pre_comp_sweeps = ival; } + else if ( strcmp(tmp[0], "cm_solver_pre_comp_sai_thres") == 0 ) + { + val = atof( tmp[1] ); + control->cm_solver_pre_comp_sai_thres = val; + } else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 ) { ival = atoi( tmp[1] ); diff --git a/sPuReMD/src/geo_tools.c b/sPuReMD/src/geo_tools.c index 48eb0dbba811832e1797750181f2bd4595beea45..fcde21be8b93a72299b3b7134978d05cef42d83c 100644 --- a/sPuReMD/src/geo_tools.c +++ b/sPuReMD/src/geo_tools.c @@ -482,7 +482,7 @@ char Write_PDB( reax_system* system, reax_list* bonds, simulation_data *data, (system->box.box_norms[2] * system->box.box_norms[1]) ); /*open pdb and write header*/ - sprintf( fname, "%s-%d.pdb", control->sim_name, data->step ); + snprintf( fname, MAX_STR + 9, "%s-%d.pdb", control->sim_name, data->step ); pdb = fopen(fname, "w"); fprintf( pdb, PDB_CRYST1_FORMAT_O, "CRYST1", @@ -498,7 +498,7 @@ char Write_PDB( reax_system* system, reax_list* bonds, simulation_data *data, p_atom = &(system->atoms[i]); strncpy(name, p_atom->name, 8); Trim_Spaces(name); - sprintf( line, PDB_ATOM_FORMAT_O, + snprintf( line, MAX_STR, PDB_ATOM_FORMAT_O, "ATOM ", workspace->orig_id[i], p_atom->name, ' ', "REX", ' ', 1, ' ', p_atom->x[0], p_atom->x[1], p_atom->x[2], 1.0, 0.0, "0", name, " " ); diff --git a/sPuReMD/src/init_md.c b/sPuReMD/src/init_md.c index d4175d494337f5864b57870cb09918f373002db7..fc9c46bac8d1888658aab60b2b436e969ea311f8 100644 --- a/sPuReMD/src/init_md.c +++ b/sPuReMD/src/init_md.c @@ -638,7 +638,8 @@ void Init_Lists( reax_system *system, control_params *control, void Init_Out_Controls( reax_system *system, control_params *control, static_storage *workspace, output_controls *out_control ) { - char temp[1000]; +#define TEMP_SIZE (1000) + char temp[TEMP_SIZE]; /* Init trajectory file */ if ( out_control->write_steps > 0 ) @@ -697,11 +698,11 @@ void Init_Out_Controls( reax_system *system, control_params *control, /* Init molecular analysis file */ if ( control->molec_anal ) { - sprintf( temp, "%s.mol", control->sim_name ); + snprintf( temp, TEMP_SIZE + 4, "%s.mol", control->sim_name ); out_control->mol = fopen( temp, "w" ); if ( control->num_ignored ) { - sprintf( temp, "%s.ign", control->sim_name ); + snprintf( temp, TEMP_SIZE + 4, "%s.ign", control->sim_name ); out_control->ign = fopen( temp, "w" ); } } @@ -858,6 +859,8 @@ void Init_Out_Controls( reax_system *system, control_params *control, fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." ); exit( CANNOT_OPEN_FILE ); }*/ + +#undef TEMP_SIZE } diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c index bb17a65bb31204a3fcad78e24d7175d3fd4d1c9c..2b3bc691e1b01f1784837b56a5a9230c44539a68 100644 --- a/sPuReMD/src/lin_alg.c +++ b/sPuReMD/src/lin_alg.c @@ -24,7 +24,16 @@ #include "allocate.h" #include "tool_box.h" #include "vector.h" + +#if defined(HAVE_LAPACK) +/* Intel MKL */ +#if defined(HAVE_LAPACK_MKL) #include "mkl_lapacke.h" +/* reference LAPACK */ +#else +#include "lapacke.h" +#endif +#endif typedef struct { @@ -176,29 +185,102 @@ void Sort_Matrix_Rows( sparse_matrix * const A ) } -void Setup_Sparsity_Pattern(const sparse_matrix * const A, - real * const saifilter, sparse_matrix * A_SP) +/* Convert a symmetric, half-sored sparse matrix into + * a full-stored sparse matrix + * + * A: symmetric sparse matrix, lower half stored in CSR + * A_full: resultant full sparse matrix in CSR + * If A_full is NULL, allocate space, otherwise do not + * + * Assumptions: + * A has non-zero diagonals + * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ +static void compute_full_sparse_matrix( const sparse_matrix * const A, + sparse_matrix ** A_full ) +{ + int count, i, pj; + sparse_matrix *A_t; + + if ( *A_full == NULL ) + { + if ( Allocate_Matrix( A_full, A->n, 2 * A->m - A->n ) == FAILURE ) + { + fprintf( stderr, "not enough memory for full A. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + if ( Allocate_Matrix( &A_t, A->n, A->m ) == FAILURE ) + { + fprintf( stderr, "not enough memory for full A. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + + /* Set up the sparse matrix data structure for A. */ + Transpose( A, A_t ); + + count = 0; + for ( i = 0; i < A->n; ++i ) + { + (*A_full)->start[i] = count; + + /* A: symmetric, lower triangular portion only stored */ + for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) + { + (*A_full)->val[count] = A->val[pj]; + (*A_full)->j[count] = A->j[pj]; + ++count; + } + /* A^T: symmetric, upper triangular portion only stored; + * skip diagonal from A^T, as included from A above */ + for ( pj = A_t->start[i] + 1; pj < A_t->start[i + 1]; ++pj ) + { + (*A_full)->val[count] = A_t->val[pj]; + (*A_full)->j[count] = A_t->j[pj]; + ++count; + } + } + (*A_full)->start[i] = count; + + Deallocate_Matrix( A_t ); +} + + +/* Setup routines for sparse approximate inverse preconditioner + * + * A: symmetric sparse matrix, lower half stored in CSR + * filter: + * A_spar_patt: + * + * Assumptions: + * A has non-zero diagonals + * Each row of A has at least one non-zero (i.e., no rows with all zeros) */ +void Setup_Sparsity_Pattern( const sparse_matrix * const A, + const real filter, sparse_matrix * A_spar_patt ) { int i, pj, size; - real mn, mx, threshold, val; + real min, max, threshold, val; + + min = 0.0; + max = 0.0; - if( A_SP == NULL ) + if ( A_spar_patt == NULL ) { - if( Allocate_Matrix( &A_SP, A->n, A->m ) == NULL ) + if ( Allocate_Matrix( &A_spar_patt, A->n, A->m ) == FAILURE ) { fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); exit( INSUFFICIENT_MEMORY ); } } - else if( (A_SP->m) < (A->m) ) + else if ( (A_spar_patt->m) < (A->m) ) { - Deallocate_Matrix( A_SP ); - if( Allocate_Matrix( &A_SP, A->n, A->m ) == NULL ) + Deallocate_Matrix( A_spar_patt ); + if ( Allocate_Matrix( &A_spar_patt, A->n, A->m ) == FAILURE ) { fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); exit( INSUFFICIENT_MEMORY ); } } + // find min and max element of the matrix for ( i = 0; i < A->n; ++i ) { @@ -207,58 +289,63 @@ void Setup_Sparsity_Pattern(const sparse_matrix * const A, val = A->val[pj]; if ( pj == 0 ) { - mn = mx = val; + min = val; + max = val; } else { - if ( mn > val ) - mn = val; - if ( mx < val ) - mx = val; + if ( min > val ) + { + min = val; + } + if ( max < val ) + { + max = val; + } } } } - threshold = mn + ( mx - mn ) * saifilter; + threshold = min + ( max - min ) * filter; // calculate the nnz of the sparsity pattern - /* for ( size = 0, i = 0; i < A->n; ++i ) - { - for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) - { - if ( threshold <= A->val[pj] ) - size++; - } - }*/ - - /*if( Allocate_Matrix( &A_SP, A->n, size ) == NULL ) - { - fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - }*/ - - //A_SP->start[A_SP->n] = size; +// for ( size = 0, i = 0; i < A->n; ++i ) +// { +// for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) +// { +// if ( threshold <= A->val[pj] ) +// size++; +// } +// } +// +// if ( Allocate_Matrix( &A_spar_patt, A->n, size ) == NULL ) +// { +// fprintf( stderr, "[SAI] Not enough memory for preconditioning matrices. terminating.\n" ); +// exit( INSUFFICIENT_MEMORY ); +// } + + //A_spar_patt->start[A_spar_patt->n] = size; // fill the sparsity pattern for ( size = 0, i = 0; i < A->n; ++i ) { - A_SP->start[i] = size; + A_spar_patt->start[i] = size; for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) { if ( threshold <= A->val[pj] ) { - A_SP->val[size] = A->val[pj]; - A_SP->j[size] = A->j[pj]; + A_spar_patt->val[size] = A->val[pj]; + A_spar_patt->j[size] = A->j[pj]; size++; } } } - A_SP->start[A->n] = size; + A_spar_patt->start[A->n] = size; } void Calculate_Droptol( const sparse_matrix * const A, - real * const droptol, const real dtol ) + real * const droptol, const real dtol ) { int i, j, k; real val; @@ -268,13 +355,13 @@ void Calculate_Droptol( const sparse_matrix * const A, #endif #ifdef _OPENMP -#pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) + #pragma omp parallel default(none) private(i, j, k, val, tid), shared(droptol_local, stderr) #endif { #ifdef _OPENMP tid = omp_get_thread_num(); -#pragma omp master + #pragma omp master { /* keep b_local for program duration to avoid allocate/free * overhead per Sparse_MatVec call*/ @@ -288,7 +375,7 @@ void Calculate_Droptol( const sparse_matrix * const A, } } -#pragma omp barrier + #pragma omp barrier #endif /* init droptol to 0 */ @@ -302,12 +389,12 @@ void Calculate_Droptol( const sparse_matrix * const A, } #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier #endif /* calculate sqaure of the norm of each row */ #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -335,9 +422,9 @@ void Calculate_Droptol( const sparse_matrix * const A, } #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier -#pragma omp for schedule(static) + #pragma omp for schedule(static) for ( i = 0; i < A->n; ++i ) { droptol[i] = 0.0; @@ -347,13 +434,13 @@ void Calculate_Droptol( const sparse_matrix * const A, } } -#pragma omp barrier + #pragma omp barrier #endif /* calculate local droptol for each row */ //fprintf( stderr, "droptol: " ); #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -375,7 +462,7 @@ int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol fillin = 0; #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) private(i, pj, val) reduction(+: fillin) #endif for ( i = 0; i < A->n; ++i ) @@ -397,7 +484,7 @@ int Estimate_LU_Fill( const sparse_matrix * const A, const real * const droptol #if defined(HAVE_SUPERLU_MT) real SuperLU_Factorize( const sparse_matrix * const A, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, pj, count, *Ltop, *Utop, r; sparse_matrix *A_t; @@ -426,10 +513,10 @@ real SuperLU_Factorize( const sparse_matrix * const A, /* Default parameters to control factorization. */ #ifdef _OPENMP //TODO: set as global parameter and use -#pragma omp parallel \ + #pragma omp parallel \ default(none) shared(nprocs) { -#pragma omp master + #pragma omp master { /* SuperLU_MT spawns threads internally, so set and pass parameter */ nprocs = omp_get_num_threads(); @@ -452,11 +539,11 @@ real SuperLU_Factorize( const sparse_matrix * const A, work = NULL; lwork = 0; - //#if defined(DEBUG) +#if defined(DEBUG) fprintf( stderr, "nprocs = %d\n", nprocs ); fprintf( stderr, "Panel size = %d\n", panel_size ); fprintf( stderr, "Relax = %d\n", relax ); - //#endif +#endif if ( !(perm_r = intMalloc(A->n)) ) { @@ -516,7 +603,7 @@ real SuperLU_Factorize( const sparse_matrix * const A, xa[i] = count; dCompRow_to_CompCol( A->n, A->n, 2 * A->start[A->n] - A->n, a, asub, xa, - &at, &atsub, &xat ); + &at, &atsub, &xat ); for ( i = 0; i < (2 * A->start[A->n] - A->n); ++i ) fprintf( stderr, "%6d", asub[i] ); @@ -571,8 +658,8 @@ real SuperLU_Factorize( const sparse_matrix * const A, Apply perm_c to the columns of original A to form AC. ------------------------------------------------------------*/ pdgstrf_init( nprocs, fact, trans, refact, panel_size, relax, - u, usepr, drop_tol, perm_c, perm_r, - work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); + u, usepr, drop_tol, perm_c, perm_r, + work, lwork, &A_S, &AC_S, &superlumt_options, &Gstat ); for ( i = 0; i < ((NCPformat*)AC_S.Store)->nnz; ++i ) fprintf( stderr, "%6.1f", ((real*)(((NCPformat*)AC_S.Store)->nzval))[i] ); @@ -593,14 +680,14 @@ real SuperLU_Factorize( const sparse_matrix * const A, } Gstat.ops[FACT] = flopcnt; - //#if defined(DEBUG) +#if defined(DEBUG) printf("\n** Result of sparse LU **\n"); L_S_store = (SCPformat *) L_S.Store; U_S_store = (NCPformat *) U_S.Store; printf( "No of nonzeros in factor L = " IFMT "\n", L_S_store->nnz ); printf( "No of nonzeros in factor U = " IFMT "\n", U_S_store->nnz ); fflush( stdout ); - //#endif +#endif /* convert L and R from SuperLU formats to CSR */ memset( Ltop, 0, (A->n + 1) * sizeof(int) ); @@ -702,7 +789,7 @@ real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) start = Get_Time( ); #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) private(i) #endif for ( i = 0; i < H->n; ++i ) @@ -723,7 +810,7 @@ real diag_pre_comp( const sparse_matrix * const H, real * const Hdia_inv ) /* Incomplete Cholesky factorization with dual thresholding */ real ICHOLT( const sparse_matrix * const A, const real * const droptol, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { int *tmp_j; real *tmp_val; @@ -876,7 +963,7 @@ real ICHOLT( const sparse_matrix * const A, const real * const droptol, * SIAM J. Sci. Comp. */ #if defined(TESTING) real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const U_t, sparse_matrix * const U ) + sparse_matrix * const U_t, sparse_matrix * const U ) { unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y; real *D, *D_inv, sum, start; @@ -895,7 +982,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(D_inv, D) private(i) #endif for ( i = 0; i < A->n; ++i ) @@ -911,7 +998,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, * transformation DAD, where D = D(1./SQRT(D(A))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #ifdef _OPENMP -#pragma omp parallel for schedule(guided) \ + #pragma omp parallel for schedule(guided) \ default(none) shared(DAD, D_inv, D) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) @@ -937,7 +1024,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, stderr) private(sum, ei_x, ei_y, k) firstprivate(x, y) #endif for ( j = 0; j < A->start[A->n]; ++j ) @@ -992,7 +1079,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, fprintf( stderr, "Numeric breakdown in ICHOL_PAR. Terminating.\n"); #if defined(DEBUG_FOCUS) fprintf( stderr, "A(%5d,%5d) = %10.3f\n", - k - 1, A->entries[j].j, A->entries[j].val ); + k - 1, A->entries[j].j, A->entries[j].val ); fprintf( stderr, "sum = %10.3f\n", sum); #endif exit(NUMERIC_BREAKDOWN); @@ -1012,7 +1099,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, * since DAD \approx U^{T}U, so * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */ #ifdef _OPENMP -#pragma omp parallel for schedule(guided) \ + #pragma omp parallel for schedule(guided) \ default(none) shared(D_inv) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) @@ -1055,7 +1142,7 @@ real ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, * sweeps: number of loops over non-zeros for computation * L / U: factorized triangular matrices (A \approx LU), CSR format */ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, - sparse_matrix * const L, sparse_matrix * const U ) + sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, j, k, pj, x, y, ei_x, ei_y; real *D, *D_inv, sum, start; @@ -1072,7 +1159,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(D, D_inv) private(i) #endif for ( i = 0; i < A->n; ++i ) @@ -1086,7 +1173,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, * transformation DAD, where D = D(1./SQRT(abs(D(A)))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, D) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) @@ -1114,7 +1201,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, /* L has unit diagonal, by convention */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) default(none) private(i) + #pragma omp parallel for schedule(static) default(none) private(i) #endif for ( i = 0; i < A->n; ++i ) { @@ -1125,7 +1212,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, { /* for each nonzero in L */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) #endif for ( j = 0; j < DAD->start[DAD->n]; ++j ) @@ -1177,7 +1264,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, } #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD) private(j, k, x, y, ei_x, ei_y, sum) #endif for ( j = 0; j < DAD->start[DAD->n]; ++j ) @@ -1230,7 +1317,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, * since DAD \approx LU, then * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, D_inv) private(i, pj) #endif for ( i = 0; i < DAD->n; ++i ) @@ -1270,7 +1357,7 @@ real ILU_PAR( const sparse_matrix * const A, const unsigned int sweeps, * sweeps: number of loops over non-zeros for computation * L / U: factorized triangular matrices (A \approx LU), CSR format */ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, - const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) + const unsigned int sweeps, sparse_matrix * const L, sparse_matrix * const U ) { unsigned int i, j, k, pj, x, y, ei_x, ei_y, Ltop, Utop; real *D, *D_inv, sum, start; @@ -1294,7 +1381,7 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, } #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(D, D_inv) private(i) #endif for ( i = 0; i < A->n; ++i ) @@ -1307,7 +1394,7 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, * transformation DAD, where D = D(1./SQRT(D(A))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, D) private(i, pj) #endif for ( i = 0; i < A->n; ++i ) @@ -1335,7 +1422,7 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, /* L has unit diagonal, by convention */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) private(i) shared(L_temp) #endif for ( i = 0; i < A->n; ++i ) @@ -1347,7 +1434,7 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, { /* for each nonzero in L */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) #endif for ( j = 0; j < DAD->start[DAD->n]; ++j ) @@ -1399,7 +1486,7 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, } #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, L_temp, U_temp) private(j, k, x, y, ei_x, ei_y, sum) #endif for ( j = 0; j < DAD->start[DAD->n]; ++j ) @@ -1452,7 +1539,7 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, * since DAD \approx LU, then * D^{-1}DADD^{-1} = A \approx D^{-1}LUD^{-1} */ #ifdef _OPENMP -#pragma omp parallel for schedule(static) \ + #pragma omp parallel for schedule(static) \ default(none) shared(DAD, L_temp, U_temp, D_inv) private(i, pj) #endif for ( i = 0; i < DAD->n; ++i ) @@ -1523,59 +1610,65 @@ real ILUT_PAR( const sparse_matrix * const A, const real * droptol, return Get_Timing_Info( start ); } -real SAI_PAR( const sparse_matrix * const A, const sparse_matrix * const A_SP, - sparse_matrix * A_AppInv) + +#if defined(HAVE_LAPACK) +real Sparse_Approx_Inverse( const sparse_matrix * const A, + const sparse_matrix * const A_spar_patt, sparse_matrix ** A_app_inv ) { - int i, j, k, pj, j_temp, identity_pos; + int i, k, pj, j_temp, identity_pos; int N, M, d_i, d_j; +#if defined(HAVE_LAPACK_MKL) MKL_int m, n, nrhs, lda, ldb, info; - int * pos_i; - int * pos_j; +#else + int m, n, nrhs, lda, ldb, info; +#endif + int *pos_i, *pos_j; real start; - real * e_j; - real * dense_matrix; - sparse_matrix * A_full; - sparse_matrix * A_SP_full; - char * I; - char * J; + real *e_j, *dense_matrix; + sparse_matrix *A_full, *A_spar_patt_full; + char *I, *J; start = Get_Time( ); - - if( (I = (char *) malloc(sizeof(char) * A->n)) == NULL || - (J = (char *) malloc(sizeof(char) * A->n)) == NULL || - (pos_i = (int *) malloc(sizeof(int) * A->n)) == NULL || - (pos_j = (int *) malloc(sizeof(int) * A->n)) == NULL) + if ( (I = (char *) smalloc(sizeof(char) * A->n, "Sparse_Approx_Inverse::I")) == NULL || + (J = (char *) smalloc(sizeof(char) * A->n, "Sparse_Approx_Inverse::I")) == NULL || + (pos_i = (int *) smalloc(sizeof(int) * A->n, "Sparse_Approx_Inverse::I")) == NULL || + (pos_j = (int *) smalloc(sizeof(int) * A->n, "Sparse_Approx_Inverse::I")) == NULL ) { exit( INSUFFICIENT_MEMORY ); } - for( i = 0; i < A->n; ++i ) - I[i] = J[i] = pos_i[i] = pos_j[i] = 0; + for ( i = 0; i < A->n; ++i ) + { + I[i] = 0; + J[i] = 0; + pos_i[i] = 0; + pos_j[i] = 0; + } - // Get A and A_SP full matrices - compute_H_full_for_SAI( A, A_full ); - compute_H_full_for_SAI( A_SP, A_SP_full ); + // Get A and A_spar_patt full matrices + compute_full_sparse_matrix( A, &A_full ); + compute_full_sparse_matrix( A_spar_patt, &A_spar_patt_full ); - // A_AppInv will be the same as A_SP_full except the val array - if ( Allocate_Matrix( &A_AppInv, A_SP_full->n, A_SP_full->m ) == FAILURE ) + // A_app_inv will be the same as A_spar_patt_full except the val array + if ( Allocate_Matrix( A_app_inv, A_spar_patt_full->n, A_spar_patt_full->m ) == FAILURE ) { fprintf( stderr, "not enough memory for approximate inverse matrix. terminating.\n" ); exit( INSUFFICIENT_MEMORY ); } - A_AppInv->start[A_AppInv->n] = A_SP_full->start[A_SP_full->n]; + (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt_full->start[A_spar_patt_full->n]; - // For each row of full A_SP - for( i = 0; i < A_SP_full->n; ++i ) + // For each row of full A_spar_patt + for ( i = 0; i < A_spar_patt_full->n; ++i ) { - - // N = A_SP_full->start[i + 1] - A_SP_full->start[i]; - N = M = 0; + // N = A_spar_patt_full->start[i + 1] - A_spar_patt_full->start[i]; + N = 0; + M = 0; // find column indices of nonzeros (which will be the columns indices of the dense matrix) - for( pj = A_SP_full->start[i]; pj < A_SP_full->start[i + 1]; ++pj ) + for ( pj = A_spar_patt_full->start[i]; pj < A_spar_patt_full->start[i + 1]; ++pj ) { - j_temp = A_SP_full->j[pj]; + j_temp = A_spar_patt_full->j[pj]; J[j_temp] = 1; pos_j[j_temp] = N; @@ -1583,28 +1676,31 @@ real SAI_PAR( const sparse_matrix * const A, const sparse_matrix * const A_SP, // for each of those indices // search through the row of full A of that index - for( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k ) + for ( k = A_full->start[j_temp]; k < A_full->start[j_temp + 1]; ++k ) { // and accumulate the nonzero column indices to serve as the row indices of the dense matrix I[A_full->j[k]] = 1; } } - // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix + // enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix identity_pos = M; - for( k = 0; k < A_full->n; k++) + for ( k = 0; k < A_full->n; k++) { - if( I[k] != 0 ) + if ( I[k] != 0 ) { pos_i[M] = k; - if( k == i ) + if ( k == i ) + { identity_pos = M; + } ++M; } } // allocate memory for NxM dense matrix - if( (dense_matrix = (real *) malloc(sizeof(real) * N * M)) == NULL ) + if ( (dense_matrix = (real *) smalloc(sizeof(real) * N * M, + "Sparse_Approx_Inverse::dense_matrix")) == NULL ) { exit( INSUFFICIENT_MEMORY ); } @@ -1613,74 +1709,89 @@ real SAI_PAR( const sparse_matrix * const A, const sparse_matrix * const A_SP, for ( d_i = 0; d_i < M; ++d_i) { // all rows are initialized to zero - for( d_j = 0; d_j < N; ++d_j ) - dense_matrix[d_i][d_j] = 0.0; + for ( d_j = 0; d_j < N; ++d_j ) + { + dense_matrix[d_i * M + d_j] = 0.0; + } // change the value if any of the column indices is seen - for ( d_j = A_full->start_[pos_i[d_i]]; d_j < A_full->start[pos_i[d_i + 1]]; ++d_j) + for ( d_j = A_full->start[pos_i[d_i]]; + d_j < A_full->start[pos_i[d_i + 1]]; ++d_j ) { - if( J[A_full->j[d_j]] == 1 ) + if ( J[A_full->j[d_j]] == 1 ) { - dense_matrix[d_i][pos_j[d_j]] = A_full->val[d_j]; + dense_matrix[d_i * M + pos_j[d_j]] = A_full->val[d_j]; } } } - /* create the right hand side of the linear equation that is the full column of the identity matrix*/ - if( (e_j = (int *) malloc(sizeof(char) * M)) == NULL ) + if ( (e_j = (real *) smalloc(sizeof(char) * M, + "Sparse_Approx_Inverse::M")) == NULL ) { exit( INSUFFICIENT_MEMORY ); } - for( k = 0; k < M; ++k ) + for ( k = 0; k < M; ++k ) { - e_j[k] = 0; + e_j[k] = 0.0; } e_j[identity_pos] = 1.0; // call QR-decompostion from LAPACK - m = M, n = N, nrhs = 1, lda = N, ldb = 1; + m = M; + n = N; + nrhs = 1; + lda = N; + ldb = 1; /* Executable statements */ // printf( "LAPACKE_dgels (row-major, high-level) Example Program Results\n" ); /* Solve the equations A*X = B */ info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda, - e_j, ldb ); + e_j, ldb ); /* Check for the full rank */ - if( info > 0 ) { + if ( info > 0 ) + { fprintf( stderr, "The diagonal element %i of the triangular factor ", info ); fprintf( stderr, "of A is zero, so that A does not have full rank;\n" ); fprintf( stderr, "the least squares solution could not be computed.\n" ); - exit( 1 ); + exit( INVALID_INPUT ); } /* Print least squares solution */ // print_matrix( "Least squares solution", n, nrhs, b, ldb ); - // accumulate the resulting vector to build A_AppInv - A_AppInv->start[i] = A_SP_full->start[i]; - for( k = A_SP_full->start[i]; k < A_SP_full->start[i + 1]; ++k) + // accumulate the resulting vector to build A_app_inv + (*A_app_inv)->start[i] = A_spar_patt_full->start[i]; + for ( k = A_spar_patt_full->start[i]; k < A_spar_patt_full->start[i + 1]; ++k) { - A_Appinv->j[k] = A_SP_full->j[k]; - A_Appinv->val[k] = e_j[k - A_SP_full->start[i]]; + (*A_app_inv)->j[k] = A_spar_patt_full->j[k]; + (*A_app_inv)->val[k] = e_j[k - A_spar_patt_full->start[i]]; } //empty variables that will be used next iteration - realloc( dense_matrix, 0); - realloc( e_j, 0 ); - for( i = 0; i < A->n; ++i ) - I[i] = J[i] = pos_i[i] = pos_j[i] = 0; + srealloc( dense_matrix, 0, "Sparse_Approx_Inverse::dense_matrix" ); + srealloc( e_j, 0, "Sparse_Approx_Inverse::e_j" ); + for ( i = 0; i < A->n; ++i ) + { + I[i] = 0; + J[i] = 0; + pos_i[i] = 0; + pos_j[i] = 0; + } } // Deallocate? - Deallocate_Matrix ( A_full ); - Deallocate_Matrix ( A_SP_full ); - realloc( I, 0 ); - realloc( J, 0 ); - realloc( pos_i, 0 ); - realloc( pos_j, 0 ); + Deallocate_Matrix( A_full ); + Deallocate_Matrix( A_spar_patt_full ); + srealloc( I, 0, "Sparse_Approx_Inverse::I" ); + srealloc( J, 0, "Sparse_Approx_Inverse::J" ); + srealloc( pos_i, 0, "Sparse_Approx_Inverse::pos_i" ); + srealloc( pos_j, 0, "Sparse_Approx_Inverse::pos_j" ); return Get_Timing_Info( start ); } +#endif + /* sparse matrix-vector product Ax=b * where: @@ -1688,7 +1799,7 @@ real SAI_PAR( const sparse_matrix * const A, const sparse_matrix * const A_SP, * x: vector * b: vector (result) */ static void Sparse_MatVec( const sparse_matrix * const A, - const real * const x, real * const b ) + const real * const x, real * const b ) { int i, j, k, n, si, ei; real H; @@ -1702,7 +1813,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, #ifdef _OPENMP tid = omp_get_thread_num( ); -#pragma omp single + #pragma omp single { /* keep b_local for program duration to avoid allocate/free * overhead per Sparse_MatVec call*/ @@ -1717,7 +1828,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, Vector_MakeZero( (real * const)b_local, omp_get_num_threads() * n ); -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < n; ++i ) { @@ -1746,7 +1857,7 @@ static void Sparse_MatVec( const sparse_matrix * const A, } #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) for ( i = 0; i < n; ++i ) { for ( j = 0; j < omp_get_num_threads(); ++j ) @@ -1838,12 +1949,12 @@ void Transpose_I( sparse_matrix * const A ) * N: dimensions of preconditioner and vectors (# rows in H) */ static void diag_pre_app( const real * const Hdia_inv, const real * const y, - real * const x, const int N ) + real * const x, const int N ) { unsigned int i; #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < N; ++i ) { @@ -1864,13 +1975,13 @@ static void diag_pre_app( const real * const Hdia_inv, const real * const y, * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ void tri_solve( const sparse_matrix * const LU, const real * const y, - real * const x, const int N, const TRIANGULARITY tri ) + real * const x, const int N, const TRIANGULARITY tri ) { int i, pj, j, si, ei; real val; #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { if ( tri == LOWER ) @@ -1922,13 +2033,13 @@ void tri_solve( const sparse_matrix * const LU, const real * const y, * LU has non-zero diagonals * Each row of LU has at least one non-zero (i.e., no rows with all zeros) */ void tri_solve_level_sched( const sparse_matrix * const LU, - const real * const y, real * const x, const int N, - const TRIANGULARITY tri, int find_levels ) + const real * const y, real * const x, const int N, + const TRIANGULARITY tri, int find_levels ) { int i, j, pj, local_row, local_level; #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { if ( tri == LOWER ) @@ -2035,7 +2146,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, for ( i = 0; i < levels; ++i ) { #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) { @@ -2055,7 +2166,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, for ( i = 0; i < levels; ++i ) { #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( j = level_rows_cnt[i]; j < level_rows_cnt[i + 1]; ++j ) { @@ -2072,7 +2183,7 @@ void tri_solve_level_sched( const sparse_matrix * const LU, } #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { /* save level info for re-use if performing repeated triangular solves via preconditioning */ @@ -2120,7 +2231,7 @@ static void compute_H_full( const sparse_matrix * const H ) H_full->j[count] = H->j[pj]; ++count; } - /* H^T: symmetric, upper triangular portion only stored; + /* H^T: symmetric, upper triangular portion only stored; * skip diagonal from H^T, as included from H above */ for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj ) { @@ -2134,46 +2245,6 @@ static void compute_H_full( const sparse_matrix * const H ) Deallocate_Matrix( H_t ); } -static void compute_H_full_for_SAI( const sparse_matrix * const H , sparse_matrix * H_new) -{ - int count, i, pj; - sparse_matrix *H_t; - - if ( Allocate_Matrix( &H_t, H->n, H->m ) == FAILURE || - Allocate_Matrix( &H_new, H->n, 2 * H->m - H->n ) == FAILURE ) - { - fprintf( stderr, "not enough memory for full H. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - - /* Set up the sparse matrix data structure for A. */ - Transpose( H, H_t ); - - count = 0; - for ( i = 0; i < H->n; ++i ) - { - H_new->start[i] = count; - - /* H: symmetric, lower triangular portion only stored */ - for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj ) - { - H_new->val[count] = H->val[pj]; - H_new->j[count] = H->j[pj]; - ++count; - } - /* H^T: symmetric, upper triangular portion only stored; - * skip diagonal from H^T, as included from H above */ - for ( pj = H_t->start[i] + 1; pj < H_t->start[i + 1]; ++pj ) - { - H_new->val[count] = H_t->val[pj]; - H_new->j[count] = H_t->j[pj]; - ++count; - } - } - H_new->start[i] = count; - - Deallocate_Matrix( H_t ); -} /* Iterative greedy shared-memory parallel graph coloring * @@ -2185,14 +2256,14 @@ static void compute_H_full_for_SAI( const sparse_matrix * const H , sparse_matri * * Reference: * Umit V. Catalyurek et al. - * Graph Coloring Algorithms for Multi-core + * Graph Coloring Algorithms for Multi-core * and Massively Threaded Architectures * Parallel Computing, 2012 */ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) { #ifdef _OPENMP -#pragma omp parallel + #pragma omp parallel #endif { #define MAX_COLOR (500) @@ -2209,7 +2280,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) #endif #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { memset( color, 0, sizeof(unsigned int) * A->n ); @@ -2221,7 +2292,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) if ( tri == LOWER ) { #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -2231,7 +2302,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) else { #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < A->n; ++i ) { @@ -2247,7 +2318,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier #endif while ( recolor_cnt > 0 ) @@ -2256,7 +2327,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) /* color vertices */ #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < recolor_cnt; ++i ) { @@ -2284,14 +2355,14 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) recolor_cnt_local = 0; #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { recolor_cnt = 0; } #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < temp; ++i ) { @@ -2313,9 +2384,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) conflict_cnt[tid + 1] = recolor_cnt_local; #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier -#pragma omp master + #pragma omp master #endif { conflict_cnt[0] = 0; @@ -2327,7 +2398,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier #endif /* copy thread-local conflicts into shared buffer */ @@ -2338,9 +2409,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) } #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier -#pragma omp single + #pragma omp single #endif { temp_ptr = to_color; @@ -2363,7 +2434,7 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri ) //#endif #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier #endif } } @@ -2382,7 +2453,7 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri ) /* sort vertices by color (ascending within a color) * 1) count colors - * 2) determine offsets of color ranges + * 2) determine offsets of color ranges * 3) sort by color * * note: color is 1-based */ @@ -2417,12 +2488,12 @@ void sort_colors( const unsigned int n, const TRIANGULARITY tri ) * tri: coloring to triangular factor to use (lower/upper) */ static void permute_vector( real * const x, const unsigned int n, const int invert_map, - const TRIANGULARITY tri ) + const TRIANGULARITY tri ) { unsigned int i; #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { if ( x_p == NULL ) @@ -2445,7 +2516,7 @@ static void permute_vector( real * const x, const unsigned int n, const int inve } #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < n; ++i ) { @@ -2453,7 +2524,7 @@ static void permute_vector( real * const x, const unsigned int n, const int inve } #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { memcpy( x, x_p, sizeof(real) * n ); @@ -2609,7 +2680,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) if ( color == NULL ) { #ifdef _OPENMP -#pragma omp parallel + #pragma omp parallel { num_thread = omp_get_num_threads(); } @@ -2619,7 +2690,7 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */ if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || - (to_color =(unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || + (to_color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || (conflict_cnt = (unsigned int*) malloc(sizeof(unsigned int) * (num_thread + 1))) == NULL || (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL || @@ -2661,15 +2732,15 @@ sparse_matrix * setup_graph_coloring( sparse_matrix * const H ) * Note: Newmann series arises from series expansion of the inverse of * the coefficient matrix in the triangular system */ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, - const real * const b, real * const x, const TRIANGULARITY tri, const - unsigned int maxiter ) + const real * const b, real * const x, const TRIANGULARITY tri, const + unsigned int maxiter ) { unsigned int i, k, si = 0, ei = 0, iter; iter = 0; #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { if ( Dinv_b == NULL ) @@ -2702,7 +2773,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, /* precompute and cache, as invariant in loop below */ #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif for ( i = 0; i < R->n; ++i ) { @@ -2713,7 +2784,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, { // x_{k+1} = G*x_{k} + Dinv*b; #ifdef _OPENMP -#pragma omp for schedule(guided) + #pragma omp for schedule(guided) #endif for ( i = 0; i < R->n; ++i ) { @@ -2741,7 +2812,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, } #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { rp3 = rp; @@ -2769,7 +2840,7 @@ void jacobi_iter( const sparse_matrix * const R, const real * const Dinv, * Matrices have non-zero diagonals * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ static void apply_preconditioner( const static_storage * const workspace, const control_params * const control, - const real * const y, real * const x, const int fresh_pre ) + const real * const y, real * const x, const int fresh_pre ) { int i, si; @@ -2782,157 +2853,157 @@ static void apply_preconditioner( const static_storage * const workspace, const { switch ( control->cm_solver_pre_app_type ) { - case TRI_SOLVE_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); - break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: - tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); - tri_solve( workspace->U, x, x, workspace->U->n, UPPER ); - break; - case SAI_PC: - //TODO: add code to compute SAI first - // Sparse_MatVec( SAI, y, x ); - break; - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - } + case TRI_SOLVE_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); break; - case TRI_SOLVE_LEVEL_SCHED_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); - break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: - tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); - tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); - break; - case SAI_PC: - //TODO: add code to compute SAI first - // Sparse_MatVec( SAI, y, x ); - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - } + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: + tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); + tri_solve( workspace->U, x, x, workspace->U->n, UPPER ); break; - case TRI_SOLVE_GC_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - case SAI_PC: - fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: + case SAI_PC: + //TODO: add code to compute SAI first + // Sparse_MatVec( SAI, y, x ); + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_LEVEL_SCHED_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + diag_pre_app( workspace->Hdia_inv, y, x, workspace->H->n ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: + tri_solve_level_sched( workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); + tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); + break; + case SAI_PC: + //TODO: add code to compute SAI first + // Sparse_MatVec( SAI, y, x ); + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_GC_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif - { - memcpy( y_p, y, sizeof(real) * workspace->H->n ); - } + { + memcpy( y_p, y, sizeof(real) * workspace->H->n ); + } - permute_vector( y_p, workspace->H->n, FALSE, LOWER ); - tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre ); - tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); - permute_vector( x, workspace->H->n, TRUE, UPPER ); - break; - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - } + permute_vector( y_p, workspace->H->n, FALSE, LOWER ); + tri_solve_level_sched( workspace->L, y_p, x, workspace->L->n, LOWER, fresh_pre ); + tri_solve_level_sched( workspace->U, x, x, workspace->U->n, UPPER, fresh_pre ); + permute_vector( x, workspace->H->n, TRUE, UPPER ); + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); break; - case JACOBI_ITER_PA: - switch ( control->cm_solver_pre_comp_type ) - { - case DIAG_PC: - case SAI_PC: - fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); - exit( INVALID_INPUT ); - break; - case ICHOLT_PC: - case ILU_PAR_PC: - case ILUT_PAR_PC: + } + break; + case JACOBI_ITER_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case DIAG_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + case ICHOLT_PC: + case ILU_PAR_PC: + case ILUT_PAR_PC: #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif - { - if ( Dinv_L == NULL ) - { - if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - } - } + { + if ( Dinv_L == NULL ) + { + if ( (Dinv_L = (real*) malloc(sizeof(real) * workspace->L->n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + } - /* construct D^{-1}_L */ - if ( fresh_pre == TRUE ) - { + /* construct D^{-1}_L */ + if ( fresh_pre == TRUE ) + { #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif - for ( i = 0; i < workspace->L->n; ++i ) - { - si = workspace->L->start[i + 1] - 1; - Dinv_L[i] = 1. / workspace->L->val[si]; - } - } + for ( i = 0; i < workspace->L->n; ++i ) + { + si = workspace->L->start[i + 1] - 1; + Dinv_L[i] = 1. / workspace->L->val[si]; + } + } - jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); + jacobi_iter( workspace->L, Dinv_L, y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif - { - if ( Dinv_U == NULL ) - { - if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) - { - fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); - exit( INSUFFICIENT_MEMORY ); - } - } - } + { + if ( Dinv_U == NULL ) + { + if ( (Dinv_U = (real*) malloc(sizeof(real) * workspace->U->n)) == NULL ) + { + fprintf( stderr, "not enough memory for Jacobi iteration matrices. terminating.\n" ); + exit( INSUFFICIENT_MEMORY ); + } + } + } - /* construct D^{-1}_U */ - if ( fresh_pre == TRUE ) - { + /* construct D^{-1}_U */ + if ( fresh_pre == TRUE ) + { #ifdef _OPENMP -#pragma omp for schedule(static) + #pragma omp for schedule(static) #endif - for ( i = 0; i < workspace->U->n; ++i ) - { - si = workspace->U->start[i]; - Dinv_U[i] = 1. / workspace->U->val[si]; - } - } - - jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); - break; - default: - fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); - exit( INVALID_INPUT ); - break; + for ( i = 0; i < workspace->U->n; ++i ) + { + si = workspace->U->start[i]; + Dinv_U[i] = 1. / workspace->U->val[si]; } - break; + } + + jacobi_iter( workspace->U, Dinv_U, y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); + break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; + } + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; } } @@ -2941,8 +3012,8 @@ static void apply_preconditioner( const static_storage * const workspace, const /* generalized minimual residual iterative solver for sparse linear systems */ int GMRES( const static_storage * const workspace, const control_params * const control, - simulation_data * const data, const sparse_matrix * const H, const real * const b, - const real tol, real * const x, const int fresh_pre ) + simulation_data * const data, const sparse_matrix * const H, const real * const b, + const real tol, real * const x, const int fresh_pre ) { int i, j, k, itr, N, g_j, g_itr; real cc, tmp1, tmp2, temp, ret_temp, bnorm, time_start; @@ -2950,7 +3021,7 @@ int GMRES( const static_storage * const workspace, const control_params * const N = H->n; #ifdef _OPENMP -#pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \ + #pragma omp parallel default(none) private(i, j, k, itr, bnorm, ret_temp) \ shared(N, cc, tmp1, tmp2, temp, time_start, g_itr, g_j, stderr) #endif { @@ -2958,14 +3029,14 @@ int GMRES( const static_storage * const workspace, const control_params * const itr = 0; #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } bnorm = Norm( b, N ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -2975,14 +3046,14 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* apply preconditioner to residual */ #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } apply_preconditioner( workspace, control, b, workspace->b_prc, fresh_pre ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -2994,14 +3065,14 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* calculate r0 */ #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } Sparse_MatVec( H, x, workspace->b_prm ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); @@ -3010,14 +3081,14 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( control->cm_solver_pre_comp_type == DIAG_PC ) { #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } apply_preconditioner( workspace, control, workspace->b_prm, workspace->b_prm, FALSE ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -3027,14 +3098,14 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( control->cm_solver_pre_comp_type == DIAG_PC ) { #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } Vector_Sum( workspace->v[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -3043,14 +3114,14 @@ int GMRES( const static_storage * const workspace, const control_params * const else { #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } Vector_Sum( workspace->v[0], 1., b, -1., workspace->b_prm, N ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -3060,15 +3131,15 @@ int GMRES( const static_storage * const workspace, const control_params * const if ( control->cm_solver_pre_comp_type != DIAG_PC ) { #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } apply_preconditioner( workspace, control, workspace->v[0], workspace->v[0], - itr == 0 ? fresh_pre : FALSE ); + itr == 0 ? fresh_pre : FALSE ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -3076,14 +3147,14 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } ret_temp = Norm( workspace->v[0], N ); #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { workspace->g[0] = ret_temp; @@ -3091,7 +3162,7 @@ int GMRES( const static_storage * const workspace, const control_params * const Vector_Scale( workspace->v[0], 1. / workspace->g[0], workspace->v[0], N ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -3102,7 +3173,7 @@ int GMRES( const static_storage * const workspace, const control_params * const { /* matvec */ #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); @@ -3110,14 +3181,14 @@ int GMRES( const static_storage * const workspace, const control_params * const Sparse_MatVec( H, workspace->v[j], workspace->v[j + 1] ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_spmv += Get_Timing_Info( time_start ); } #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); @@ -3125,7 +3196,7 @@ int GMRES( const static_storage * const workspace, const control_params * const apply_preconditioner( workspace, control, workspace->v[j + 1], workspace->v[j + 1], FALSE ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_pre_app += Get_Timing_Info( time_start ); @@ -3135,7 +3206,7 @@ int GMRES( const static_storage * const workspace, const control_params * const // { /* apply modified Gram-Schmidt to orthogonalize the new residual */ #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); @@ -3145,7 +3216,7 @@ int GMRES( const static_storage * const workspace, const control_params * const ret_temp = Dot( workspace->v[i], workspace->v[j + 1], N ); #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { workspace->h[i][j] = ret_temp; @@ -3155,7 +3226,7 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -3202,31 +3273,31 @@ int GMRES( const static_storage * const workspace, const control_params * const // } #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); } ret_temp = Norm( workspace->v[j + 1], N ); #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif { workspace->h[j + 1][j] = ret_temp; } Vector_Scale( workspace->v[j + 1], - 1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N ); + 1.0 / workspace->h[j + 1][j], workspace->v[j + 1], N ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); } #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); @@ -3244,9 +3315,9 @@ int GMRES( const static_storage * const workspace, const control_params * const } tmp1 = workspace->hc[i] * workspace->h[i][j] + - workspace->hs[i] * workspace->h[i + 1][j]; + workspace->hs[i] * workspace->h[i + 1][j]; tmp2 = -workspace->hs[i] * workspace->h[i][j] + - workspace->hc[i] * workspace->h[i + 1][j]; + workspace->hc[i] * workspace->h[i + 1][j]; workspace->h[i][j] = tmp1; workspace->h[i + 1][j] = tmp2; @@ -3285,13 +3356,13 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier #endif } /* solve Hy = g: H is now upper-triangular, do back-substitution */ #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { time_start = Get_Time( ); @@ -3323,7 +3394,7 @@ int GMRES( const static_storage * const workspace, const control_params * const Vector_Add( x, 1., workspace->p, N ); #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { data->timing.cm_solver_vector_ops += Get_Timing_Info( time_start ); @@ -3337,7 +3408,7 @@ int GMRES( const static_storage * const workspace, const control_params * const } #ifdef _OPENMP -#pragma omp master + #pragma omp master #endif { g_itr = itr; @@ -3356,9 +3427,9 @@ int GMRES( const static_storage * const workspace, const control_params * const int GMRES_HouseHolder( const static_storage * const workspace, - const control_params * const control, simulation_data * const data, - const sparse_matrix * const H, const real * const b, real tol, - real * const x, const int fresh_pre ) + const control_params * const control, simulation_data * const data, + const sparse_matrix * const H, const real * const b, real tol, + real * const x, const int fresh_pre ) { int i, j, k, itr, N; real cc, tmp1, tmp2, temp, bnorm; @@ -3565,7 +3636,7 @@ int CG( const static_storage * const workspace, const control_params * const con z = workspace->p; #ifdef _OPENMP -#pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \ + #pragma omp parallel default(none) private(i, tmp, alpha, beta, b_norm, r_norm, sig_old, sig_new) \ shared(itr, N, d, r, p, z) #endif { @@ -3601,7 +3672,7 @@ int CG( const static_storage * const workspace, const control_params * const con } #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif itr = i; } @@ -3618,8 +3689,8 @@ int CG( const static_storage * const workspace, const control_params * const con /* Steepest Descent */ int SDM( const static_storage * const workspace, const control_params * const control, - const sparse_matrix * const H, const real * const b, const real tol, - real * const x, const int fresh_pre ) + const sparse_matrix * const H, const real * const b, const real tol, + real * const x, const int fresh_pre ) { int i, itr, N; real tmp, alpha, b_norm; @@ -3628,14 +3699,14 @@ int SDM( const static_storage * const workspace, const control_params * const co N = H->n; #ifdef _OPENMP -#pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \ + #pragma omp parallel default(none) private(i, tmp, alpha, b_norm, sig) \ shared(itr, N) #endif { b_norm = Norm( b, N ); Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->q, N ); + Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->q, N ); apply_preconditioner( workspace, control, workspace->r, workspace->d, fresh_pre ); @@ -3652,7 +3723,7 @@ int SDM( const static_storage * const workspace, const control_params * const co * (Dot function has persistent state in the form * of a shared global variable for the OpenMP version) */ #ifdef _OPENMP -#pragma omp barrier + #pragma omp barrier #endif tmp = Dot( workspace->d, workspace->q, N ); @@ -3665,7 +3736,7 @@ int SDM( const static_storage * const workspace, const control_params * const co } #ifdef _OPENMP -#pragma omp single + #pragma omp single #endif itr = i; } diff --git a/sPuReMD/src/lin_alg.h b/sPuReMD/src/lin_alg.h index ceb2a18bfcfb6dc8ddaff3e8c807459a9d401d0d..91b0a5152dd5ec682be5791afb962b6a86f13702 100644 --- a/sPuReMD/src/lin_alg.h +++ b/sPuReMD/src/lin_alg.h @@ -32,6 +32,9 @@ typedef enum void Sort_Matrix_Rows( sparse_matrix * const ); +void Setup_Sparsity_Pattern( const sparse_matrix * const, + const real, sparse_matrix * ); + int Estimate_LU_Fill( const sparse_matrix * const, const real * const ); void Calculate_Droptol( const sparse_matrix * const, @@ -58,6 +61,11 @@ real ILU_PAR( const sparse_matrix * const, const unsigned int, real ILUT_PAR( const sparse_matrix * const, const real *, const unsigned int, sparse_matrix * const, sparse_matrix * const ); +#if defined(HAVE_LAPACK) +real Sparse_Approx_Inverse( const sparse_matrix * const, const sparse_matrix * const, + sparse_matrix ** ); +#endif + void Transpose( const sparse_matrix * const, sparse_matrix * const ); void Transpose_I( sparse_matrix * const ); diff --git a/sPuReMD/src/mytypes.h b/sPuReMD/src/mytypes.h index e68c830aa6174255183bbac5eb03b483e54d7fcb..a66758b1fb5d00dad29edf656ba536dc7d7fdee8 100644 --- a/sPuReMD/src/mytypes.h +++ b/sPuReMD/src/mytypes.h @@ -607,6 +607,7 @@ typedef struct unsigned int cm_solver_pre_comp_refactor; real cm_solver_pre_comp_droptol; unsigned int cm_solver_pre_comp_sweeps; + unsigned int cm_solver_pre_comp_sai_thres; unsigned int cm_solver_pre_app_type; unsigned int cm_solver_pre_app_jacobi_iters; @@ -890,6 +891,8 @@ typedef struct /* charge method storage */ sparse_matrix *H; sparse_matrix *H_sp; + sparse_matrix *H_spar_patt; + sparse_matrix *H_app_inv; sparse_matrix *L; sparse_matrix *U; real *droptol; diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c index 1ef0a527273e4466760866ebab8e6131d5083746..383fdb8941e1f3c012191a965dc6aa1b05790b6a 100644 --- a/sPuReMD/src/print_utils.c +++ b/sPuReMD/src/print_utils.c @@ -406,7 +406,7 @@ void Print_Near_Neighbors( reax_system *system, control_params *control, FILE *fout; reax_list *near_nbrs = &((*lists)[NEAR_NBRS]); - sprintf( fname, "%s.near_nbrs", control->sim_name ); + snprintf( fname, MAX_STR, "%s.near_nbrs", control->sim_name ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -440,7 +440,7 @@ void Print_Near_Neighbors2( reax_system *system, control_params *control, FILE *fout; reax_list *near_nbrs = &((*lists)[NEAR_NBRS]); - sprintf( fname, "%s.near_nbrs_lgj", control->sim_name ); + snprintf( fname, MAX_STR, "%s.near_nbrs_lgj", control->sim_name ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -475,7 +475,7 @@ void Print_Far_Neighbors( reax_system *system, control_params *control, FILE *fout; reax_list *far_nbrs = &((*lists)[FAR_NBRS]); - sprintf( fname, "%s.far_nbrs", control->sim_name ); + snprintf( fname, MAX_STR, "%s.far_nbrs", control->sim_name ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -520,7 +520,7 @@ void Print_Far_Neighbors2( reax_system *system, control_params *control, FILE *fout; reax_list *far_nbrs = &((*lists)[FAR_NBRS]); - sprintf( fname, "%s.far_nbrs_lgj", control->sim_name ); + snprintf( fname, MAX_STR, "%s.far_nbrs_lgj", control->sim_name ); fout = fopen( fname, "w" ); int num = 0; int temp[500]; @@ -553,7 +553,7 @@ void Print_Total_Force( reax_system *system, control_params *control, int i; #if !defined(TEST_FORCES) char temp[1000]; - sprintf( temp, "%s.ftot", control->sim_name ); + snprintf( temp, 1000, "%s.ftot", control->sim_name ); out_control->ftot = fopen( temp, "w" ); #endif @@ -735,7 +735,7 @@ void Print_Linear_System( reax_system *system, control_params *control, sparse_matrix *H; FILE *out; - sprintf( fname, "%s.state%d.out", control->sim_name, step ); + snprintf( fname, 100, "%s.state%d.out", control->sim_name, step ); out = fopen( fname, "w" ); for ( i = 0; i < system->N_cm; i++ ) @@ -747,13 +747,13 @@ void Print_Linear_System( reax_system *system, control_params *control, workspace->t[0][i], workspace->b_t[i] ); fclose( out ); - // sprintf( fname, "x2_%d", step ); + // snprintf( fname, 100, "x2_%d", step ); // out = fopen( fname, "w" ); // for( i = 0; i < system->N; i++ ) // fprintf( out, "%g\n", workspace->s_t[i+system->N] ); // fclose( out ); - sprintf( fname, "%s.H%d.out", control->sim_name, step ); + snprintf( fname, 100, "%s.H%d.out", control->sim_name, step ); out = fopen( fname, "w" ); H = workspace->H; @@ -776,7 +776,7 @@ void Print_Linear_System( reax_system *system, control_params *control, fclose( out ); - sprintf( fname, "%s.H_sp%d.out", control->sim_name, step ); + snprintf( fname, 100, "%s.H_sp%d.out", control->sim_name, step ); out = fopen( fname, "w" ); H = workspace->H_sp; @@ -799,13 +799,13 @@ void Print_Linear_System( reax_system *system, control_params *control, fclose( out ); - /*sprintf( fname, "%s.b_s%d", control->sim_name, step ); + /*snprintf( fname, 100, "%s.b_s%d", control->sim_name, step ); out = fopen( fname, "w" ); for( i = 0; i < system->N; i++ ) fprintf( out, "%12.7f\n", workspace->b_s[i] ); fclose( out ); - sprintf( fname, "%s.b_t%d", control->sim_name, step ); + snprintf( fname, 100, "%s.b_t%d", control->sim_name, step ); out = fopen( fname, "w" ); for( i = 0; i < system->N; i++ ) fprintf( out, "%12.7f\n", workspace->b_t[i] ); @@ -820,7 +820,7 @@ void Print_Charges( reax_system *system, control_params *control, char fname[100]; FILE *fout; - sprintf( fname, "%s.q%d", control->sim_name, step ); + snprintf( fname, 100, "%s.q%d", control->sim_name, step ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; ++i ) @@ -1002,7 +1002,7 @@ Print_XYZ_Serial( reax_system* system, static_storage *workspace ) FILE *fout; int i; - sprintf( fname, "READ_PDB.0" ); + snprintf( fname, 100, "READ_PDB.0" ); fout = fopen( fname, "w" ); for ( i = 0; i < system->N; i++ ) diff --git a/sPuReMD/src/restart.c b/sPuReMD/src/restart.c index c15d51c502f5960984c38f5485384c122878b720..0f7b1c5bdcbacdb134f6ab4acd944fb8f5ef109d 100644 --- a/sPuReMD/src/restart.c +++ b/sPuReMD/src/restart.c @@ -23,8 +23,9 @@ #include "box.h" #include "vector.h" + void Write_Binary_Restart( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace ) + simulation_data *data, static_storage *workspace ) { int i; char fname[MAX_STR]; @@ -33,7 +34,7 @@ void Write_Binary_Restart( reax_system *system, control_params *control, restart_header res_header; restart_atom res_data; - sprintf( fname, "%s.res%d", control->sim_name, data->step ); + snprintf( fname, MAX_STR + 8, "%s.res%d", control->sim_name, data->step ); fres = fopen( fname, "wb" ); res_header.step = data->step; @@ -139,14 +140,14 @@ void Read_Binary_Restart( char *fname, reax_system *system, void Write_ASCII_Restart( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace ) + simulation_data *data, static_storage *workspace ) { int i; char fname[MAX_STR]; FILE *fres; reax_atom *p_atom; - sprintf( fname, "%s.res%d", control->sim_name, data->step ); + snprintf( fname, MAX_STR + 8, "%s.res%d", control->sim_name, data->step ); fres = fopen( fname, "w" ); fprintf( fres, RESTART_HEADER, diff --git a/sPuReMD/src/traj.c b/sPuReMD/src/traj.c index bfdd2fcfa34680e701cfa22a7f152accf42d0d27..14e2a36b0cc028d974211a51671d52ec4123d044 100644 --- a/sPuReMD/src/traj.c +++ b/sPuReMD/src/traj.c @@ -31,13 +31,15 @@ int Write_Custom_Header( reax_system *system, control_params *control, static_storage *workspace, output_controls *out_control ) { +#define SIZE1 (2048) +#define SIZE2 (100) int i, header_len, control_block_len, frame_format_len; // char buffer[2048]; - char control_block[2048]; - char frame_format[2048]; - char atom_format[100], bond_format[100], angle_format[100]; + char control_block[SIZE1]; + char frame_format[SIZE1]; + char atom_format[SIZE2], bond_format[SIZE2], angle_format[SIZE2]; - sprintf( control_block, CONTROL_BLOCK, + snprintf( control_block, SIZE1, CONTROL_BLOCK, system->N, control->restart, control->restart_from, @@ -80,23 +82,23 @@ int Write_Custom_Header( reax_system *system, control_params *control, control_block_len = strlen( control_block ); - sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", + snprintf( frame_format, SIZE1, "Frame Format: %d\n%s\n%s\n", NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES ); atom_format[0] = OPT_NOATOM; switch ( out_control->atom_format ) { case OPT_ATOM_BASIC: - sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC ); + snprintf( atom_format, SIZE2, "Atom_Basic: %s", ATOM_BASIC ); break; case OPT_ATOM_wF: - sprintf( atom_format, "Atom_wF: %s", ATOM_wF ); + snprintf( atom_format, SIZE2, "Atom_wF: %s", ATOM_wF ); break; case OPT_ATOM_wV: - sprintf( atom_format, "Atom_wV: %s", ATOM_wV ); + snprintf( atom_format, SIZE2, "Atom_wV: %s", ATOM_wV ); break; case OPT_ATOM_FULL: - sprintf( atom_format, "Atom_Full: %s", ATOM_FULL ); + snprintf( atom_format, SIZE2, "Atom_Full: %s", ATOM_FULL ); break; default: break; @@ -106,18 +108,18 @@ int Write_Custom_Header( reax_system *system, control_params *control, bond_format[0] = OPT_NOBOND; if ( out_control->bond_info == OPT_BOND_BASIC ) { - sprintf( bond_format, "Bond_Line: %s", BOND_BASIC ); + snprintf( bond_format, SIZE2, "Bond_Line: %s", BOND_BASIC ); } else if ( out_control->bond_info == OPT_BOND_FULL ) { - sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL ); + snprintf( bond_format, SIZE2, "Bond_Line_Full: %s", BOND_FULL ); } strcat( frame_format, bond_format ); angle_format[0] = OPT_NOANGLE; if ( out_control->angle_info == OPT_ANGLE_BASIC ) { - sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC ); + snprintf( angle_format, SIZE2, "Angle_Line: %s", ANGLE_BASIC ); } strcat( frame_format, angle_format ); @@ -158,6 +160,9 @@ int Write_Custom_Header( reax_system *system, control_params *control, fflush( out_control->trj ); +#undef SIZE2 +#undef SIZE1 + return 0; } @@ -166,12 +171,13 @@ int Append_Custom_Frame( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, reax_list **lists, output_controls *out_control ) { +#define SIZE (2048) int i, j, pi, pk, pk_j; int write_atoms, write_bonds, write_angles; int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len; int frame_globals_len, num_bonds, num_thb_intrs; real P; - char buffer[2048]; + char buffer[SIZE]; reax_list *bonds = (*lists) + BONDS; reax_list *thb_intrs = (*lists) + THREE_BODIES; bond_data *bo_ij; @@ -290,7 +296,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control, } /* calculate total frame length*/ - sprintf( buffer, FRAME_GLOBALS, + snprintf( buffer, SIZE, FRAME_GLOBALS, data->step, data->time, data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T, P, system->box.volume, @@ -476,6 +482,8 @@ int Append_Custom_Frame( reax_system *system, control_params *control, fflush( out_control->trj ); +#undef SIZE + return 0; }