diff --git a/PG-PuReMD/src/basic_comm.c b/PG-PuReMD/src/basic_comm.c
index 5d2afdeb38ec397ce2a82b1112ea5ca1c15dabce..b81180ec1626735aa8f141594b538ecd2741d83a 100644
--- a/PG-PuReMD/src/basic_comm.c
+++ b/PG-PuReMD/src/basic_comm.c
@@ -34,6 +34,19 @@
 typedef void (*dist_packer)( void*, mpi_out_data* );
 typedef void (*coll_unpacker)( void*, void*, mpi_out_data* );
 
+static void int_packer( void *dummy, mpi_out_data *out_buf )
+{
+    int i;
+    int *buf = (int*) dummy;
+    int *out = (int*) out_buf->out_atoms;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        out[i] = buf[ out_buf->index[i] ];
+    }
+}
+
+
 static void real_packer( void *dummy, mpi_out_data *out_buf )
 {
     int i;
@@ -77,6 +90,21 @@ static void rvec2_packer( void *dummy, mpi_out_data *out_buf )
 }
 
 
+static void int_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
+{
+    int i;
+    int *in, *buf;
+
+    in = (int*) dummy_in;
+    buf = (int*) dummy_buf;
+
+    for ( i = 0; i < out_buf->cnt; ++i )
+    {
+        buf[ out_buf->index[i] ] += in[i];
+    }
+}
+
+
 static void real_unpacker( void *dummy_in, void *dummy_buf, mpi_out_data *out_buf )
 {
     int i;
@@ -135,6 +163,10 @@ static void * Get_Buffer_Offset( const void * const buffer,
 
     switch ( type )
     {
+        case INT_PTR_TYPE:
+            ptr = (int *) buffer + offset;
+            break;
+
         case REAL_PTR_TYPE:
             ptr = (real *) buffer + offset;
             break;
diff --git a/PG-PuReMD/src/basic_comm.h b/PG-PuReMD/src/basic_comm.h
index 5b12d681b243e01faa85398d733bf765ddd9f0b0..5f13d4eb0a2df8f5c25cf65ddf254cd1ca2a60c4 100644
--- a/PG-PuReMD/src/basic_comm.h
+++ b/PG-PuReMD/src/basic_comm.h
@@ -27,9 +27,10 @@
 
 enum pointer_type
 {
-    REAL_PTR_TYPE = 0,
-    RVEC_PTR_TYPE = 1,
-    RVEC2_PTR_TYPE = 2,
+    INT_PTR_TYPE = 0,
+    REAL_PTR_TYPE = 1,
+    RVEC_PTR_TYPE = 2,
+    RVEC2_PTR_TYPE = 3,
 };
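The new INT_PTR_TYPE plumbing mirrors the existing real/rvec packer and unpacker pairs: Dist() packs owned integer values out by index for the neighbor exchange, and Coll() accumulates the received values back into the indexed slots. A minimal standalone sketch of that round trip (mini_out_data and the sizes below are made-up stand-ins for mpi_out_data):

```c
/* Minimal sketch of the pack/unpack round trip for INT_PTR_TYPE buffers.
 * mini_out_data is an illustrative stand-in for mpi_out_data, and the
 * "send" is simulated in place of the real MPI exchange in Dist()/Coll(). */
#include <stdio.h>

typedef struct
{
    int cnt;        /* number of entries to exchange */
    int index[3];   /* local indices selected for this neighbor */
    int out[3];     /* staging buffer (out_atoms in the real struct) */
} mini_out_data;

int main( void )
{
    int local[5] = { 10, 20, 30, 40, 50 };
    mini_out_data ob = { 3, { 4, 0, 2 }, { 0 } };
    int i, recv[3];

    /* pack: gather local values by index, as int_packer does */
    for ( i = 0; i < ob.cnt; ++i )
        ob.out[i] = local[ ob.index[i] ];

    /* ...MPI_Send/MPI_Recv would move ob.out to the neighbor here... */
    for ( i = 0; i < ob.cnt; ++i )
        recv[i] = ob.out[i];

    /* unpack: accumulate received values by index, as int_unpacker does */
    for ( i = 0; i < ob.cnt; ++i )
        local[ ob.index[i] ] += recv[i];

    printf( "%d %d %d\n", local[0], local[2], local[4] ); /* 20 60 100 */
    return 0;
}
```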
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index a98abddeda8ed9757a631e140f406d6d2dd78ff2..364e0805844046fd09020a97b47ddc6ffdcdae3e 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -179,7 +179,7 @@ static void Extrapolate_Charges_QEq( const reax_system * const system,
     else if ( control->cm_init_guess_extrap1 == 3 )
     {
         s_tmp = 4.0 * (system->my_atoms[i].s[0] + system->my_atoms[i].s[2]) -
-          (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
+            (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
     }
     else
     {
@@ -277,7 +277,7 @@ static void Extrapolate_Charges_EE( const reax_system * const system,
     else if ( control->cm_init_guess_extrap1 == 3 )
     {
         s_tmp = 4.0 * (system->my_atoms[i].s[0] + system->my_atoms[i].s[2]) -
-          (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
+            (6.0 * system->my_atoms[i].s[1] + system->my_atoms[i].s[3]);
     }
     else
     {
@@ -309,10 +309,11 @@ static void Extrapolate_Charges_EE_Part2( const reax_system * const system,
 
 
 /* Compute preconditioner for QEq
- */
+*/
 static void Compute_Preconditioner_QEq( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     sparse_matrix *Hptr;
 
@@ -333,7 +334,7 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         case DIAG_PC:
             data->timing.cm_solver_pre_comp +=
                 diag_pre_comp( system, workspace->Hdia_inv );
-//                diag_pre_comp( Hptr, workspace->Hdia_inv );
+            //                diag_pre_comp( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -347,9 +348,9 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
             //TODO: implement
-//            data->timing.cm_solver_pre_comp +=
-//                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
-//                        &workspace->H_app_inv );
+            data->timing.cm_solver_pre_comp +=
+                sparse_approx_inverse( system, workspace, mpi_data,
+                        Hptr, workspace->H_spar_patt, &workspace->H_app_inv );
 #else
             fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -365,10 +366,11 @@ static void Compute_Preconditioner_QEq( const reax_system * const system,
 
 
 /* Compute preconditioner for EE
- */
+*/
 static void Compute_Preconditioner_EE( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     sparse_matrix *Hptr;
 
@@ -387,7 +389,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
     {
         Hptr->entries[Hptr->start[system->N + 1] - 1].val = 1.0;
     }
-    
+
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -396,7 +398,7 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         case DIAG_PC:
             data->timing.cm_solver_pre_comp +=
                 diag_pre_comp( system, workspace->Hdia_inv );
-//                diag_pre_comp( Hptr, workspace->Hdia_inv );
+            //                diag_pre_comp( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -410,9 +412,9 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
             //TODO: implement
-//            data->timing.cm_solver_pre_comp +=
-//                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
-//                        &workspace->H_app_inv );
+            data->timing.cm_solver_pre_comp +=
+                sparse_approx_inverse( system, workspace, mpi_data,
+                        Hptr, workspace->H_spar_patt, &workspace->H_app_inv );
 #else
             fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -435,10 +437,11 @@ static void Compute_Preconditioner_EE( const reax_system * const system,
 
 
 /* Compute preconditioner for ACKS2
- */
+*/
 static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     sparse_matrix *Hptr;
 
@@ -458,7 +461,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         Hptr->entries[Hptr->start[system->N + 1] - 1].val = 1.0;
         Hptr->entries[Hptr->start[system->n_cm] - 1].val = 1.0;
     }
-    
+
     switch ( control->cm_solver_pre_comp_type )
     {
         case NONE_PC:
@@ -467,7 +470,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         case DIAG_PC:
             data->timing.cm_solver_pre_comp +=
                 diag_pre_comp( system, workspace->Hdia_inv );
-//                diag_pre_comp( Hptr, workspace->Hdia_inv );
+            //                diag_pre_comp( Hptr, workspace->Hdia_inv );
             break;
 
         case ICHOLT_PC:
@@ -481,9 +484,9 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
         case SAI_PC:
 #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL)
             //TODO: implement
-//            data->timing.cm_solver_pre_comp +=
-//                sparse_approx_inverse( workspace->H_full, workspace->H_spar_patt_full,
-//                        &workspace->H_app_inv );
+            data->timing.cm_solver_pre_comp +=
+                sparse_approx_inverse( system, workspace, mpi_data,
+                        Hptr, workspace->H_spar_patt, &workspace->H_app_inv );
 #else
             fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" );
             exit( INVALID_INPUT );
@@ -508,7 +511,7 @@ static void Compute_Preconditioner_ACKS2( const reax_system * const system,
 
 static void Setup_Preconditioner_QEq( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace, const mpi_datatypes * const mpi_data )
 {
     real time;
     sparse_matrix *Hptr;
 
@@ -539,8 +542,8 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-//                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
-//                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
+                //                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
+                //                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -554,9 +557,10 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
 
         case SAI_PC:
             //TODO: implement
-//            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//                    control->cm_solver_pre_comp_sai_thres );
+            /*setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+              &workspace->H_spar_patt_full, &workspace->H_app_inv,
+              control->cm_solver_pre_comp_sai_thres );*/
+            setup_sparse_approx_inverse( system, workspace, mpi_data, Hptr, &workspace->H_spar_patt, control->nprocs, control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -568,10 +572,11 @@ static void Setup_Preconditioner_QEq( const reax_system * const system,
 
 
 /* Setup routines before computing the preconditioner for EE
- */
+*/
 static void Setup_Preconditioner_EE( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     real time;
     sparse_matrix *Hptr;
 
@@ -609,8 +614,8 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-//                workspace->Hdia_inv = scalloc( system->n_cm, sizeof( real ),
-//                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
+                //                workspace->Hdia_inv = scalloc( system->n_cm, sizeof( real ),
+                //                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -624,9 +629,10 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
 
         case SAI_PC:
             //TODO: implement
-//            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//                    control->cm_solver_pre_comp_sai_thres );
+            //            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+            //                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
+            //                    control->cm_solver_pre_comp_sai_thres );
+            setup_sparse_approx_inverse( system, workspace, mpi_data, Hptr, &workspace->H_spar_patt, control->nprocs, control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -645,10 +651,11 @@ static void Setup_Preconditioner_EE( const reax_system * const system,
 
 
 /* Setup routines before computing the preconditioner for ACKS2
- */
+*/
 static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         const control_params * const control,
-        simulation_data * const data, storage * const workspace )
+        simulation_data * const data, storage * const workspace,
+        const mpi_datatypes * const mpi_data )
 {
     real time;
     sparse_matrix *Hptr;
 
@@ -687,8 +694,8 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
         case DIAG_PC:
             if ( workspace->Hdia_inv == NULL )
             {
-//                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
-//                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
+                //                workspace->Hdia_inv = scalloc( Hptr->n, sizeof( real ),
+                //                        "Setup_Preconditioner_QEq::workspace->Hdiv_inv" );
             }
             break;
 
@@ -702,9 +709,10 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
 
         case SAI_PC:
             //TODO: implement
-//            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
-//                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
-//                    control->cm_solver_pre_comp_sai_thres );
+            //            setup_sparse_approx_inverse( Hptr, &workspace->H_full, &workspace->H_spar_patt,
+            //                    &workspace->H_spar_patt_full, &workspace->H_app_inv,
+            //                    control->cm_solver_pre_comp_sai_thres );
+            setup_sparse_approx_inverse( system, workspace, mpi_data, Hptr, &workspace->H_spar_patt, control->nprocs, control->cm_solver_pre_comp_sai_thres );
             break;
 
         default:
@@ -724,7 +732,7 @@ static void Setup_Preconditioner_ACKS2( const reax_system * const system,
 
 
 /* Combine ficticious charges s and t to get atomic charge q for QEq method
- */
+*/
 static void Calculate_Charges_QEq( const reax_system * const system,
         storage * const workspace, mpi_datatypes * const mpi_data )
 {
@@ -764,7 +772,7 @@ static void Calculate_Charges_QEq( const reax_system * const system,
 
 
 /* Get atomic charge q for EE method
- */
+*/
 static void Calculate_Charges_EE( const reax_system * const system,
         storage * const workspace, mpi_datatypes * const mpi_data )
 {
@@ -795,33 +803,33 @@ static void QEq( reax_system * const system, control_params * const control,
 
     if ( control->cm_solver_pre_comp_refactor > 0
             && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
-        
+
     {
-        Setup_Preconditioner_QEq( system, control, data, workspace );
+        Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_QEq( system, control, data, workspace );
+        Compute_Preconditioner_QEq( system, control, data, workspace, mpi_data );
     }
 
     Extrapolate_Charges_QEq( system, control, data, workspace );
 
     switch ( control->cm_solver_type )
     {
-    case CG_S:
-        iters = dual_CG( system, control, workspace, data, mpi_data,
-                &workspace->H, workspace->b, control->cm_solver_q_err,
-                workspace->x, (control->cm_solver_pre_comp_refactor > 0
-                    && (data->step - data->prev_steps)
-                    % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE );
-        break;
+        case CG_S:
+            iters = dual_CG( system, control, workspace, data, mpi_data,
+                    &workspace->H, workspace->b, control->cm_solver_q_err,
+                    workspace->x, (control->cm_solver_pre_comp_refactor > 0
+                        && (data->step - data->prev_steps)
+                        % control->cm_solver_pre_comp_refactor == 0) ? TRUE : FALSE );
+            break;
 
-    case GMRES_S:
-    case GMRES_H_S:
-    case SDM_S:
-    case BiCGStab_S:
-    default:
-        fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
+        case GMRES_S:
+        case GMRES_H_S:
+        case SDM_S:
+        case BiCGStab_S:
+        default:
+            fprintf( stderr, "[ERROR] Unsupported solver selection. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
 #if defined(LOG_PERFORMANCE)
@@ -856,24 +864,24 @@ static void EE( reax_system * const system, control_params * const control,
     if ( control->cm_solver_pre_comp_refactor > 0
             && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
     {
-        Setup_Preconditioner_EE( system, control, data, workspace );
+        Setup_Preconditioner_EE( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_EE( system, control, data, workspace );
+        Compute_Preconditioner_EE( system, control, data, workspace, mpi_data );
     }
 
     Extrapolate_Charges_EE( system, control, data, workspace );
 
     switch ( control->cm_solver_type )
     {
-    case GMRES_S:
-    case GMRES_H_S:
-    case CG_S:
-    case SDM_S:
-    case BiCGStab_S:
-    default:
-        fprintf( stderr, "[ERROR] Unrecognized EE solver selection. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
+        case GMRES_S:
+        case GMRES_H_S:
+        case CG_S:
+        case SDM_S:
+        case BiCGStab_S:
+        default:
+            fprintf( stderr, "[ERROR] Unrecognized EE solver selection. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
     data->timing.cm_solver_iters += iters;
@@ -903,24 +911,24 @@ static void ACKS2( reax_system * const system, control_params * const control,
     if ( control->cm_solver_pre_comp_refactor > 0
             && ((data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0) )
     {
-        Setup_Preconditioner_ACKS2( system, control, data, workspace );
+        Setup_Preconditioner_ACKS2( system, control, data, workspace, mpi_data );
 
-        Compute_Preconditioner_ACKS2( system, control, data, workspace );
+        Compute_Preconditioner_ACKS2( system, control, data, workspace, mpi_data );
     }
 
     Extrapolate_Charges_EE( system, control, data, workspace );
 
     switch ( control->cm_solver_type )
     {
-    case GMRES_S:
-    case GMRES_H_S:
-    case CG_S:
-    case SDM_S:
-    case BiCGStab_S:
-    default:
-        fprintf( stderr, "[ERROR] Unrecognized ACKS2 solver selection. Terminating...\n" );
-        exit( INVALID_INPUT );
-        break;
+        case GMRES_S:
+        case GMRES_H_S:
+        case CG_S:
+        case SDM_S:
+        case BiCGStab_S:
+        default:
+            fprintf( stderr, "[ERROR] Unrecognized ACKS2 solver selection. Terminating...\n" );
+            exit( INVALID_INPUT );
+            break;
     }
 
     data->timing.cm_solver_iters += iters;
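QEq() evaluates the refactoring predicate twice: once to gate Setup/Compute_Preconditioner_QEq and once for the fresh_pre argument of dual_CG. A small helper, sketched below with a hypothetical name (not in the patch), would keep the two copies from drifting apart:

```c
/* Hypothetical helper (not in the patch): evaluates the preconditioner
 * refactoring predicate once so QEq()/EE()/ACKS2() and the fresh_pre
 * argument passed to the solver cannot disagree. Uses the same fields
 * of control_params and simulation_data as the code above. */
static int is_refactoring_step( const control_params * const control,
        const simulation_data * const data )
{
    return control->cm_solver_pre_comp_refactor > 0
        && (data->step - data->prev_steps)
            % control->cm_solver_pre_comp_refactor == 0;
}
```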
@@ -938,21 +946,21 @@ void Compute_Charges( reax_system * const system, control_params * const control
 {
     switch ( control->charge_method )
     {
-    case QEQ_CM:
-        QEq( system, control, data, workspace, out_control, mpi_data );
-        break;
+        case QEQ_CM:
+            QEq( system, control, data, workspace, out_control, mpi_data );
+            break;
 
-    case EE_CM:
-        EE( system, control, data, workspace, out_control, mpi_data );
-        break;
+        case EE_CM:
+            EE( system, control, data, workspace, out_control, mpi_data );
+            break;
 
-    case ACKS2_CM:
-        ACKS2( system, control, data, workspace, out_control, mpi_data );
-        break;
+        case ACKS2_CM:
+            ACKS2( system, control, data, workspace, out_control, mpi_data );
+            break;
 
-    default:
-        fprintf( stderr, "[ERROR] Invalid charge method. Terminating...\n" );
-        exit( UNKNOWN_OPTION );
-        break;
+        default:
+            fprintf( stderr, "[ERROR] Invalid charge method. Terminating...\n" );
+            exit( UNKNOWN_OPTION );
+            break;
     }
 }
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index d837d03bf7647ff5c75c5a05e55e2322f2abeb97..33c2c6a851b5ff67f12ae2527b8b29b488425393 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -29,13 +29,19 @@
 #include "vector.h"
 
 #if defined(HAVE_CUDA) && defined(DEBUG)
-  #include "cuda/cuda_validation.h"
+#include "cuda/cuda_validation.h"
 #endif
 
 #if defined(CG_PERFORMANCE)
 real t_start, t_elapsed, matvec_time, dot_time;
 #endif
 
+enum preconditioner_type
+{
+    LEFT = 0,
+    RIGHT = 1,
+};
+
 
 static void dual_Sparse_MatVec( const sparse_matrix * const A,
         const rvec2 * const x, rvec2 * const b, const int N )
@@ -61,23 +67,23 @@ static void dual_Sparse_MatVec( const sparse_matrix * const A,
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
-            for ( k = si; k < A->end[i]; ++k )
+        for ( k = si; k < A->end[i]; ++k )
 #endif
-            {
-                j = A->entries[k].j;
-                H = A->entries[k].val;
+        {
+            j = A->entries[k].j;
+            H = A->entries[k].val;
 
-                b[i][0] += H * x[j][0];
-                b[i][1] += H * x[j][1];
+            b[i][0] += H * x[j][0];
+            b[i][1] += H * x[j][1];
 
 #if defined(HALF_LIST)
-                // comment out for tryQEq
-                //if( j < A->n ) {
-                b[j][0] += H * x[i][0];
-                b[j][1] += H * x[i][1];
-                //}
+            // comment out for tryQEq
+            //if( j < A->n ) {
+            b[j][0] += H * x[i][0];
+            b[j][1] += H * x[i][1];
+            //}
 #endif
-            }
+        }
     }
 }
 
@@ -92,20 +98,1019 @@ real diag_pre_comp( const reax_system * const system, real * const Hdia_inv )
 
     for ( i = 0; i < system->n; ++i )
     {
-//        if ( H->entries[H->start[i + 1] - 1].val != 0.0 )
-//        {
-//            Hdia_inv[i] = 1.0 / H->entries[H->start[i + 1] - 1].val;
-        Hdia_inv[i] = 1.0 / system->reax_param.sbp[ system->my_atoms[i].type ].eta;
-//        }
-//        else
-//        {
-//            Hdia_inv[i] = 1.0;
-//        }
+        //        if ( H->entries[H->start[i + 1] - 1].val != 0.0 )
+        //        {
+        //            Hdia_inv[i] = 1.0 / H->entries[H->start[i + 1] - 1].val;
+        Hdia_inv[i] = 1.0 / system->reax_param.sbp[ system->my_atoms[i].type ].eta;
+        //        }
+        //        else
+        //        {
+        //            Hdia_inv[i] = 1.0;
+        //        }
     }
 
     return Get_Timing_Info( start );
 }
 
+
+int compare_dbls( const void* arg1, const void* arg2 )
+{
+    int ret;
+    double a1, a2;
+
+    a1 = *(double *) arg1;
+    a2 = *(double *) arg2;
+
+    if ( a1 < a2 )
+    {
+        ret = -1;
+    }
+    else if ( a1 == a2 )
+    {
+        ret = 0;
+    }
+    else
+    {
+        ret = 1;
+    }
+
+    return ret;
+}
+
+
+void qsort_dbls( double *array, int array_len )
+{
+    qsort( array, (size_t) array_len, sizeof(double), compare_dbls );
+}
+
+
+int find_bucket( double *list, int len, double a )
+{
+    int s, e, m;
+
+    if ( a > list[len - 1] )
+    {
+        return len;
+    }
+
+    s = 0;
+    e = len - 1;
+
+    while ( s < e )
+    {
+        m = (s + e) / 2;
+
+        if ( list[m] < a )
+        {
+            s = m + 1;
+        }
+        else
+        {
+            e = m;
+        }
+    }
+
+    return s;
+}
+
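find_bucket() is a lower-bound binary search over the sorted pivot list: it returns the index of the first bucket whose pivot is >= a, and len when a lies beyond the last pivot. A standalone check of those semantics (the pivot values are illustrative):

```c
/* Standalone check of find_bucket() semantics, compiled outside the tree;
 * the function body below mirrors the one in the patch. */
#include <stdio.h>

static int find_bucket( double *list, int len, double a )
{
    int s = 0, e = len - 1, m;

    if ( a > list[len - 1] )
    {
        return len;
    }
    while ( s < e )
    {
        m = (s + e) / 2;
        if ( list[m] < a )
        {
            s = m + 1;
        }
        else
        {
            e = m;
        }
    }
    return s;
}

int main( void )
{
    double pivots[3] = { 0.25, 0.5, 0.75 };  /* nprocs - 1 = 3 splitters */

    printf( "%d %d %d %d\n",
            find_bucket( pivots, 3, 0.1 ),   /* 0 */
            find_bucket( pivots, 3, 0.3 ),   /* 1 */
            find_bucket( pivots, 3, 0.6 ),   /* 2 */
            find_bucket( pivots, 3, 0.9 ) ); /* 3 */
    return 0;
}
```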
+
+void setup_sparse_approx_inverse( reax_system *system, storage *workspace, mpi_datatypes* mpi_data,
+        sparse_matrix *A, sparse_matrix **A_spar_patt, const int nprocs, const double filter )
+{
+    int i, bin, total, pos;
+    int n, m, s_local, s, n_local;
+    int target_proc;
+    int k;
+    int pj, size;
+    int left, right, p, turn;
+
+    real threshold, pivot, tmp;
+    real *input_array;
+    real *samplelist_local, *samplelist;
+    real *pivotlist;
+    real *bucketlist_local, *bucketlist;
+    real *local_entries;
+
+    int *scounts_local, *scounts;
+    int *dspls_local, *dspls;
+    int *bin_elements;
+
+    MPI_Comm comm;
+
+    comm = mpi_data->world;
+
+    if ( *A_spar_patt == NULL )
+    {
+        Allocate_Matrix( A_spar_patt, A->n, A->n, A->m );
+    }
+    else if ( ((*A_spar_patt)->m) < (A->m) )
+    {
+        Deallocate_Matrix( *A_spar_patt );
+        Allocate_Matrix( A_spar_patt, A->n, A->n, A->m );
+    }
+
+    m = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+        m += A->end[i] - A->start[i];
+    }
+
+    local_entries = (real *) malloc( sizeof(real) * m );
+    m = 0;
+    for ( i = 0; i < A->n; ++i )
+    {
+        for ( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            local_entries[m++] = A->entries[pj].val;
+        }
+    }
+
+    /* the sample ratio is 10% */
+    n_local = m / 10.0;
+
+    input_array = (real *) malloc( sizeof(real) * n_local );
+
+    for ( i = 0; i < n_local; i++ )
+    {
+        input_array[i] = local_entries[rand( ) % m];
+    }
+
+    s_local = (int) (12.0 * log2( n_local * nprocs ));
+
+    /* n_local and s_local are ints, so reduce them as MPI_INT */
+    MPI_Reduce( &n_local, &n, 1, MPI_INT, MPI_SUM, MASTER_NODE, comm );
+    MPI_Reduce( &s_local, &s, 1, MPI_INT, MPI_SUM, MASTER_NODE, comm );
+
+    samplelist_local = (real *) malloc( sizeof(real) * s_local );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        samplelist = (real *) malloc( sizeof(real) * s );
+    }
+
+    for ( i = 0; i < s_local; i++ )
+    {
+        samplelist_local[i] = input_array[rand( ) % n_local];
+    }
+
+    /* gather samples at the root process */
+    MPI_Gather( samplelist_local, s_local, MPI_DOUBLE, samplelist, s_local, MPI_DOUBLE, MASTER_NODE, comm );
+
+    /* sort samples at the root process and select pivots */
+    pivotlist = (real *) malloc( sizeof(real) * (nprocs - 1) );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        qsort_dbls( samplelist, s );
+
+        for ( i = 1; i < nprocs; i++ )
+        {
+            pivotlist[i - 1] = samplelist[(i * s) / nprocs];
+        }
+    }
+
+    /* broadcast pivots */
+    MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, comm );
+
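For reference, the splitter selection performed at MASTER_NODE reduces to: sort the gathered samples, then take every (s/nprocs)-th element as a pivot. A single-process sketch with made-up sample values:

```c
/* Single-process sketch of the splitter selection done at MASTER_NODE:
 * sort the gathered samples, then pick samplelist[(i * s) / nprocs] as
 * pivot i - 1, exactly as the patch does. Values are illustrative. */
#include <stdio.h>
#include <stdlib.h>

static int cmp_dbl( const void *a, const void *b )
{
    double x = *(const double *) a, y = *(const double *) b;
    return (x > y) - (x < y);
}

int main( void )
{
    double samples[8] = { 0.9, 0.1, 0.4, 0.7, 0.2, 0.8, 0.3, 0.6 };
    double pivots[3];
    int i, s = 8, nprocs = 4;

    qsort( samples, (size_t) s, sizeof(double), cmp_dbl );

    for ( i = 1; i < nprocs; i++ )
    {
        pivots[i - 1] = samples[(i * s) / nprocs];
    }

    printf( "%g %g %g\n", pivots[0], pivots[1], pivots[2] ); /* 0.3 0.6 0.8 */
    return 0;
}
```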
+    /* count num. bin elements for each processor, uniform bin sizes;
+     * calloc so the counts start at zero */
+    scounts_local = (int *) calloc( nprocs, sizeof(int) );
+    for ( i = 0; i < n_local; i++ )
+    {
+        pos = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        scounts_local[pos]++;
+    }
+
+    scounts = (int *) malloc( sizeof(int) * nprocs );
+    bin_elements = (int *) malloc( sizeof(int) * nprocs );
+
+    for ( i = 0; i < nprocs; ++i )
+    {
+        bin_elements[i] = scounts_local[i];
+        scounts[i] = scounts_local[i];
+    }
+
+    /* compute displacements for MPI comm */
+    dspls_local = (int *) malloc( sizeof(int) * nprocs );
+
+    dspls_local[0] = 0;
+    for ( i = 0; i < nprocs - 1; i++ )
+    {
+        dspls_local[i + 1] = dspls_local[i] + scounts_local[i];
+    }
+
+    /* bin elements */
+    bucketlist_local = (real *) malloc( sizeof(real) * n_local );
+
+    for ( i = 0; i < n_local; ++i )
+    {
+        bin = find_bucket( pivotlist, nprocs - 1, input_array[i] );
+        pos = dspls_local[bin] + scounts_local[bin] - bin_elements[bin];
+        bucketlist_local[pos] = input_array[i];
+        bin_elements[bin]--;
+    }
+
+    /* determine counts for elements per process */
+    MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, comm );
+
+    /* find the target process, scanning buckets from the largest values down */
+    target_proc = 0;
+    total = 0;
+    k = n * filter;
+
+    for ( i = nprocs - 1; i >= 0; --i )
+    {
+        if ( total + scounts[i] >= k )
+        {
+            /* global k becomes local k */
+            k -= total;
+            target_proc = i;
+            break;
+        }
+        total += scounts[i];
+    }
+
+    /* send local buckets to target processor for quickselect */
+    dspls = (int *) malloc( nprocs * sizeof(int) );
+
+    MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts, 1, MPI_INT, target_proc, comm );
+
+    if ( system->my_rank == target_proc )
+    {
+        dspls[0] = 0;
+        for ( i = 0; i < nprocs - 1; ++i )
+        {
+            dspls[i + 1] = dspls[i] + scounts[i];
+        }
+
+        /* total number of elements gathered into the target bucket */
+        total = dspls[nprocs - 1] + scounts[nprocs - 1];
+        bucketlist = (real *) malloc( sizeof(real) * total );
+    }
+
+    MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE,
+            bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, comm );
+
+    /* apply the quickselect algorithm at the target process */
+    if ( system->my_rank == target_proc )
+    {
+        left = 0;
+        right = total - 1;
+
+        turn = 0;
+        while ( k )
+        {
+            p = left;
+            turn = 1 - turn;
+
+            if ( turn == 1 )
+            {
+                pivot = bucketlist[right];
+            }
+            else
+            {
+                pivot = bucketlist[left];
+            }
+
+            for ( i = left + 1 - turn; i <= right - turn; ++i )
+            {
+                if ( bucketlist[i] > pivot )
+                {
+                    tmp = bucketlist[i];
+                    bucketlist[i] = bucketlist[p];
+                    bucketlist[p] = tmp;
+                    p++;
+                }
+            }
+
+            if ( turn == 1 )
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[right];
+                bucketlist[right] = tmp;
+            }
+            else
+            {
+                tmp = bucketlist[p];
+                bucketlist[p] = bucketlist[left];
+                bucketlist[left] = tmp;
+            }
+
+            if ( p == k - 1 )
+            {
+                threshold = bucketlist[p];
+                break;
+            }
+            else if ( p > k - 1 )
+            {
+                right = p - 1;
+            }
+            else
+            {
+                left = p + 1;
+            }
+        }
+
+        if ( threshold < 1.000000 )
+        {
+            threshold = 1.000001;
+        }
+    }
+
+    /* broadcast the filtering value */
+    MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, comm );
+
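The alternating-end partitioning above is hard to eyeball, so here is a compact reference quickselect for the k-th largest element (1-based k) that the target-process selection is equivalent to; values are illustrative:

```c
/* Reference quickselect for the k-th largest element (1-based k), to make
 * the in-place partitioning in the patch easier to verify. Standalone. */
#include <stdio.h>

static void swap( double *a, double *b )
{
    double t = *a; *a = *b; *b = t;
}

static double kth_largest( double *v, int left, int right, int k )
{
    int i, p;
    double pivot;

    for ( ; ; )
    {
        pivot = v[right];
        p = left;
        /* move everything larger than the pivot to the front */
        for ( i = left; i < right; ++i )
        {
            if ( v[i] > pivot )
            {
                swap( &v[i], &v[p] );
                p++;
            }
        }
        swap( &v[p], &v[right] );

        if ( p == k - 1 )
        {
            return v[p];
        }
        else if ( p > k - 1 )
        {
            right = p - 1;
        }
        else
        {
            left = p + 1;
        }
    }
}

int main( void )
{
    double v[6] = { 0.4, 0.9, 0.1, 0.7, 0.3, 0.8 };

    printf( "%g\n", kth_largest( v, 0, 5, 2 ) ); /* 0.8 */
    return 0;
}
```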
+    /* build entries of that pattern */
+    for ( i = 0; i < A->n; ++i )
+    {
+        (*A_spar_patt)->start[i] = A->start[i];
+        size = A->start[i];
+
+        for ( pj = A->start[i]; pj < A->end[i]; ++pj )
+        {
+            if ( ( A->entries[pj].val >= threshold ) || ( A->entries[pj].j == i ) )
+            {
+                (*A_spar_patt)->entries[size].val = A->entries[pj].val;
+                (*A_spar_patt)->entries[size].j = A->entries[pj].j;
+                size++;
+            }
+        }
+        (*A_spar_patt)->end[i] = size;
+    }
+    (*A_spar_patt)->start[A->n] = A->start[A->n];
+    /* TODO: check if end[N] is set equal to NNZ as start[N] */
+    (*A_spar_patt)->end[A->n] = A->end[A->n];
+}
+
+
+real sparse_approx_inverse( reax_system *system, storage *workspace,
+        mpi_datatypes* mpi_data, const sparse_matrix * const A,
+        const sparse_matrix * const A_spar_patt, sparse_matrix **A_app_inv )
+{
+    int i, k, pj, j_temp, identity_pos;
+    int N, M, d_i, d_j;
+    lapack_int m, n, nrhs, lda, ldb, info;
+    int *pos_x, *pos_y;
+    real start;
+    real *e_j, *dense_matrix;
+    char *X, *Y;
+
+    int cnt;
+    reax_atom *atom;
+    int *mark, *row_needed, *row_nnz;
+    int **j_list;
+    real **val_list;
+
+    int d;
+    mpi_out_data *out_bufs;
+    MPI_Comm comm;
+    MPI_Request req1, req2, req3, req4;
+    MPI_Status stat1, stat2, stat3, stat4;
+    neighbor_proc *nbr1, *nbr2;
+    int *j_send, *j_recv1, *j_recv2;
+    real *val_send, *val_recv1, *val_recv2;
+
+    /* return the wall time spent here, as the other pre_comp routines do */
+    start = Get_Time( );
+
+    mark = (int *) malloc( sizeof(int) * system->N );
+    row_needed = (int *) malloc( sizeof(int) * system->N );
+    row_nnz = (int *) malloc( sizeof(int) * system->N );
+
+    j_list = (int **) malloc( sizeof(int *) * system->N );
+    val_list = (real **) malloc( sizeof(real *) * system->N );
+
+    for ( i = 0; i < system->N; ++i )
+    {
+        mark[i] = -1;
+        row_needed[i] = -1;
+    }
+
+    /* mark the atoms that already have their row stored in the local matrix */
+    for ( i = 0; i < system->n; ++i )
+    {
+        atom = &system->my_atoms[i];
+        mark[ atom->orig_id ] = i;
+    }
+
+    /* find the atoms that are not marked but needed,
+     * meaning we need to communicate their row */
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            atom = &system->my_atoms[ A_spar_patt->entries[pj].j ];
+
+            if ( mark[ atom->orig_id ] == -1 )
+            {
+                row_needed[ A_spar_patt->entries[pj].j ] = atom->orig_id;
+            }
+        }
+    }
+
+    /* distribute the row numbers that are needed for the dense matrix */
+    Dist( system, mpi_data, row_needed, INT_PTR_TYPE, MPI_INT );
+
+    /* fill in the nnz of the rows that will be collected by other processes */
+    for ( i = 0; i < system->N; ++i )
+    {
+        if ( row_needed[i] != -1 && mark[ row_needed[i] ] != -1 )
+        {
+            row_nnz[i] = A->end[ mark[ row_needed[i] ] ] - A->start[ mark[ row_needed[i] ] ];
+        }
+    }
+
+    /* announce the nnz's in each row to allocate space */
+    Coll( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT );
+
+    comm = mpi_data->comm_mesh3D;
+    out_bufs = mpi_data->out_buffers;
+
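The exchange below follows the usual post-receive-first pattern: MPI_Irecv, then MPI_Send, then MPI_Wait, so matching variable-length messages cannot deadlock. A minimal two-rank sketch of the same pattern:

```c
/* Minimal sketch of the exchange pattern used in the row communication:
 * post the receive first, send, then wait on the request.
 * Run with: mpirun -np 2 ./a.out */
#include <mpi.h>
#include <stdio.h>

int main( int argc, char *argv[] )
{
    int rank, peer, send_val, recv_val;
    MPI_Request req;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    peer = 1 - rank;
    send_val = 100 + rank;

    MPI_Irecv( &recv_val, 1, MPI_INT, peer, 0, MPI_COMM_WORLD, &req );
    MPI_Send( &send_val, 1, MPI_INT, peer, 0, MPI_COMM_WORLD );
    MPI_Wait( &req, MPI_STATUS_IGNORE );

    printf( "rank %d received %d\n", rank, recv_val );
    MPI_Finalize( );
    return 0;
}
```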
+    for ( d = 2; d >= 0; --d )
+    {
+        /* initiate recvs */
+        nbr1 = &system->my_nbrs[2 * d];
+
+        if ( out_bufs[2 * d].cnt )
+        {
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d].index[i] ];
+            }
+
+            j_recv1 = (int *) malloc( sizeof(int) * cnt );
+            val_recv1 = (real *) malloc( sizeof(real) * cnt );
+
+            MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, comm, &req1 );
+            MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, comm, &req2 );
+        }
+
+        nbr2 = &system->my_nbrs[2 * d + 1];
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ];
+            }
+
+            j_recv2 = (int *) malloc( sizeof(int) * cnt );
+            val_recv2 = (real *) malloc( sizeof(real) * cnt );
+
+            MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, comm, &req3 );
+            MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, comm, &req4 );
+        }
+
+        /* send both messages in dimension d */
+        if ( nbr1->atoms_cnt )
+        {
+            cnt = 0;
+            for ( i = nbr1->atoms_str; i < nbr1->atoms_str + nbr1->atoms_cnt; ++i )
+            {
+                atom = &system->my_atoms[i];
+
+                if ( mark[ atom->orig_id ] != -1 )
+                {
+                    cnt += A->end[ mark[ atom->orig_id ] ] - A->start[ mark[ atom->orig_id ] ];
+                }
+                else
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            j_send = (int *) malloc( sizeof(int) * cnt );
+            val_send = (real *) malloc( sizeof(real) * cnt );
+
+            cnt = 0;
+            for ( i = nbr1->atoms_str; i < nbr1->atoms_str + nbr1->atoms_cnt; ++i )
+            {
+                atom = &system->my_atoms[i];
+
+                if ( mark[ atom->orig_id ] != -1 )
+                {
+                    for ( pj = A->start[ mark[ atom->orig_id ] ]; pj < A->end[ mark[ atom->orig_id ] ]; ++pj )
+                    {
+                        j_send[cnt] = A->entries[pj].j;
+                        val_send[cnt] = A->entries[pj].val;
+                        cnt++;
+                    }
+                }
+                else
+                {
+                    for ( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_send[cnt] = j_list[i][pj];
+                        val_send[cnt] = val_list[i][pj];
+                        cnt++;
+                    }
+                }
+            }
+
+            MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, comm );
+            MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, comm );
+        }
+
+        if ( nbr2->atoms_cnt )
+        {
+            cnt = 0;
+            for ( i = nbr2->atoms_str; i < nbr2->atoms_str + nbr2->atoms_cnt; ++i )
+            {
+                atom = &system->my_atoms[i];
+
+                if ( mark[ atom->orig_id ] != -1 )
+                {
+                    cnt += A->end[ mark[ atom->orig_id ] ] - A->start[ mark[ atom->orig_id ] ];
+                }
+                else
+                {
+                    cnt += row_nnz[i];
+                }
+            }
+
+            j_send = (int *) malloc( sizeof(int) * cnt );
+            val_send = (real *) malloc( sizeof(real) * cnt );
+
+            cnt = 0;
+            for ( i = nbr2->atoms_str; i < nbr2->atoms_str + nbr2->atoms_cnt; ++i )
+            {
+                atom = &system->my_atoms[i];
+
+                if ( mark[ atom->orig_id ] != -1 )
+                {
+                    for ( pj = A->start[ mark[ atom->orig_id ] ]; pj < A->end[ mark[ atom->orig_id ] ]; ++pj )
+                    {
+                        j_send[cnt] = A->entries[pj].j;
+                        val_send[cnt] = A->entries[pj].val;
+                        cnt++;
+                    }
+                }
+                else
+                {
+                    for ( pj = 0; pj < row_nnz[i]; ++pj )
+                    {
+                        j_send[cnt] = j_list[i][pj];
+                        val_send[cnt] = val_list[i][pj];
+                        cnt++;
+                    }
+                }
+            }
+
+            MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, comm );
+            MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, comm );
+        }
+
+        if ( out_bufs[2 * d].cnt )
+        {
+            MPI_Wait( &req1, &stat1 );
+            MPI_Wait( &req2, &stat2 );
+
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d].cnt; ++i )
+            {
+                j_list[ out_bufs[2 * d].index[i] ] = (int *) malloc( sizeof(int) * row_nnz[ out_bufs[2 * d].index[i] ] );
+                val_list[ out_bufs[2 * d].index[i] ] = (real *) malloc( sizeof(real) * row_nnz[ out_bufs[2 * d].index[i] ] );
+
+                for ( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj )
+                {
+                    j_list[ out_bufs[2 * d].index[i] ][pj] = j_recv1[cnt];
+                    val_list[ out_bufs[2 * d].index[i] ][pj] = val_recv1[cnt];
+                    cnt++;
+                }
+            }
+        }
+
+        if ( out_bufs[2 * d + 1].cnt )
+        {
+            MPI_Wait( &req3, &stat3 );
+            MPI_Wait( &req4, &stat4 );
+
+            cnt = 0;
+            for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i )
+            {
+                /* allocate the row storage before unpacking, as in the
+                 * first dimension branch above */
+                j_list[ out_bufs[2 * d + 1].index[i] ] = (int *) malloc( sizeof(int) * row_nnz[ out_bufs[2 * d + 1].index[i] ] );
+                val_list[ out_bufs[2 * d + 1].index[i] ] = (real *) malloc( sizeof(real) * row_nnz[ out_bufs[2 * d + 1].index[i] ] );
+
+                for ( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj )
+                {
+                    j_list[ out_bufs[2 * d + 1].index[i] ][pj] = j_recv2[cnt];
+                    val_list[ out_bufs[2 * d + 1].index[i] ][pj] = val_recv2[cnt];
+                    cnt++;
+                }
+            }
+        }
+    }
+
+    (*A_app_inv)->start[(*A_app_inv)->n] = A_spar_patt->start[A_spar_patt->n];
+
+    X = (char *) malloc( sizeof(char) * A->n );
+    Y = (char *) malloc( sizeof(char) * A->n );
+    pos_x = (int *) malloc( sizeof(int) * A->n );
+    pos_y = (int *) malloc( sizeof(int) * A->n );
+
+    for ( i = 0; i < A->n; ++i )
+    {
+        X[i] = 0;
+        Y[i] = 0;
+        pos_x[i] = 0;
+        pos_y[i] = 0;
+    }
+
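The X/Y/pos_x/pos_y bookkeeping in the loop that follows compresses the global row and column indices touched by one pattern row into consecutive dense positions (pos_x maps dense row to global index, pos_y maps global column to dense position). A toy version of that compression with illustrative indices:

```c
/* Toy version of the X/Y/pos bookkeeping: mark the global indices that
 * occur in one sparse row, then enumerate the marked ones from
 * 0 .. (#marked - 1) as dense positions. Indices are illustrative. */
#include <stdio.h>

int main( void )
{
    int cols[4] = { 2, 5, 9, 11 };   /* global column indices of one row */
    int seen[12] = { 0 };            /* X/Y role: which globals occur */
    int pos[12];                     /* pos_y role: global -> dense */
    int i, n = 0;

    for ( i = 0; i < 4; ++i )
    {
        seen[ cols[i] ] = 1;
    }
    for ( i = 0; i < 12; ++i )
    {
        if ( seen[i] )
        {
            pos[i] = n++;
        }
    }

    printf( "global 9 -> dense %d of %d\n", pos[9], n ); /* 2 of 4 */
    return 0;
}
```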
+    for ( i = 0; i < A_spar_patt->n; ++i )
+    {
+        N = 0;
+        M = 0;
+
+        /* find column indices of nonzeros
+         * (which will be the column indices of the dense matrix) */
+        for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj )
+        {
+            j_temp = A_spar_patt->entries[pj].j;
+
+            Y[j_temp] = 1;
+            pos_y[j_temp] = N;
+            ++N;
+
+            /* for each of those indices,
+             * search through the row of full A of that index */
+
+            /* the case where the local matrix has that index's row */
+            if ( mark[j_temp] != -1 )
+            {
+                for ( k = A->start[ mark[j_temp] ]; k < A->end[ mark[j_temp] ]; ++k )
+                {
+                    /* accumulate the nonzero column indices to serve as
+                     * the row indices of the dense matrix */
+                    X[ A->entries[k].j ] = 1;
+                }
+            }
+            /* the case where we communicated that index's row */
+            else
+            {
+                for ( k = 0; k < row_nnz[j_temp]; ++k )
+                {
+                    /* accumulate the nonzero column indices to serve as
+                     * the row indices of the dense matrix */
+                    X[ j_list[j_temp][k] ] = 1;
+                }
+            }
+        }
+
+        /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */
+        identity_pos = M;
+        for ( k = 0; k < A->n; k++ )
+        {
+            if ( X[k] != 0 )
+            {
+                pos_x[M] = k;
+                if ( k == i )
+                {
+                    identity_pos = M;
+                }
+                ++M;
+            }
+        }
+
+        /* allocate memory for the MxN dense matrix */
+        dense_matrix = (real *) malloc( sizeof(real) * N * M );
+
+        /* fill in the entries of the dense matrix */
+        for ( d_i = 0; d_i < M; ++d_i )
+        {
+            /* all rows are initialized to zero */
+            for ( d_j = 0; d_j < N; ++d_j )
+            {
+                dense_matrix[d_i * N + d_j] = 0.0;
+            }
+
+            /* change the value if any of the column indices is seen */
+
+            /* it is in the original list */
+            if ( mark[ pos_x[d_i] ] != -1 )
+            {
+                for ( d_j = A->start[ mark[ pos_x[d_i] ] ]; d_j < A->end[ mark[ pos_x[d_i] ] ]; ++d_j )
+                {
+                    if ( Y[ A->entries[d_j].j ] == 1 )
+                    {
+                        dense_matrix[d_i * N + pos_y[ A->entries[d_j].j ]] = A->entries[d_j].val;
+                    }
+                }
+            }
+            /* communicated */
+            else
+            {
+                for ( d_j = 0; d_j < row_nnz[ pos_x[d_i] ]; ++d_j )
+                {
+                    if ( Y[ j_list[ pos_x[d_i] ][d_j] ] == 1 )
+                    {
+                        dense_matrix[d_i * N + pos_y[ j_list[ pos_x[d_i] ][d_j] ]] = val_list[ pos_x[d_i] ][d_j];
+                    }
+                }
+            }
+        }
+
+        /* create the right-hand side of the linear equation,
+         * i.e., the full column of the identity matrix */
+        e_j = (real *) malloc( sizeof(real) * M );
+
+        for ( k = 0; k < M; ++k )
+        {
+            e_j[k] = 0.0;
+        }
+        e_j[identity_pos] = 1.0;
+
+        /* solve the overdetermined system AX = B through the least-squares problem:
+         * min ||B - AX||_2 */
+        m = M;
+        n = N;
+        nrhs = 1;
+        lda = N;
+        ldb = nrhs;
+        info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda,
+                e_j, ldb );
+
+        /* check for the full rank */
+        if ( info > 0 )
+        {
+            /*fprintf( stderr, "The diagonal element %i of the triangular factor ", info );
+             * fprintf( stderr, "of A is zero, so that A does not have full rank;\n" );
+             * fprintf( stderr, "the least squares solution could not be computed.\n" );
+             * exit( INVALID_INPUT );*/
+
+            /* TODO: print some error and exit */
+        }
+
+        /* print the least squares solution */
+        /* print_matrix( "Least squares solution", n, nrhs, b, ldb ); */
+
+        /* accumulate the resulting vector to build A_app_inv */
+        (*A_app_inv)->start[i] = A_spar_patt->start[i];
+        (*A_app_inv)->end[i] = A_spar_patt->end[i];
+        for ( k = A_spar_patt->start[i]; k < A_spar_patt->end[i]; ++k )
+        {
+            (*A_app_inv)->entries[k].j = A_spar_patt->entries[k].j;
+            (*A_app_inv)->entries[k].val = e_j[k - A_spar_patt->start[i]];
+        }
+
+        /* empty variables that will be used next iteration */
+        free( dense_matrix );
+        free( e_j );
+        for ( k = 0; k < A->n; ++k )
+        {
+            X[k] = 0;
+            Y[k] = 0;
+            pos_x[k] = 0;
+            pos_y[k] = 0;
+        }
+    }
+
+    free( pos_y );
+    free( pos_x );
+    free( Y );
+    free( X );
+
+    return Get_Timing_Info( start );
+}
+
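Each pattern row is recovered through LAPACKE_dgels on a small row-major dense system. A standalone check with a 3x2 example (assumes the LAPACKE headers and -llapacke at link time); the expected minimizer of ||e_j - D x||_2 is x = (2/3, -1/3):

```c
/* Standalone least-squares check mirroring the per-row solve:
 * min ||e_j - D x||_2 for a small row-major D.
 * Build with: cc dgels_demo.c -llapacke -llapack -lblas */
#include <stdio.h>
#include <lapacke.h>

int main( void )
{
    /* 3x2 overdetermined system, stored row-major */
    double D[6] = { 1.0, 0.0,
                    1.0, 1.0,
                    0.0, 1.0 };
    double e_j[3] = { 1.0, 0.0, 0.0 };  /* identity column as the rhs */
    lapack_int info;

    /* m = 3, n = 2, nrhs = 1, lda = 2, ldb = 1 (row-major) */
    info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', 3, 2, 1, D, 2, e_j, 1 );
    if ( info != 0 )
    {
        fprintf( stderr, "dgels failed: %d\n", (int) info );
        return 1;
    }

    /* the first n = 2 entries of the rhs now hold the solution */
    printf( "x = [%g, %g]\n", e_j[0], e_j[1] ); /* 0.666667, -0.333333 */
    return 0;
}
```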
+
+static void diag_pre_app( const real * const Hdia_inv, const real * const y,
+        real * const x, const int N )
+{
+    unsigned int i;
+
+#ifdef _OPENMP
+    #pragma omp for schedule(static)
+#endif
+    for ( i = 0; i < N; ++i )
+    {
+        x[i] = y[i] * Hdia_inv[i];
+    }
+}
+
+
+static void apply_preconditioner( const reax_system * const system, const storage * const workspace,
+        const control_params * const control, const real * const y, real * const x,
+        const int fresh_pre, const int side )
+{
+    int i, si;
+
+    fprintf( stdout, "apply_preconditioner working\n" );
+    fflush( stdout );
+
+    /* no preconditioning */
+    if ( control->cm_solver_pre_comp_type == NONE_PC )
+    {
+        if ( x != y )
+        {
+            Vector_Copy( x, y, system->n );
+        }
+    }
+    else
+    {
+        switch ( side )
+        {
+            case LEFT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                                diag_pre_app( workspace->Hdia_inv, y, x, system->n );
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                tri_solve( workspace->L, y, x, workspace->L->n, LOWER );
+                                break;*/
+                            case SAI_PC:
+                                Sparse_MatVec( workspace->H_app_inv, y, x, system->n );
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                                diag_pre_app( workspace->Hdia_inv, y, x, system->n );
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                tri_solve_level_sched( (static_storage *) workspace,
+                                        workspace->L, y, x, workspace->L->n, LOWER, fresh_pre );
+                                break;*/
+                            case SAI_PC:
+                                Sparse_MatVec( workspace->H_app_inv, y, x, system->n );
+                                break;
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                for ( i = 0; i < workspace->H->n; ++i )
+                                {
+                                    workspace->y_p[i] = y[i];
+                                }
+
+                                permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER );
+                                tri_solve_level_sched( (static_storage *) workspace,
+                                        workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                // construct D^{-1}_L
+                                if ( fresh_pre == TRUE )
+                                {
+                                    for ( i = 0; i < workspace->L->n; ++i )
+                                    {
+                                        si = workspace->L->start[i + 1] - 1;
+                                        workspace->Dinv_L[i] = 1.0 / workspace->L->val[si];
+                                    }
+                                }
+
+                                jacobi_iter( workspace, workspace->L, workspace->Dinv_L,
+                                        y, x, LOWER, control->cm_solver_pre_app_jacobi_iters );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+                }
+                break;
+
+            case RIGHT:
+                switch ( control->cm_solver_pre_app_type )
+                {
+                    case TRI_SOLVE_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy( x, y, system->n );
+                                }
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                tri_solve( workspace->U, y, x, workspace->U->n, UPPER );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_LEVEL_SCHED_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                if ( x != y )
+                                {
+                                    Vector_Copy( x, y, system->n );
+                                }
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                tri_solve_level_sched( (static_storage *) workspace,
+                                        workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case TRI_SOLVE_GC_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                tri_solve_level_sched( (static_storage *) workspace,
+                                        workspace->U, y, x, workspace->U->n, UPPER, fresh_pre );
+                                permute_vector( workspace, x, workspace->H->n, TRUE, UPPER );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    case JACOBI_ITER_PA:
+                        switch ( control->cm_solver_pre_comp_type )
+                        {
+                            case DIAG_PC:
+                            case SAI_PC:
+                                fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                            /*case ICHOLT_PC:
+                            case ILU_PAR_PC:
+                            case ILUT_PAR_PC:
+                                if ( fresh_pre == TRUE )
+                                {
+                                    for ( i = 0; i < workspace->U->n; ++i )
+                                    {
+                                        si = workspace->U->start[i];
+                                        workspace->Dinv_U[i] = 1.0 / workspace->U->val[si];
+                                    }
+                                }
+
+                                jacobi_iter( workspace, workspace->U, workspace->Dinv_U,
+                                        y, x, UPPER, control->cm_solver_pre_app_jacobi_iters );
+                                break;*/
+                            default:
+                                fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                                exit( INVALID_INPUT );
+                                break;
+                        }
+                        break;
+                    default:
+                        fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" );
+                        exit( INVALID_INPUT );
+                        break;
+                }
+                break;
+        }
+    }
+}
+
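With DIAG_PC and SAI_PC the RIGHT side degenerates to a copy, so the split preconditioner M = M_L * M_R reduces to z = inv(M_L) r. A minimal sketch of the two-stage calling convention the solvers use (diagonal left factor, identity right factor; values are illustrative):

```c
/* Minimal sketch of the LEFT/RIGHT application convention: the solver
 * always makes both calls, and a side with no work degenerates to a
 * copy, exactly as DIAG_PC/SAI_PC do on the RIGHT above. */
#include <stdio.h>
#include <string.h>

#define N 3

static void apply_left( const double *Minv_diag, const double *y, double *x )
{
    int i;
    for ( i = 0; i < N; ++i )
    {
        x[i] = Minv_diag[i] * y[i];  /* e.g. Jacobi: x = D^{-1} y */
    }
}

static void apply_right( const double *y, double *x )
{
    memcpy( x, y, N * sizeof(double) );  /* identity: no right factor */
}

int main( void )
{
    double Minv[N] = { 0.5, 0.25, 1.0 };
    double r[N] = { 2.0, 4.0, 3.0 }, d[N], z[N];

    apply_left( Minv, r, d );
    apply_right( d, z );   /* z = inv(M_R) (inv(M_L) r) */

    printf( "%g %g %g\n", z[0], z[1], z[2] ); /* 1 1 3 */
    return 0;
}
```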
 int dual_CG( const reax_system * const system, const control_params * const control,
         const storage * const workspace, const simulation_data * const data,
@@ -113,17 +1118,29 @@ int dual_CG( const reax_system * const system, const control_params * const cont
         const sparse_matrix * const H, const rvec2 * const b, const real tol,
         rvec2 * const x, const int fresh_pre )
 {
-    int i, j, n, N, iters;
+    int i, j, k, n, N, iters;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
     rvec2 sig_old, sig_new;
     MPI_Comm comm;
+    real *d, *r, *p, *z;
+
+    fprintf( stdout, "dual_cg working\n" );
+    fflush( stdout );
 
     n = system->n;
     N = system->N;
     comm = mpi_data->world;
     iters = 0;
 
+    d = (real *) malloc( sizeof(real) * n );
+    r = (real *) malloc( sizeof(real) * n );
+    p = (real *) malloc( sizeof(real) * n );
+    z = (real *) malloc( sizeof(real) * n );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
     {
@@ -145,7 +1162,7 @@ int dual_CG( const reax_system * const system, const control_params * const cont
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
 
-//    if (data->step > 0) return;
+    //    if (data->step > 0) return;
 
 #if defined(HALF_LIST)
     // tryQEq
@@ -165,8 +1182,36 @@ int dual_CG( const reax_system * const system, const control_params * const cont
         workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
 
         /* apply diagonal pre-conditioner */
-        workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-        workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+        /*workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+        workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];*/
+    }
+
+    /* apply the preconditioner for both systems */
+    for ( j = 0; j < n; ++j )
+    {
+        r[j] = workspace->r2[j][0];
+        d[j] = workspace->d2[j][0];
+    }
+
+    apply_preconditioner( system, workspace, control, r, d, fresh_pre, LEFT );
+
+    for ( j = 0; j < n; ++j )
+    {
+        workspace->d2[j][0] = d[j];
+    }
+
+    for ( j = 0; j < n; ++j )
+    {
+        r[j] = workspace->r2[j][1];
+        d[j] = workspace->d2[j][1];
+    }
+
+    apply_preconditioner( system, workspace, control, r, d, fresh_pre, LEFT );
+
+    for ( j = 0; j < n; ++j )
+    {
+        workspace->d2[j][1] = d[j];
     }
 
     //print_host_rvec2 (workspace->r2, n);
@@ -244,9 +1289,41 @@ int dual_CG( const reax_system * const system, const control_params * const cont
             workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
 
             /* apply diagonal pre-conditioner */
-            workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-            workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+            /*workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+            workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];*/
+        }
+
+        /* apply the preconditioner for both systems */
+        for ( j = 0; j < n; ++j )
+        {
+            r[j] = workspace->r2[j][0];
+            z[j] = workspace->p2[j][0];
+        }
+
+        apply_preconditioner( system, workspace, control, r, z, fresh_pre, LEFT );
+
+        for ( j = 0; j < n; ++j )
+        {
+            workspace->p2[j][0] = z[j];
+        }
+
+        for ( j = 0; j < n; ++j )
+        {
+            r[j] = workspace->r2[j][1];
+            z[j] = workspace->p2[j][1];
+        }
+
+        apply_preconditioner( system, workspace, control, r, z, fresh_pre, LEFT );
+
+        for ( j = 0; j < n; ++j )
+        {
+            workspace->p2[j][1] = z[j];
+        }
 
+        for ( j = 0; j < n; ++j )
+        {
             /* dot product: r.p */
             my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
             my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
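The four copy loops above stage each component of the rvec2 arrays through scratch real vectors so the single-vector apply_preconditioner() can be reused. Hypothetical helpers (not in the patch) that would collapse each pair of loops to two calls:

```c
/* Hypothetical helpers (not in the patch) for the rvec2 <-> real staging
 * around apply_preconditioner() in dual_CG; real and rvec2 are typedef'd
 * here to keep the sketch self-contained, matching the codebase's types. */
typedef double real;
typedef real rvec2[2];

static void rvec2_to_real( const rvec2 * const v, real * const s,
        const int c, const int n )
{
    int j;
    for ( j = 0; j < n; ++j )
    {
        s[j] = v[j][c];   /* extract component c */
    }
}

static void real_to_rvec2( const real * const s, rvec2 * const v,
        const int c, const int n )
{
    int j;
    for ( j = 0; j < n; ++j )
    {
        v[j][c] = s[j];   /* write component c back */
    }
}

int main( void )
{
    rvec2 v[2] = { { 1.0, 2.0 }, { 3.0, 4.0 } };
    real s[2];

    rvec2_to_real( v, s, 1, 2 );   /* s = { 2, 4 } */
    real_to_rvec2( s, v, 0, 2 );   /* copy back into component 0 */
    return 0;
}
```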
@@ -338,18 +1415,18 @@ const void Sparse_MatVec( const sparse_matrix * const A, const real * const x,
 #if defined(HALF_LIST)
         for ( k = si + 1; k < A->end[i]; ++k )
 #else
-            for ( k = si; k < A->end[i]; ++k )
+        for ( k = si; k < A->end[i]; ++k )
 #endif
-            {
-                j = A->entries[k].j;
-                val = A->entries[k].val;
+        {
+            j = A->entries[k].j;
+            val = A->entries[k].val;
 
-                b[i] += val * x[j];
+            b[i] += val * x[j];
 #if defined(HALF_LIST)
-                //if( j < A->n ) // comment out for tryQEq
-                b[j] += val * x[i];
+            //if( j < A->n ) // comment out for tryQEq
+            b[j] += val * x[i];
 #endif
-            }
+        }
     }
 }
 
@@ -360,9 +1437,18 @@ int CG( const reax_system * const system, const control_params * const control,
         const sparse_matrix * const H, const real * const b, const real tol,
         real * const x, const int fresh_pre )
 {
     int i, j;
     real tmp, alpha, beta, b_norm;
     real sig_old, sig_new, sig0;
+    //real *d, *r, *p, *z;
+
+    /*d = workspace->d;
+    r = workspace->r;
+    p = workspace->q;
+    z = workspace->p;*/
+
+    fprintf( stdout, "CG working\n" );
+    fflush( stdout );
 
     Dist( system, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE );
     Sparse_MatVec( H, x, workspace->q, system->N );
@@ -382,10 +1468,12 @@ int CG( const reax_system * const system, const control_params * const control,
     Vector_Sum( workspace->r , 1.0, b, -1.0, workspace->q, system->n );
 
     /* preconditioner */
-    for ( j = 0; j < system->n; ++j )
-    {
-        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-    }
+    /*for ( j = 0; j < system->n; ++j )
+    {
+        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+    }*/
+    apply_preconditioner( system, workspace, control, workspace->r, workspace->d, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, workspace->d, workspace->p, fresh_pre, RIGHT );
 
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot( workspace->r, workspace->d, system->n, mpi_data->world );
@@ -421,10 +1509,12 @@ int CG( const reax_system * const system, const control_params * const control,
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
 
         /* preconditioner */
-        for ( j = 0; j < system->n; ++j )
-        {
-            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        }
+        /*for ( j = 0; j < system->n; ++j )
+        {
+            workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }*/
+        apply_preconditioner( system, workspace, control, workspace->r, workspace->d, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, workspace->d, workspace->p, FALSE, RIGHT );
 
         sig_old = sig_new;
         sig_new = Parallel_Dot( workspace->r, workspace->p, system->n, mpi_data->world );