/*---------------------------------------------------------------------- PuReMD - Purdue ReaxFF Molecular Dynamics Program Copyright (2010) Purdue University Hasan Metin Aktulga, haktulga@cs.purdue.edu Joseph Fogarty, jcfogart@mail.usf.edu Sagar Pandit, pandit@usf.edu Ananth Y Grama, ayg@cs.purdue.edu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details: <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ #include "lin_alg.h" #include "allocate.h" #include "basic_comm.h" #include "comm_tools.h" #include "io_tools.h" #include "tool_box.h" #include "vector.h" /* Intel MKL */ #if defined(HAVE_LAPACKE_MKL) #include "mkl.h" /* reference LAPACK */ #elif defined(HAVE_LAPACKE) #include "lapacke.h" #endif typedef struct { unsigned int j; real val; } sparse_matrix_entry; enum preconditioner_type { LEFT = 0, RIGHT = 1, }; static int compare_matrix_entry( const void * const v1, const void * const v2 ) { return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; } /* Routine used for sorting nonzeros within a sparse matrix row; * internally, a combination of qsort and manual sorting is utilized * * A: sparse matrix for which to sort nonzeros within a row, stored in CSR format */ void Sort_Matrix_Rows( sparse_matrix * const A ) { unsigned int i, pj, si, ei, temp_size; sparse_matrix_entry *temp; temp = NULL; temp_size = 0; /* sort each row of A using column indices */ for ( i = 0; i < A->n; ++i ) { si = A->start[i]; ei = A->end[i]; if ( temp == NULL ) { temp = smalloc( sizeof(sparse_matrix_entry) * (ei - si), "Sort_Matrix_Rows::temp" ); temp_size = ei - si; } else if ( temp_size < ei - si ) { sfree( temp, "Sort_Matrix_Rows::temp" ); temp = smalloc( sizeof(sparse_matrix_entry) * (ei - si), "Sort_Matrix_Rows::temp" ); temp_size = ei - si; } for ( pj = 0; pj < (ei - si); ++pj ) { temp[pj].j = A->j[si + pj]; temp[pj].val = A->val[si + pj]; } /* polymorphic sort in standard C library using column indices */ qsort( temp, ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry ); for ( pj = 0; pj < (ei - si); ++pj ) { A->j[si + pj] = temp[pj].j; A->val[si + pj] = temp[pj].val; } } sfree( temp, "Sort_Matrix_Rows::temp" ); } static int compare_dbls( const void* arg1, const void* arg2 ) { int ret; double a1, a2; a1 = *(double *) arg1; a2 = *(double *) arg2; if ( a1 < a2 ) { ret = -1; } else if (a1 == a2) { ret = 0; } else { ret = 1; } return ret; } static void qsort_dbls( double *array, int array_len ) { qsort( array, (size_t) array_len, sizeof(double), compare_dbls ); } static int find_bucket( double *list, int len, double a ) { int s, e, m; if ( len == 0 ) { return 0; } if ( a > list[len - 1] ) { return len; } s = 0; e = len - 1; while ( s < e ) { m = (s + e) / 2; if ( list[m] < a ) { s = m + 1; } else { e = m; } } return s; } /* Compute diagonal inverese (Jacobi) preconditioner * * H: matrix used to compute preconditioner, in CSR format * Hdia_inv: computed diagonal inverse preconditioner */ void jacobi( sparse_matrix const * const H, real * const Hdia_inv ) { unsigned int i, pj; if ( H->format == SYM_HALF_MATRIX ) { for ( i = 0; i < H->n; ++i ) { if ( FABS( H->val[H->start[i]] ) > 1.0e-15 ) { Hdia_inv[i] = 1.0 / H->val[H->start[i]]; } else { Hdia_inv[i] = 1.0; } } } else if ( H->format == SYM_FULL_MATRIX || H->format == FULL_MATRIX ) { for ( i = 0; i < H->n; ++i ) { for ( pj = H->start[i]; pj < H->start[i + 1]; ++pj ) { if ( H->j[pj] == i ) { if ( FABS( H->val[H->start[i]] ) > 1.0e-15 ) { Hdia_inv[i] = 1.0 / H->val[pj]; } else { Hdia_inv[i] = 1.0; } break; } } } } } /* Apply diagonal inverse (Jacobi) preconditioner to system residual * * Hdia_inv: diagonal inverse preconditioner (constructed using H) * y: current residuals * x: preconditioned residuals * N: dimensions of preconditioner and vectors (# rows in H) */ static void dual_jacobi_app( const real * const Hdia_inv, const rvec2 * const y, rvec2 * const x, const int N ) { unsigned int i; for ( i = 0; i < N; ++i ) { x[i][0] = y[i][0] * Hdia_inv[i]; x[i][1] = y[i][1] * Hdia_inv[i]; } } /* Apply diagonal inverse (Jacobi) preconditioner to system residual * * Hdia_inv: diagonal inverse preconditioner (constructed using H) * y: current residual * x: preconditioned residual * N: dimensions of preconditioner and vectors (# rows in H) */ static void jacobi_app( const real * const Hdia_inv, const real * const y, real * const x, const int N ) { unsigned int i; for ( i = 0; i < N; ++i ) { x[i] = y[i] * Hdia_inv[i]; } } /* Local arithmetic portion of dual sparse matrix-dense vector multiplication Ax = b * * A: sparse matrix, 1D partitioned row-wise * x: two dense vectors * b (output): two dense vectors * N: number of entries in both vectors in b (must be equal) */ static void Dual_Sparse_MatVec_local( sparse_matrix const * const A, rvec2 const * const x, rvec2 * const b, int N ) { int i, j, k, si, num_rows; real val; for ( i = 0; i < N; ++i ) { b[i][0] = 0.0; b[i][1] = 0.0; } #if defined(NEUTRAL_TERRITORY) num_rows = A->NT; if ( A->format == SYM_HALF_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; /* diagonal only contributes once */ if ( i < A->n ) { b[i][0] += A->val[si] * x[i][0]; b[i][1] += A->val[si] * x[i][1]; k = si + 1; } /* zeros on the diagonal for i >= A->n, * so skip the diagonal multplication step as zeros * are not stored (idea: keep the NNZ's the same * for full shell and neutral territory half-stored * charge matrices to make debugging easier) */ else { k = si; } for ( ; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i][0] += val * x[j][0]; b[i][1] += val * x[j][1]; b[j][0] += val * x[i][0]; b[j][1] += val * x[i][1]; } } } else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; for ( k = si; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i][0] += val * x[j][0]; b[i][1] += val * x[j][1]; } } } #else num_rows = A->n; if ( A->format == SYM_HALF_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; /* diagonal only contributes once */ b[i][0] += A->val[si] * x[i][0]; b[i][1] += A->val[si] * x[i][1]; for ( k = si + 1; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i][0] += val * x[j][0]; b[i][1] += val * x[j][1]; b[j][0] += val * x[i][0]; b[j][1] += val * x[i][1]; } } } else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; for ( k = si; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i][0] += val * x[j][0]; b[i][1] += val * x[j][1]; } } } #endif } /* Communications for sparse matrix-dense vector multiplication Ax = b * * system: * control: * mpi_data: * x: dense vector * buf_type: data structure type for x * mpi_type: MPI_Datatype struct for communications * * returns: communication time */ static void Sparse_MatVec_Comm_Part1( const reax_system * const system, const control_params * const control, mpi_datatypes * const mpi_data, void const * const x, int buf_type, MPI_Datatype mpi_type ) { /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ Dist( system, mpi_data, x, buf_type, mpi_type ); } /* Local arithmetic portion of sparse matrix-dense vector multiplication Ax = b * * A: sparse matrix, 1D partitioned row-wise * x: dense vector * b (output): dense vector * N: number of entries in b */ static void Sparse_MatVec_local( sparse_matrix const * const A, real const * const x, real * const b, int N ) { int i, j, k, si, num_rows; real val; for ( i = 0; i < N; ++i ) { b[i] = 0.0; } #if defined(NEUTRAL_TERRITORY) num_rows = A->NT; if ( A->format == SYM_HALF_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; /* diagonal only contributes once */ if ( i < A->n ) { b[i] += A->val[si] * x[i]; k = si + 1; } /* zeros on the diagonal for i >= A->n, * so skip the diagonal multplication step as zeros * are not stored (idea: keep the NNZ's the same * for full shell and neutral territory half-stored * charge matrices to make debugging easier) */ else { k = si; } for ( ; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i] += val * x[j]; b[j] += val * x[i]; } } } else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; for ( k = si; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i] += val * x[j]; } } } #else num_rows = A->n; if ( A->format == SYM_HALF_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; /* A symmetric, upper triangular portion stored * => diagonal only contributes once */ b[i] += A->val[si] * x[i]; for ( k = si + 1; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i] += val * x[j]; b[j] += val * x[i]; } } } else if ( A->format == SYM_FULL_MATRIX || A->format == FULL_MATRIX ) { for ( i = 0; i < num_rows; ++i ) { si = A->start[i]; for ( k = si; k < A->end[i]; ++k ) { j = A->j[k]; val = A->val[k]; b[i] += val * x[j]; } } } #endif } /* Communications for sparse matrix-dense vector multiplication Ax = b * * system: * control: * mpi_data: * mat_format: storage type of sparse matrix A * b: dense vector * buf_type: data structure type for b * mpi_type: MPI_Datatype struct for communications * * returns: communication time */ static void Sparse_MatVec_Comm_Part2( const reax_system * const system, const control_params * const control, mpi_datatypes * const mpi_data, int mat_format, void * const b, int buf_type, MPI_Datatype mpi_type ) { if ( mat_format == SYM_HALF_MATRIX ) { Coll( system, mpi_data, b, buf_type, mpi_type ); } #if defined(NEUTRAL_TERRITORY) else { Coll( system, mpi_data, b, buf_type, mpi_type ); } #endif } /* sparse matrix, dense vector multiplication AX = B * * system: * control: * data: * A: symmetric matrix, stored in CSR format * X: dense vector * n: number of entries in x * B (output): dense vector */ static void Dual_Sparse_MatVec( reax_system const * const system, control_params const * const control, simulation_data * const data, mpi_datatypes * const mpi_data, sparse_matrix const * const A, rvec2 const * const x, int n, rvec2 * const b ) { #if defined(LOG_PERFORMANCE) real time; time = Get_Time( ); #endif Sparse_MatVec_Comm_Part1( system, control, mpi_data, x, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_comm ); #endif Dual_Sparse_MatVec_local( A, x, b, n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_spmv ); #endif Sparse_MatVec_Comm_Part2( system, control, mpi_data, A->format, b, RVEC2_PTR_TYPE, mpi_data->mpi_rvec2 ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_comm ); #endif } /* sparse matrix, dense vector multiplication Ax = b * * system: * control: * data: * A: symmetric matrix, stored in CSR format * x: dense vector * n: number of entries in x * b (output): dense vector */ static void Sparse_MatVec( reax_system const * const system, control_params const * const control, simulation_data * const data, mpi_datatypes * const mpi_data, sparse_matrix const * const A, real const * const x, int n, real * const b ) { #if defined(LOG_PERFORMANCE) real time; time = Get_Time( ); #endif Sparse_MatVec_Comm_Part1( system, control, mpi_data, x, REAL_PTR_TYPE, MPI_DOUBLE ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_comm ); #endif Sparse_MatVec_local( A, x, b, n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_spmv ); #endif Sparse_MatVec_Comm_Part2( system, control, mpi_data, A->format, b, REAL_PTR_TYPE, MPI_DOUBLE ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_comm ); #endif } void setup_sparse_approx_inverse( reax_system const * const system, simulation_data * const data, storage * const workspace, mpi_datatypes * const mpi_data, sparse_matrix * const A, sparse_matrix *A_spar_patt, int nprocs, real filter ) { int i, ret, bin, total, pos; int n, n_gather, s_local, s, n_local; int target_proc; int k, pj, size; int left, right, p, turn; int num_rows; int *srecv, *sdispls; int *scounts_local, *scounts; int *dspls_local, *dspls; int *bin_elements; real threshold, pivot, tmp; real *input_array; real *samplelist_local, *samplelist; real *pivotlist; real *bucketlist_local, *bucketlist; real t_start, t_comm; real total_comm; srecv = NULL; sdispls = NULL; samplelist_local = NULL; samplelist = NULL; pivotlist = NULL; input_array = NULL; bucketlist_local = NULL; bucketlist = NULL; scounts_local = NULL; scounts = NULL; dspls_local = NULL; dspls = NULL; bin_elements = NULL; t_comm = 0.0; #if defined(NEUTRAL_TERRITORY) num_rows = A->NT; fprintf( stdout,"%d %d %d\n", A->n, A->NT, A->m ); fflush( stdout ); #else num_rows = A->n; #endif if ( A_spar_patt->allocated == FALSE ) { #if defined(NEUTRAL_TERRITORY) Allocate_Matrix( A_spar_patt, A->n, A->NT, A->m, A->format ); #else Allocate_Matrix( A_spar_patt, A->n, system->local_cap, A->m, A->format ); #endif } else /*if ( (*A_spar_patt)->m < A->m )*/ { Deallocate_Matrix( A_spar_patt ); #if defined(NEUTRAL_TERRITORY) Allocate_Matrix( A_spar_patt, A->n, A->NT, A->m, A->format ); #else Allocate_Matrix( A_spar_patt, A->n, system->local_cap, A->m, A->format ); #endif } n_local = 0; for ( i = 0; i < num_rows; ++i ) { n_local += (A->end[i] - A->start[i] + 9) / 10; } s_local = (int) (12.0 * (LOG2(n_local) + LOG2(nprocs))); t_start = Get_Time( ); ret = MPI_Allreduce( &n_local, &n, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Reduce( &s_local, &s, 1, MPI_INT, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; /* count num. bin elements for each processor, uniform bin sizes */ input_array = smalloc( sizeof(real) * n_local, "setup_sparse_approx_inverse::input_array" ); scounts_local = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::scounts_local" ); scounts = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::scounts" ); bin_elements = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::bin_elements" ); dspls_local = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::displs_local" ); bucketlist_local = smalloc( sizeof(real) * n_local, "setup_sparse_approx_inverse::bucketlist_local" ); dspls = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::dspls" ); if ( nprocs > 1 ) { pivotlist = smalloc( sizeof(real) * (nprocs - 1), "setup_sparse_approx_inverse::pivotlist" ); } samplelist_local = smalloc( sizeof(real) * s_local, "setup_sparse_approx_inverse::samplelist_local" ); if ( system->my_rank == MASTER_NODE ) { samplelist = smalloc( sizeof(real) * s, "setup_sparse_approx_inverse::samplelist" ); srecv = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::srecv" ); sdispls = smalloc( sizeof(int) * nprocs, "setup_sparse_approx_inverse::sdispls" ); } n_local = 0; for ( i = 0; i < num_rows; ++i ) { for ( pj = A->start[i]; pj < A->end[i]; pj += 10 ) { input_array[n_local++] = A->val[pj]; } } for ( i = 0; i < s_local; i++) { /* samplelist_local[i] = input_array[rand( ) % n_local]; */ samplelist_local[i] = input_array[ i ]; } /* gather samples at the root process */ t_start = Get_Time( ); ret = MPI_Gather( &s_local, 1, MPI_INT, srecv, 1, MPI_INT, MASTER_NODE, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; if( system->my_rank == MASTER_NODE ) { sdispls[0] = 0; for ( i = 0; i < nprocs - 1; ++i ) { sdispls[i + 1] = sdispls[i] + srecv[i]; } } t_start = Get_Time( ); ret = MPI_Gatherv( samplelist_local, s_local, MPI_DOUBLE, samplelist, srecv, sdispls, MPI_DOUBLE, MASTER_NODE, MPI_COMM_WORLD); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; /* sort samples at the root process and select pivots */ if ( system->my_rank == MASTER_NODE ) { qsort_dbls( samplelist, s ); for ( i = 1; i < nprocs; ++i ) { pivotlist[i - 1] = samplelist[(i * s) / nprocs]; } } /* broadcast pivots */ t_start = Get_Time( ); ret = MPI_Bcast( pivotlist, nprocs - 1, MPI_DOUBLE, MASTER_NODE, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; for ( i = 0; i < nprocs; ++i ) { scounts_local[i] = 0; } for ( i = 0; i < n_local; ++i ) { pos = find_bucket( pivotlist, nprocs - 1, input_array[i] ); scounts_local[pos]++; } for ( i = 0; i < nprocs; ++i ) { bin_elements[i] = scounts_local[i]; scounts[i] = scounts_local[i]; } /* compute displacements for MPI comm */ dspls_local[0] = 0; for ( i = 0; i < nprocs - 1; ++i ) { dspls_local[i + 1] = dspls_local[i] + scounts_local[i]; } /* bin elements */ for ( i = 0; i < n_local; ++i ) { bin = find_bucket( pivotlist, nprocs - 1, input_array[i] ); pos = dspls_local[bin] + scounts_local[bin] - bin_elements[bin]; bucketlist_local[pos] = input_array[i]; bin_elements[bin]--; } /* determine counts for elements per process */ t_start = Get_Time( ); ret = MPI_Allreduce( MPI_IN_PLACE, scounts, nprocs, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; /* find the target process */ target_proc = 0; total = 0; k = n * filter; for ( i = nprocs - 1; i >= 0; --i ) { if ( total + scounts[i] >= k ) { /* global k becomes local k*/ k -= total; target_proc = i; break; } total += scounts[i]; } n_gather = scounts[target_proc]; if ( system->my_rank == target_proc ) { bucketlist = smalloc( sizeof( real ) * n_gather, "setup_sparse_approx_inverse::bucketlist" ); } /* send local buckets to target processor for quickselect */ t_start = Get_Time( ); ret = MPI_Gather( scounts_local + target_proc, 1, MPI_INT, scounts, 1, MPI_INT, target_proc, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; if ( system->my_rank == target_proc ) { dspls[0] = 0; for ( i = 0; i < nprocs - 1; ++i ) { dspls[i + 1] = dspls[i] + scounts[i]; } } t_start = Get_Time( ); ret = MPI_Gatherv( bucketlist_local + dspls_local[target_proc], scounts_local[target_proc], MPI_DOUBLE, bucketlist, scounts, dspls, MPI_DOUBLE, target_proc, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; /* apply quick select algorithm at the target process */ if ( system->my_rank == target_proc ) { left = 0; right = n_gather-1; turn = 0; while ( k ) { p = left; turn = 1 - turn; /* alternating pivots in order to handle corner cases */ if ( turn == 1 ) { pivot = bucketlist[right]; } else { pivot = bucketlist[left]; } for ( i = left + 1 - turn; i <= right-turn; ++i ) { if ( bucketlist[i] > pivot ) { tmp = bucketlist[i]; bucketlist[i] = bucketlist[p]; bucketlist[p] = tmp; p++; } } if ( turn == 1 ) { tmp = bucketlist[p]; bucketlist[p] = bucketlist[right]; bucketlist[right] = tmp; } else { tmp = bucketlist[p]; bucketlist[p] = bucketlist[left]; bucketlist[left] = tmp; } if ( p == k - 1) { threshold = bucketlist[p]; break; } else if ( p > k - 1 ) { right = p - 1; } else { left = p + 1; } } /* comment out if ACKS2 and/or EE is not an option */ // if ( threshold < 1.000000 ) // { // threshold = 1.000001; // } } /* broadcast the filtering value */ t_start = Get_Time( ); ret = MPI_Bcast( &threshold, 1, MPI_DOUBLE, target_proc, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; #if defined(DEBUG_FOCUS) int nnz = 0; #endif /* build entries of that pattern*/ for ( i = 0; i < num_rows; ++i ) { A_spar_patt->start[i] = A->start[i]; size = A->start[i]; for ( pj = A->start[i]; pj < A->end[i]; ++pj ) { if ( ( A->val[pj] >= threshold ) || ( A->j[pj] == i ) ) { A_spar_patt->val[size] = A->val[pj]; A_spar_patt->j[size] = A->j[pj]; size++; #if defined(DEBUG_FOCUS) nnz++; #endif } } A_spar_patt->end[i] = size; } #if defined(DEBUG_FOCUS) ret = MPI_Allreduce( MPI_IN_PLACE, &nnz, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); if ( system->my_rank == MASTER_NODE ) { fprintf( stdout, " [INFO] \ntotal nnz in all charge matrices = %d\ntotal nnz in all sparsity patterns = %d\nthreshold = %.15lf\n", n, nnz, threshold ); fflush( stdout ); } #endif ret = MPI_Reduce( &t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); if ( system->my_rank == MASTER_NODE ) { data->timing.cm_solver_comm += total_comm / nprocs; } sfree( input_array, "setup_sparse_approx_inverse::input_array" ); sfree( scounts_local, "setup_sparse_approx_inverse::scounts_local" ); sfree( scounts, "setup_sparse_approx_inverse::scounts" ); sfree( bin_elements, "setup_sparse_approx_inverse::bin_elements" ); sfree( dspls_local, "setup_sparse_approx_inverse::displs_local" ); sfree( bucketlist_local, "setup_sparse_approx_inverse::bucketlist_local" ); sfree( dspls, "setup_sparse_approx_inverse::dspls" ); if ( nprocs > 1 ) { sfree( pivotlist, "setup_sparse_approx_inverse::pivotlist" ); } sfree( samplelist_local, "setup_sparse_approx_inverse::samplelist_local" ); if ( system->my_rank == MASTER_NODE ) { sfree( samplelist, "setup_sparse_approx_inverse::samplelist" ); sfree( srecv, "setup_sparse_approx_inverse::srecv" ); sfree( sdispls, "setup_sparse_approx_inverse::sdispls" ); } if ( system->my_rank == target_proc ) { sfree( bucketlist, "setup_sparse_approx_inverse::bucketlist" ); } } #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) #if defined(NEUTRAL_TERRITORY) real sparse_approx_inverse( reax_system const * const system, simulation_data * const data, storage * const workspace, mpi_datatypes * const mpi_data, sparse_matrix * const A, sparse_matrix * const A_spar_patt, sparse_matrix * const A_app_inv, int nprocs ) { int i, k, pj, j_temp, ret; int N, M, d_i, d_j; int local_pos, atom_pos, identity_pos; int *pos_x, *X; int cnt; int *row_nnz; int **j_list; int d, count, index; int *j_send, *j_recv[6]; real *e_j, *dense_matrix; real *val_send, *val_recv[6]; reax_atom *atom; real **val_list; real start, t_start, t_comm, total_comm; mpi_out_data *out_bufs; neighbor_proc *nbr; MPI_Request req[12]; MPI_Status stat[12]; lapack_int m, n, nrhs, lda, ldb, info; start = Get_Time( ); t_comm = 0.0; if ( A_app_inv->allocated == FALSE ) { //TODO: FULL_MATRIX? Allocate_Matrix( A_app_inv, A_spar_patt->n, A->NT, A_spar_patt->m, SYM_FULL_MATRIX ); } else /* if ( A_app_inv->m < A_spar_patt->m ) */ { Deallocate_Matrix( A_app_inv ); Allocate_Matrix( A_app_inv, A_spar_patt->n, A->NT, A_spar_patt->m, SYM_FULL_MATRIX ); } pos_x = NULL; X = NULL; row_nnz = NULL; j_list = NULL; val_list = NULL; j_send = NULL; val_send = NULL; for ( d = 0; d < 6; ++d ) { j_recv[d] = NULL; val_recv[d] = NULL; } //////////////////// row_nnz = (int *) malloc( sizeof(int) * A->NT ); //TODO: allocation size j_list = (int **) malloc( sizeof(int *) * system->N ); val_list = (real **) malloc( sizeof(real *) * system->N ); for ( i = 0; i < A->NT; ++i ) { row_nnz[i] = 0; } /* mark the atoms that already have their row stored in the local matrix */ for ( i = 0; i < A->n; ++i ) { row_nnz[i] = A->end[i] - A->start[i]; } /* Announce the nnz's in each row that will be communicated later */ t_start = Get_Time( ); Dist( system, mpi_data, row_nnz, REAL_PTR_TYPE, MPI_INT ); t_comm += Get_Time( ) - t_start; fprintf( stdout,"SAI after Dist call\n" ); fflush( stdout ); out_bufs = mpi_data->out_nt_buffers; count = 0; /* use a Dist-like approach to send the row information */ for ( d = 0; d < 6; ++d) { /* initiate recvs */ nbr = &system->my_nt_nbrs[d]; if ( nbr->atoms_cnt ) { /* calculate the total data that will be received */ cnt = 0; for ( i = nbr->atoms_str; i < (nbr->atoms_str + nbr->atoms_cnt); ++i ) { cnt += row_nnz[i]; } /* initiate Irecv */ if ( cnt ) { count += 2; j_recv[d] = (int *) malloc( sizeof(int) * cnt ); val_recv[d] = (real *) malloc( sizeof(real) * cnt ); fprintf( stdout, "Dist communication receive phase direction %d will receive %d\n", d, cnt ); fflush( stdout ); t_start = Get_Time( ); ret = MPI_Irecv( j_recv + d, cnt, MPI_INT, nbr->receive_rank, d, mpi_data->comm_mesh3D, &req[2 * d] ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Irecv( val_recv + d, cnt, MPI_DOUBLE, nbr->receive_rank, d, mpi_data->comm_mesh3D, &req[2 * d + 1] ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; } } } ///////////////////// for( d = 0; d < 6; ++d) { nbr = &system->my_nt_nbrs[d]; /* send both messages in dimension d */ if ( out_bufs[d].cnt ) { cnt = 0; for ( i = 0; i < out_bufs[d].cnt; ++i ) { cnt += A->end[ out_bufs[d].index[i] ] - A->start[ out_bufs[d].index[i] ]; if ( out_bufs[d].index[i] < 0 || out_bufs[d].index[i] >= A->n ) { fprintf( stdout, "INDEXING ERROR %d > %d\n", out_bufs[d].index[i], A->n ); fflush( stdout ); } // row_nnz[ out_bufs[d].index[i] ]; } fprintf( stdout,"Dist communication send phase direction %d should send %d\n", d, cnt ); fflush( stdout ); if ( cnt ) { j_send = (int *) malloc( sizeof(int) * cnt ); val_send = (real *) malloc( sizeof(real) * cnt ); cnt = 0; for ( i = 0; i < out_bufs[d].cnt; ++i ) { for ( pj = A->start[ out_bufs[d].index[i] ]; pj < A->end[ out_bufs[d].index[i] ]; ++pj ) { atom = &system->my_atoms[ A->j[pj] ]; j_send[cnt] = atom->orig_id; val_send[cnt] = A->val[pj]; cnt++; } } fprintf( stdout,"Dist communication send phase direction %d will send %d\n", d, cnt ); fflush( stdout ); t_start = Get_Time( ); ret = MPI_Send( j_send, cnt, MPI_INT, nbr->rank, d, mpi_data->comm_mesh3D ); Check_MPI_Error( ret, __FILE__, __LINE__ ); fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt ); fflush( stdout ); ret = MPI_Send( val_send, cnt, MPI_DOUBLE, nbr->rank, d, mpi_data->comm_mesh3D ); Check_MPI_Error( ret, __FILE__, __LINE__ ); fprintf( stdout,"Dist communication send phase direction %d cnt = %d\n", d, cnt ); fflush( stdout ); t_comm += Get_Time( ) - t_start; } } } fprintf( stdout," Dist communication for sending row info before waitany\n" ); fflush( stdout ); /////////////////////// for ( d = 0; d < count; ++d ) { t_start = Get_Time( ); ret = MPI_Waitany( MAX_NT_NBRS, req, &index, stat ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; nbr = &system->my_nt_nbrs[index / 2]; cnt = 0; for ( i = nbr->atoms_str; i < (nbr->atoms_str + nbr->atoms_cnt); ++i ) { if ( index % 2 == 0 ) { j_list[i] = (int *) malloc( sizeof(int) * row_nnz[i] ); for ( pj = 0; pj < row_nnz[i]; ++pj ) { j_list[i][pj] = j_recv[index / 2][cnt]; cnt++; } } else { val_list[i] = (real *) malloc( sizeof(real) * row_nnz[i] ); for ( pj = 0; pj < row_nnz[i]; ++pj ) { val_list[i][pj] = val_recv[index / 2][cnt]; cnt++; } } } } ////////////////////// fprintf( stdout, "Dist communication for sending row info worked\n" ); fflush( stdout ); //TODO: size? X = (int *) malloc( sizeof(int) * (system->bigN + 1) ); pos_x = (int *) malloc( sizeof(int) * (system->bigN + 1) ); for ( i = 0; i < A_spar_patt->NT; ++i ) { N = 0; M = 0; for ( k = 0; k <= system->bigN; ++k ) { X[k] = 0; pos_x[k] = 0; } /* find column indices of nonzeros (which will be the columns indices of the dense matrix) */ for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj ) { j_temp = A_spar_patt->j[pj]; atom = &system->my_atoms[j_temp]; ++N; /* for each of those indices * search through the row of full A of that index */ /* the case where the local matrix has that index's row */ if ( j_temp < A->NT ) { for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k ) { /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */ atom = &system->my_atoms[ A->j[k] ]; X[atom->orig_id] = 1; } } /* the case where we communicated that index's row */ else { for ( k = 0; k < row_nnz[j_temp]; ++k ) { /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */ X[ j_list[j_temp][k] ] = 1; } } } /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */ identity_pos = M; atom = &system->my_atoms[ i ]; atom_pos = atom->orig_id; for ( k = 0; k <= system->bigN; k++) { if ( X[k] != 0 ) { pos_x[k] = M; if ( k == atom_pos ) { identity_pos = M; } ++M; } } /* allocate memory for NxM dense matrix */ dense_matrix = (real *) malloc( sizeof(real) * N * M ); /* fill in the entries of dense matrix */ for ( d_j = 0; d_j < N; ++d_j) { /* all rows are initialized to zero */ for ( d_i = 0; d_i < M; ++d_i ) { dense_matrix[d_i * N + d_j] = 0.0; } /* change the value if any of the column indices is seen */ /* it is in the original list */ local_pos = A_spar_patt->j[ A_spar_patt->start[i] + d_j ]; if ( local_pos < 0 || local_pos >= system->N ) { fprintf( stderr, "THE LOCAL POSITION OF THE ATOM IS NOT VALID, STOP THE EXECUTION\n" ); fflush( stderr ); } ///////////////////////////// if ( local_pos < A->NT ) { for ( d_i = A->start[local_pos]; d_i < A->end[local_pos]; ++d_i ) { atom = &system->my_atoms[ A->j[d_i] ]; if ( pos_x[ atom->orig_id ] >= M || d_j >= N ) { fprintf( stderr, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, orig_id = %d, i = %d, j = %d, M = %d N = %d\n", atom->orig_id, pos_x[ atom->orig_id ], d_j, M, N ); fflush( stderr ); } if ( X[ atom->orig_id ] == 1 ) { dense_matrix[ pos_x[ atom->orig_id ] * N + d_j ] = A->val[d_i]; } } } else { for ( d_i = 0; d_i < row_nnz[ local_pos ]; ++d_i ) { if ( pos_x[ j_list[local_pos][d_i] ] >= M || d_j >= N ) { fprintf( stderr, "CANNOT MAP IT TO THE DENSE MATRIX, STOP THE EXECUTION, %d %d\n", pos_x[ j_list[local_pos][d_i] ], d_j); fflush( stderr ); } if ( X[ j_list[local_pos][d_i] ] == 1 ) { dense_matrix[ pos_x[ j_list[local_pos][d_i] ] * N + d_j ] = val_list[local_pos][d_i]; } } } } /* create the right hand side of the linear equation * that is the full column of the identity matrix */ e_j = (real *) malloc( sizeof(real) * M ); ////////////////////// for ( k = 0; k < M; ++k ) { e_j[k] = 0.0; } e_j[identity_pos] = 1.0; /* Solve the overdetermined system AX = B through the least-squares problem: * min ||B - AX||_2 */ m = M; n = N; nrhs = 1; lda = N; ldb = nrhs; info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda, e_j, ldb ); /* Check for the full rank */ if ( info > 0 ) { fprintf( stderr, "The diagonal element %i of the triangular factor ", info ); fprintf( stderr, "of A is zero, so that A does not have full rank;\n" ); fprintf( stderr, "the least squares solution could not be computed.\n" ); exit( INVALID_INPUT ); } /* accumulate the resulting vector to build A_app_inv */ A_app_inv->start[i] = A_spar_patt->start[i]; A_app_inv->end[i] = A_spar_patt->end[i]; for ( k = A_app_inv->start[i]; k < A_app_inv->end[i]; ++k) { A_app_inv->j[k] = A_spar_patt->j[k]; A_app_inv->val[k] = e_j[k - A_spar_patt->start[i]]; } free( dense_matrix ); free( e_j ); } free( pos_x); free( X ); ///////////////////// ret = MPI_Reduce( &t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); if ( system->my_rank == MASTER_NODE ) { data->timing.cm_solver_comm += total_comm / nprocs; } return Get_Time( ) - start; } #else real sparse_approx_inverse( reax_system const * const system, simulation_data * const data, storage * const workspace, mpi_datatypes * const mpi_data, sparse_matrix * const A, sparse_matrix * const A_spar_patt, sparse_matrix * const A_app_inv, int nprocs ) { int i, k, pj, j_temp, push, ret; int N, M, d_i, d_j, mark; int local_pos, atom_pos, identity_pos; int *X, *q; int size_e, size_dense; int cnt; int *row_nnz; int **j_list; int d; int flag1, flag2; int *j_send, *j_recv1, *j_recv2; int size_send, size_recv1, size_recv2; real *e_j, *dense_matrix; real **val_list; real *val_send, *val_recv1, *val_recv2; real start, t_start, t_comm, total_comm; reax_atom *atom; mpi_out_data *out_bufs; const neighbor_proc *nbr1, *nbr2; MPI_Request req1, req2, req3, req4; MPI_Status stat1, stat2, stat3, stat4; lapack_int m, n, nrhs, lda, ldb, info; start = Get_Time( ); t_comm = 0.0; if ( A_app_inv->allocated == FALSE ) { Allocate_Matrix( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m, SYM_FULL_MATRIX ); } else /* if ( A_app_inv->m < A_spar_patt->m ) */ { Deallocate_Matrix( A_app_inv ); Allocate_Matrix( A_app_inv, A_spar_patt->n, system->local_cap, A_spar_patt->m, SYM_FULL_MATRIX ); } X = NULL; j_send = NULL; val_send = NULL; j_recv1 = NULL; j_recv2 = NULL; val_recv1 = NULL; val_recv2 = NULL; size_send = 0; size_recv1 = 0; size_recv2 = 0; e_j = NULL; dense_matrix = NULL; size_e = 0; size_dense = 0; row_nnz = smalloc( sizeof(int) * system->total_cap, "sparse_approx_inverse::row_nnz" ); j_list = smalloc( sizeof(int *) * system->N, "sparse_approx_inverse::j_list" ); val_list = smalloc( sizeof(real *) * system->N, "sparse_approx_inverse::val_list" ); for ( i = 0; i < system->total_cap; ++i ) { row_nnz[i] = 0; } /* mark the atoms that already have their row stored in the local matrix */ for ( i = 0; i < system->n; ++i ) { row_nnz[i] = A->end[i] - A->start[i]; } /* Announce the nnz's in each row that will be communicated later */ t_start = Get_Time( ); Dist( system, mpi_data, row_nnz, INT_PTR_TYPE, MPI_INT ); t_comm += Get_Time( ) - t_start; out_bufs = mpi_data->out_buffers; /* use a Dist-like approach to send the row information */ for ( d = 0; d < 3; ++d) { flag1 = 0; flag2 = 0; cnt = 0; /* initiate recvs */ nbr1 = &system->my_nbrs[2 * d]; if ( nbr1->atoms_cnt ) { cnt = 0; /* calculate the total data that will be received */ for( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i ) { cnt += row_nnz[i]; } /* initiate Irecv */ if( cnt ) { flag1 = 1; if ( size_recv1 < cnt ) { if ( size_recv1 ) { sfree( j_recv1, "sparse_approx_inverse::j_recv1" ); sfree( val_recv1, "sparse_approx_inverse::val_recv1" ); } size_recv1 = cnt * SAFE_ZONE; j_recv1 = smalloc( sizeof(int) * size_recv1, "sparse_approx_inverse::j_recv1" ); val_recv1 = smalloc( sizeof(real) * size_recv1, "sparse_approx_inverse::val_recv1" ); } t_start = Get_Time( ); ret = MPI_Irecv( j_recv1, cnt, MPI_INT, nbr1->rank, 2 * d + 1, mpi_data->comm_mesh3D, &req1 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Irecv( val_recv1, cnt, MPI_DOUBLE, nbr1->rank, 2 * d + 1, mpi_data->comm_mesh3D, &req2 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; } } nbr2 = &system->my_nbrs[2 * d + 1]; if ( nbr2->atoms_cnt ) { /* calculate the total data that will be received */ cnt = 0; for( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i ) { cnt += row_nnz[i]; } /* initiate Irecv */ if( cnt ) { flag2 = 1; if ( size_recv2 < cnt ) { if ( size_recv2 ) { sfree( j_recv2, "sparse_approx_inverse::j_recv2" ); sfree( val_recv2, "sparse_approx_inverse::val_recv2" ); } size_recv2 = cnt * SAFE_ZONE; j_recv2 = smalloc( sizeof(int) * size_recv2, "sparse_approx_inverse::j_recv2" ); val_recv2 = smalloc( sizeof(real) * size_recv2, "sparse_approx_inverse::val_recv2" ); } t_start = Get_Time( ); ret = MPI_Irecv( j_recv2, cnt, MPI_INT, nbr2->rank, 2 * d, mpi_data->comm_mesh3D, &req3 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Irecv( val_recv2, cnt, MPI_DOUBLE, nbr2->rank, 2 * d, mpi_data->comm_mesh3D, &req4 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; } } /* send both messages in dimension d */ if ( out_bufs[2 * d].cnt ) { cnt = 0; for ( i = 0; i < out_bufs[2 * d].cnt; ++i ) { cnt += row_nnz[ out_bufs[2 * d].index[i] ]; } if ( cnt > 0 ) { if ( size_send < cnt ) { if ( size_send ) { sfree( j_send, "sparse_approx_inverse::j_send" ); sfree( val_send, "sparse_approx_inverse::val_send" ); } size_send = cnt * SAFE_ZONE; j_send = smalloc( sizeof(int) * size_send, "sparse_approx_inverse::j_send" ); val_send = smalloc( sizeof(real) * size_send, "sparse_approx_inverse::j_send" ); } cnt = 0; for ( i = 0; i < out_bufs[2 * d].cnt; ++i ) { if ( out_bufs[2 * d].index[i] < A->n ) { for ( pj = A->start[ out_bufs[2 * d].index[i] ]; pj < A->end[ out_bufs[2 * d].index[i] ]; ++pj ) { atom = &system->my_atoms[ A->j[pj] ]; j_send[cnt] = atom->orig_id; val_send[cnt] = A->val[pj]; cnt++; } } else { for ( pj = 0; pj < row_nnz[ out_bufs[2 * d].index[i] ]; ++pj ) { j_send[cnt] = j_list[ out_bufs[2 * d].index[i] ][pj]; val_send[cnt] = val_list[ out_bufs[2 * d].index[i] ][pj]; cnt++; } } } t_start = Get_Time( ); ret = MPI_Send( j_send, cnt, MPI_INT, nbr1->rank, 2 * d, mpi_data->comm_mesh3D ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Send( val_send, cnt, MPI_DOUBLE, nbr1->rank, 2 * d, mpi_data->comm_mesh3D ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; } } if ( out_bufs[2 * d + 1].cnt ) { cnt = 0; for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i ) { cnt += row_nnz[ out_bufs[2 * d + 1].index[i] ]; } if ( cnt > 0 ) { if ( size_send < cnt ) { if ( size_send ) { sfree( j_send, "sparse_approx_inverse::j_send" ); sfree( val_send, "sparse_approx_inverse::j_send" ); } size_send = cnt * SAFE_ZONE; j_send = smalloc( sizeof(int) * size_send, "sparse_approx_inverse::j_send" ); val_send = smalloc( sizeof(real) * size_send, "sparse_approx_inverse::val_send" ); } cnt = 0; for ( i = 0; i < out_bufs[2 * d + 1].cnt; ++i ) { if ( out_bufs[2 * d + 1].index[i] < A->n ) { for ( pj = A->start[ out_bufs[2 * d + 1].index[i] ]; pj < A->end[ out_bufs[2 * d + 1].index[i] ]; ++pj ) { atom = &system->my_atoms[ A->j[pj] ]; j_send[cnt] = atom->orig_id; val_send[cnt] = A->val[pj]; cnt++; } } else { for ( pj = 0; pj < row_nnz[ out_bufs[2 * d + 1].index[i] ]; ++pj ) { j_send[cnt] = j_list[ out_bufs[2 * d + 1].index[i] ][pj]; val_send[cnt] = val_list[ out_bufs[2 * d + 1].index[i] ][pj]; cnt++; } } } t_start = Get_Time( ); ret = MPI_Send( j_send, cnt, MPI_INT, nbr2->rank, 2 * d + 1, mpi_data->comm_mesh3D ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Send( val_send, cnt, MPI_DOUBLE, nbr2->rank, 2 * d + 1, mpi_data->comm_mesh3D ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; } } if ( flag1 ) { t_start = Get_Time( ); ret = MPI_Wait( &req1, &stat1 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Wait( &req2, &stat2 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; cnt = 0; for ( i = nbr1->atoms_str; i < (nbr1->atoms_str + nbr1->atoms_cnt); ++i ) { j_list[i] = smalloc( sizeof(int) * row_nnz[i], "sparse_approx_inverse::j_list[i]" ); val_list[i] = smalloc( sizeof(real) * row_nnz[i], "sparse_approx_inverse::val_list[i]" ); for ( pj = 0; pj < row_nnz[i]; ++pj ) { j_list[i][pj] = j_recv1[cnt]; val_list[i][pj] = val_recv1[cnt]; cnt++; } } } if ( flag2 ) { t_start = Get_Time( ); ret = MPI_Wait( &req3, &stat3 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); ret = MPI_Wait( &req4, &stat4 ); Check_MPI_Error( ret, __FILE__, __LINE__ ); t_comm += Get_Time( ) - t_start; cnt = 0; for ( i = nbr2->atoms_str; i < (nbr2->atoms_str + nbr2->atoms_cnt); ++i ) { j_list[i] = smalloc( sizeof(int) * row_nnz[i], "sparse_approx_inverse::j_list[i]" ); val_list[i] = smalloc( sizeof(real) * row_nnz[i], "sparse_approx_inverse::val_list[i]" ); for ( pj = 0; pj < row_nnz[i]; ++pj ) { j_list[i][pj] = j_recv2[cnt]; val_list[i][pj] = val_recv2[cnt]; cnt++; } } } } sfree( j_send, "sparse_approx_inverse::j_send" ); sfree( val_send, "sparse_approx_inverse::val_send" ); sfree( j_recv1, "sparse_approx_inverse::j_recv1" ); sfree( j_recv2, "sparse_approx_inverse::j_recv2" ); sfree( val_recv1, "sparse_approx_inverse::val_recv1" ); sfree( val_recv2, "sparse_approx_inverse::val_recv2" ); X = smalloc( sizeof(int) * (system->bigN + 1), "sparse_approx_inverse::X" ); //size of q should be equal to the maximum possible cardinalty //of the set formed by neighbors of neighbors of an atom //i.e, maximum number of rows of dense matrix //for water systems, this number is 34000 //for silica systems, it is 12000 q = smalloc( sizeof(int) * 50000, "sparse_approx_inverse::q" ); for ( i = 0; i <= system->bigN; ++i ) { X[i] = -1; } for ( i = 0; i < A_spar_patt->n; ++i ) { N = 0; M = 0; push = 0; mark = i + system->bigN; /* find column indices of nonzeros (which will be the columns indices of the dense matrix) */ for ( pj = A_spar_patt->start[i]; pj < A_spar_patt->end[i]; ++pj ) { j_temp = A_spar_patt->j[pj]; atom = &system->my_atoms[j_temp]; ++N; /* for each of those indices * search through the row of full A of that index */ /* the case where the local matrix has that index's row */ if( j_temp < A->n ) { for ( k = A->start[ j_temp ]; k < A->end[ j_temp ]; ++k ) { /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */ atom = &system->my_atoms[ A->j[k] ]; X[atom->orig_id] = mark; q[push++] = atom->orig_id; } } /* the case where we communicated that index's row */ else { for ( k = 0; k < row_nnz[j_temp]; ++k ) { /* and accumulate the nonzero column indices to serve as the row indices of the dense matrix */ X[ j_list[j_temp][k] ] = mark; q[push++] = j_list[j_temp][k]; } } } /* enumerate the row indices from 0 to (# of nonzero rows - 1) for the dense matrix */ identity_pos = M; atom = &system->my_atoms[ i ]; atom_pos = atom->orig_id; for ( k = 0; k < push; k++) { if ( X[ q[k] ] == mark ) { X[ q[k] ] = M; ++M; } } identity_pos = X[atom_pos]; /* allocate memory for NxM dense matrix */ if ( size_dense < N * M ) { if ( size_dense ) { sfree( dense_matrix, "sparse_approx_inverse::dense_matrix" ); } size_dense = N * M * SAFE_ZONE; dense_matrix = smalloc( sizeof(real) * size_dense, "sparse_approx_inverse::dense_matrix" ); } /* fill in the entries of dense matrix */ for ( d_j = 0; d_j < N; ++d_j) { /* all rows are initialized to zero */ for ( d_i = 0; d_i < M; ++d_i ) { dense_matrix[d_i * N + d_j] = 0.0; } /* change the value if any of the column indices is seen */ /* it is in the original list */ local_pos = A_spar_patt->j[ A_spar_patt->start[i] + d_j ]; if ( local_pos < A->n ) { for ( d_i = A->start[local_pos]; d_i < A->end[local_pos]; ++d_i ) { atom = &system->my_atoms[ A->j[d_i] ]; dense_matrix[ X[ atom->orig_id ] * N + d_j ] = A->val[d_i]; } } else { for ( d_i = 0; d_i < row_nnz[ local_pos ]; ++d_i ) { dense_matrix[ X[ j_list[local_pos][d_i] ] * N + d_j ] = val_list[local_pos][d_i]; } } } /* create the right hand side of the linear equation * that is the full column of the identity matrix */ if ( size_e < M ) { if ( size_e ) { sfree( e_j, "sparse_approx_inverse::e_j" ); } size_e = M * SAFE_ZONE; e_j = smalloc( sizeof(real) * size_e, "sparse_approx_inverse::e_j" ); } for ( k = 0; k < M; ++k ) { e_j[k] = 0.0; } e_j[identity_pos] = 1.0; /* Solve the overdetermined system AX = B through the least-squares problem: * min ||B - AX||_2 */ m = M; n = N; nrhs = 1; lda = N; ldb = nrhs; info = LAPACKE_dgels( LAPACK_ROW_MAJOR, 'N', m, n, nrhs, dense_matrix, lda, e_j, ldb ); /* Check for the full rank */ if ( info > 0 ) { fprintf( stderr, "[ERROR] The diagonal element %i of the triangular factor ", info ); fprintf( stderr, "of A is zero, so that A does not have full rank;\n" ); fprintf( stderr, "the least squares solution could not be computed.\n" ); MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); } /* accumulate the resulting vector to build A_app_inv */ A_app_inv->start[i] = A_spar_patt->start[i]; A_app_inv->end[i] = A_spar_patt->end[i]; for ( k = A_app_inv->start[i]; k < A_app_inv->end[i]; ++k) { A_app_inv->j[k] = A_spar_patt->j[k]; A_app_inv->val[k] = e_j[k - A_spar_patt->start[i]]; } } sfree( dense_matrix, "sparse_approx_inverse::dense_matrix" ); sfree( e_j, "sparse_approx_inverse::e_j" ); sfree( X, "sparse_approx_inverse::X" ); /*for ( i = 0; i < system->N; ++i ) { sfree( j_list[i], "sparse_approx_inverse::j_list" ); sfree( val_list[i], "sparse_approx_inverse::val_list" ); } sfree( j_list, "sparse_approx_inverse::j_list" ); sfree( val_list, "sparse_approx_inverse::val_list" );*/ sfree( row_nnz, "sparse_approx_inverse::row_nnz" ); ret = MPI_Reduce( &t_comm, &total_comm, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); if ( system->my_rank == MASTER_NODE ) { data->timing.cm_solver_comm += total_comm / nprocs; } return Get_Time( ) - start; } #endif #endif /* Apply left-sided preconditioning while solving M^{-1}AX = M^{-1}B * * system: * workspace: data struct containing matrices and vectors, stored in CSR * control: data struct containing parameters * data: struct containing timing simulation data (including performance data) * y: vector to which to apply preconditioning, * specific to internals of iterative solver being used * x (output): preconditioned vector * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner * side: used in determining how to apply preconditioner if the preconditioner is * factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU) * * Assumptions: * Matrices have non-zero diagonals * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ static void dual_apply_preconditioner( reax_system const * const system, storage const * const workspace, control_params const * const control, simulation_data * const data, mpi_datatypes * const mpi_data, rvec2 const * const y, rvec2 * const x, int fresh_pre, int side ) { // int i, si; /* no preconditioning */ if ( control->cm_solver_pre_comp_type == NONE_PC ) { if ( x != y ) { Vector_Copy_rvec2( x, y, system->n ); } } else { switch ( side ) { case LEFT: switch ( control->cm_solver_pre_app_type ) { case TRI_SOLVE_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: dual_jacobi_app( workspace->Hdia_inv, y, x, system->n ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); // break; case SAI_PC: #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, H->NT, x ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, system->n, x ); #endif break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_LEVEL_SCHED_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: dual_jacobi_app( workspace->Hdia_inv, y, x, system->n ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve_level_sched( (static_storage *) workspace, // workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); // break; case SAI_PC: #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, H->NT, x ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, system->n, x ); #endif break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_GC_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // for ( i = 0; i < workspace->H->n; ++i ) // { // workspace->y_p[i] = y[i]; // } // // permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER ); // tri_solve_level_sched( (static_storage *) workspace, // workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case JACOBI_ITER_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // // construct D^{-1}_L // if ( fresh_pre == TRUE ) // { // for ( i = 0; i < workspace->L->n; ++i ) // { // si = workspace->L->start[i + 1] - 1; // workspace->Dinv_L[i] = 1.0 / workspace->L->val[si]; // } // } // // jacobi_iter( workspace, workspace->L, workspace->Dinv_L, // y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case RIGHT: switch ( control->cm_solver_pre_app_type ) { case TRI_SOLVE_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: if ( x != y ) { Vector_Copy_rvec2( x, y, system->n ); } break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve( workspace->U, y, x, workspace->U->n, UPPER ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_LEVEL_SCHED_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: if ( x != y ) { Vector_Copy_rvec2( x, y, system->n ); } break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve_level_sched( (static_storage *) workspace, // workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_GC_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve_level_sched( (static_storage *) workspace, // workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); // permute_vector( workspace, x, workspace->H->n, TRUE, UPPER ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case JACOBI_ITER_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // if ( fresh_pre == TRUE ) // { // for ( i = 0; i < workspace->U->n; ++i ) // { // si = workspace->U->start[i]; // workspace->Dinv_U[i] = 1.0 / workspace->U->val[si]; // } // } // // jacobi_iter( workspace, workspace->U, workspace->Dinv_U, // y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; } } } /* Apply left-sided preconditioning while solving M^{-1}Ax = M^{-1}b * * system: * workspace: data struct containing matrices and vectors, stored in CSR * control: data struct containing parameters * data: struct containing timing simulation data (including performance data) * y: vector to which to apply preconditioning, * specific to internals of iterative solver being used * x (output): preconditioned vector * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner * side: used in determining how to apply preconditioner if the preconditioner is * factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU) * * Assumptions: * Matrices have non-zero diagonals * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ static void apply_preconditioner( reax_system const * const system, storage const * const workspace, control_params const * const control, simulation_data * const data, mpi_datatypes * const mpi_data, real const * const y, real * const x, int fresh_pre, int side ) { // int i, si; /* no preconditioning */ if ( control->cm_solver_pre_comp_type == NONE_PC ) { if ( x != y ) { Vector_Copy( x, y, system->n ); } } else { switch ( side ) { case LEFT: switch ( control->cm_solver_pre_app_type ) { case TRI_SOLVE_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: jacobi_app( workspace->Hdia_inv, y, x, system->n ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); // break; case SAI_PC: #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, H->NT, x ); #else Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, system->n, x ); #endif break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_LEVEL_SCHED_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: jacobi_app( workspace->Hdia_inv, y, x, system->n ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve_level_sched( (static_storage *) workspace, // workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); // break; case SAI_PC: #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, H->NT, x ); #else Sparse_MatVec( system, control, data, mpi_data, &workspace->H_app_inv, y, system->n, x ); #endif break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_GC_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // for ( i = 0; i < workspace->H->n; ++i ) // { // workspace->y_p[i] = y[i]; // } // // permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER ); // tri_solve_level_sched( (static_storage *) workspace, // workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case JACOBI_ITER_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // // construct D^{-1}_L // if ( fresh_pre == TRUE ) // { // for ( i = 0; i < workspace->L->n; ++i ) // { // si = workspace->L->start[i + 1] - 1; // workspace->Dinv_L[i] = 1.0 / workspace->L->val[si]; // } // } // // jacobi_iter( workspace, workspace->L, workspace->Dinv_L, // y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case RIGHT: switch ( control->cm_solver_pre_app_type ) { case TRI_SOLVE_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: if ( x != y ) { Vector_Copy( x, y, system->n ); } break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve( workspace->U, y, x, workspace->U->n, UPPER ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_LEVEL_SCHED_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: if ( x != y ) { Vector_Copy( x, y, system->n ); } break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve_level_sched( (static_storage *) workspace, // workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case TRI_SOLVE_GC_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // tri_solve_level_sched( (static_storage *) workspace, // workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); // permute_vector( workspace, x, workspace->H->n, TRUE, UPPER ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; case JACOBI_ITER_PA: switch ( control->cm_solver_pre_comp_type ) { case JACOBI_PC: case SAI_PC: fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); exit( INVALID_INPUT ); break; // case ICHOLT_PC: // case ILUT_PC: // case ILUTP_PC: // if ( fresh_pre == TRUE ) // { // for ( i = 0; i < workspace->U->n; ++i ) // { // si = workspace->U->start[i]; // workspace->Dinv_U[i] = 1.0 / workspace->U->val[si]; // } // } // // jacobi_iter( workspace, workspace->U, workspace->Dinv_U, // y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); // break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; default: fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); exit( INVALID_INPUT ); break; } break; } } } /* Steepest Descent * This function performs dual iteration for QEq (2 simultaneous solves) * */ int dual_SDM( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, rvec2 * const b, real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; rvec2 tmp, alpha, bnorm, sig; real redux[4]; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->q2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->q2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->q2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, workspace->q2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, workspace->d2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[2], &redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); bnorm[0] = SQRT( redux[0] ); bnorm[1] = SQRT( redux[1] ); sig[0] = redux[2]; sig[1] = redux[3]; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters; ++i ) { if ( SQRT(sig[0]) / bnorm[0] <= tol || SQRT(sig[1]) / bnorm[1] <= tol ) { break; } #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, H->NT, workspace->q2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, system->N, workspace->q2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->d2, workspace->q2, system->n, &redux[2], &redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif sig[0] = redux[0]; sig[1] = redux[1]; tmp[0] = redux[2]; tmp[1] = redux[3]; alpha[0] = sig[0] / tmp[0]; alpha[1] = sig[1] / tmp[1]; Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n ); Vector_Add_rvec2( workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, workspace->q2, FALSE, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, workspace->d2, FALSE, RIGHT ); } /* continue to solve the system that has not converged yet */ if ( sig[0] / bnorm[0] > tol ) { Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); i += SDM( system, control, data, workspace, H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); } else if ( sig[1] / bnorm[1] > tol ) { Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); i += SDM( system, control, data, workspace, H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", i ); fprintf( stderr, " [INFO] Rel. residual error (s solve): %f\n", SQRT(sig[0]) / bnorm[0] ); fprintf( stderr, " [INFO] Rel. residual error (t solve): %f\n", SQRT(sig[1]) / bnorm[1] ); return i; } return i; } /* Steepest Descent */ int SDM( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, real * const b, real tol, real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; real tmp, alpha, bnorm, sig; real redux[2]; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->q ); #else Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->q ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->q, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, workspace->q, fresh_pre, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, workspace->d, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( b, b, system->n ); redux[1] = Dot_local( workspace->r, workspace->d, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); bnorm = SQRT( redux[0] ); sig = redux[1]; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters && SQRT(sig) / bnorm > tol; ++i ) { #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, H->NT, workspace->q ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, system->N, workspace->q ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->r, workspace->d, system->n ); redux[1] = Dot_local( workspace->d, workspace->q, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif sig = redux[0]; tmp = redux[1]; alpha = sig / tmp; Vector_Add( x, alpha, workspace->d, system->n ); Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, workspace->q, FALSE, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, workspace->d, FALSE, RIGHT ); } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] SDM convergence failed (%d iters)\n", i ); fprintf( stderr, " [INFO] Rel. residual error: %f\n", SQRT(sig) / bnorm ); return i; } return i; } /* Dual iteration of the Preconditioned Conjugate Gradient Method * for QEq (2 simultaneous solves) */ int dual_CG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, rvec2 * const b, real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; rvec2 tmp, alpha, beta, r_norm, b_norm, sig_old, sig_new; real redux[6]; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->q2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->q2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->q2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, workspace->q2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, workspace->d2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif for ( j = 0; j < 6; ++j ) { redux[j] = 0.0; } Dot_local_rvec2( workspace->r2, workspace->d2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->d2, workspace->d2, system->n, &redux[2], &redux[3] ); Dot_local_rvec2( b, b, system->n, &redux[4], &redux[5] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); sig_new[0] = redux[0]; sig_new[1] = redux[1]; r_norm[0] = SQRT( redux[2] ); r_norm[1] = SQRT( redux[3] ); b_norm[0] = SQRT( redux[4] ); b_norm[1] = SQRT( redux[5] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters; ++i ) { if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol ) { break; } #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, H->NT, workspace->q2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, system->N, workspace->q2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = 0.0; redux[1] = 0.0; Dot_local_rvec2( workspace->d2, workspace->q2, system->n, &redux[0], &redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( &redux, &tmp, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif alpha[0] = sig_new[0] / tmp[0]; alpha[1] = sig_new[1] / tmp[1]; Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n ); Vector_Add_rvec2( workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, workspace->q2, FALSE, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, workspace->p2, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = 0.0; redux[1] = 0.0; redux[2] = 0.0; redux[3] = 0.0; Dot_local_rvec2( workspace->r2, workspace->p2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->p2, workspace->p2, system->n, &redux[2], &redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif sig_old[0] = sig_new[0]; sig_old[1] = sig_new[1]; sig_new[0] = redux[0]; sig_new[1] = redux[1]; r_norm[0] = SQRT( redux[2] ); r_norm[1] = SQRT( redux[3] ); beta[0] = sig_new[0] / sig_old[0]; beta[1] = sig_new[1] / sig_old[1]; Vector_Sum_rvec2( workspace->d2, 1.0, 1.0, workspace->p2, beta[0], beta[1], workspace->d2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } /* continue to solve the system that has not converged yet */ if ( r_norm[0] / b_norm[0] > tol ) { Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); i += CG( system, control, data, workspace, H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); } else if ( r_norm[1] / b_norm[1] > tol ) { Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); i += CG( system, control, data, workspace, H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] CG convergence failed!\n" ); return i; } return i; } /* Preconditioned Conjugate Gradient Method */ int CG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, real * const b, real tol, real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; real tmp, alpha, beta, r_norm, b_norm; real sig_old, sig_new; real redux[3]; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->q ); #else Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->q ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->q, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, workspace->q, fresh_pre, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, workspace->d, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->r, workspace->d, system->n ); redux[1] = Dot_local( workspace->d, workspace->d, system->n ); redux[2] = Dot_local( b, b, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); sig_new = redux[0]; r_norm = SQRT( redux[1] ); b_norm = SQRT( redux[2] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, H->NT, workspace->q ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, system->N, workspace->q ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif tmp = Parallel_Dot( workspace->d, workspace->q, system->n, MPI_COMM_WORLD ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif alpha = sig_new / tmp; Vector_Add( x, alpha, workspace->d, system->n ); Vector_Add( workspace->r, -1.0 * alpha, workspace->q, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, workspace->q, FALSE, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, workspace->p, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->r, workspace->p, system->n ); redux[1] = Dot_local( workspace->p, workspace->p, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif sig_old = sig_new; sig_new = redux[0]; r_norm = SQRT( redux[1] ); beta = sig_new / sig_old; Vector_Sum( workspace->d, 1.0, workspace->p, beta, workspace->d, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] CG convergence failed (%d iters)\n", i ); fprintf( stderr, " [INFO] Rel. residual error: %e\n", r_norm / b_norm ); return i; } return i; } /* Bi-conjugate gradient stabalized method with left preconditioning for * solving nonsymmetric linear systems. * This function performs dual iteration for QEq (2 simultaneous solves) * * system: * workspace: struct containing storage for workspace for the linear solver * control: struct containing parameters governing the simulation and numeric methods * data: struct containing simulation data (e.g., atom info) * H: sparse, symmetric matrix, lower half stored in CSR format * b: right-hand side of the linear system * tol: tolerence compared against the relative residual for determining convergence * x: inital guess * mpi_data: * * Reference: Netlib (in MATLAB) * http://www.netlib.org/templates/matlab/bicgstab.m * */ int dual_BiCGStab( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, rvec2 * const b, real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; rvec2 tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm; real time, redux[4]; #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->d2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->d2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->d2, system->n ); Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[2], &redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif b_norm[0] = SQRT( redux[0] ); b_norm[1] = SQRT( redux[1] ); r_norm[0] = SQRT( redux[2] ); r_norm[1] = SQRT( redux[3] ); if ( b_norm[0] == 0.0 ) { b_norm[0] = 1.0; } if ( b_norm[1] == 0.0 ) { b_norm[1] = 1.0; } Vector_Copy_rvec2( workspace->r_hat2, workspace->r2, system->n ); omega[0] = 1.0; omega[1] = 1.0; rho[0] = 1.0; rho[1] = 1.0; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif for ( i = 0; i < control->cm_solver_max_iters; ++i ) { if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol ) { break; } Dot_local_rvec2( workspace->r_hat2, workspace->r2, system->n, &redux[0], &redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif rho[0] = redux[0]; rho[1] = redux[1]; if ( rho[0] == 0.0 || rho[1] == 0.0 ) { break; } if ( i > 0 ) { beta[0] = (rho[0] / rho_old[0]) * (alpha[0] / omega[0]); beta[1] = (rho[1] / rho_old[1]) * (alpha[1] / omega[1]); Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->p2, -1.0 * omega[0], -1.0 * omega[1], workspace->z2, system->n ); Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->r2, beta[0], beta[1], workspace->q2, system->n ); } else { Vector_Copy_rvec2( workspace->p2, workspace->r2, system->n ); } #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->p2, workspace->y2, i == 0 ? fresh_pre : FALSE, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2, workspace->d2, i == 0 ? fresh_pre : FALSE, RIGHT ); #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, H->NT, workspace->z2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->d2, system->N, workspace->z2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Dot_local_rvec2( workspace->r_hat2, workspace->z2, system->n, &redux[0], &redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif tmp[0] = redux[0]; tmp[1] = redux[1]; alpha[0] = rho[0] / tmp[0]; alpha[1] = rho[1] / tmp[1]; Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n ); Dot_local_rvec2( workspace->q2, workspace->q2, system->n, &redux[0], &redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif tmp[0] = redux[0]; tmp[1] = redux[1]; /* early convergence check */ if ( tmp[0] < tol || tmp[1] < tol ) { Vector_Add_rvec2( x, alpha[0], alpha[1], workspace->d2, system->n ); break; } #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q2, workspace->y2, FALSE, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y2, workspace->q_hat2, FALSE, RIGHT ); #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2, H->NT, workspace->y2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat2, system->N, workspace->y2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Dot_local_rvec2( workspace->y2, workspace->q2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->y2, workspace->y2, system->n, &redux[2], &redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif sigma[0] = redux[0]; sigma[1] = redux[1]; tmp[0] = redux[2]; tmp[1] = redux[3]; omega[0] = sigma[0] / tmp[0]; omega[1] = sigma[1] / tmp[1]; Vector_Sum_rvec2( workspace->g2, alpha[0], alpha[1], workspace->d2, omega[0], omega[1], workspace->q_hat2, system->n ); Vector_Add_rvec2( x, 1.0, 1.0, workspace->g2, system->n ); Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->q2, -1.0 * omega[0], -1.0 * omega[1], workspace->y2, system->n ); Dot_local_rvec2( workspace->r2, workspace->r2, system->n, &redux[0], &redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif r_norm[0] = SQRT( redux[0] ); r_norm[1] = SQRT( redux[1] ); if ( omega[0] == 0.0 || omega[1] == 0.0 ) { break; } rho_old[0] = rho[0]; rho_old[1] = rho[1]; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } if ( (omega[0] == 0.0 || omega[1] == 0.0) && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i ); fprintf( stderr, " [INFO] omega = %e\n", omega ); } else if ( (rho[0] == 0.0 || rho[1] == 0.0) && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i ); fprintf( stderr, " [INFO] rho = %e\n", rho ); } /* continue to solve the system that has not converged yet */ if ( r_norm[0] / b_norm[0] > tol ) { Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); i += BiCGStab( system, control, data, workspace, H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); } else if ( r_norm[1] / b_norm[1] > tol ) { Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); i += BiCGStab( system, control, data, workspace, H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i ); fprintf( stderr, " [INFO] Rel. residual error (s solve): %e\n", r_norm[0] / b_norm[0] ); fprintf( stderr, " [INFO] Rel. residual error (t solve): %e\n", r_norm[1] / b_norm[1] ); } return i; } /* Bi-conjugate gradient stabalized method with left preconditioning for * solving nonsymmetric linear systems * * system: * workspace: struct containing storage for workspace for the linear solver * control: struct containing parameters governing the simulation and numeric methods * data: struct containing simulation data (e.g., atom info) * H: sparse, symmetric matrix, lower half stored in CSR format * b: right-hand side of the linear system * tol: tolerence compared against the relative residual for determining convergence * x: inital guess * mpi_data: * * Reference: Netlib (in MATLAB) * http://www.netlib.org/templates/matlab/bicgstab.m * */ int BiCGStab( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, real * const b, real tol, real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; real tmp, alpha, beta, omega, sigma, rho, rho_old, r_norm, b_norm; real time, redux[2]; #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->d ); #else Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->d ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->d, system->n ); redux[0] = Dot_local( b, b, system->n ); redux[1] = Dot_local( workspace->r, workspace->r, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif b_norm = SQRT( redux[0] ); r_norm = SQRT( redux[1] ); if ( b_norm == 0.0 ) { b_norm = 1.0; } Vector_Copy( workspace->r_hat, workspace->r, system->n ); omega = 1.0; rho = 1.0; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { redux[0] = Dot_local( workspace->r_hat, workspace->r, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif rho = redux[0]; if ( rho == 0.0 ) { break; } if ( i > 0 ) { beta = (rho / rho_old) * (alpha / omega); Vector_Sum( workspace->q, 1.0, workspace->p, -1.0 * omega, workspace->z, system->n ); Vector_Sum( workspace->p, 1.0, workspace->r, beta, workspace->q, system->n ); } else { Vector_Copy( workspace->p, workspace->r, system->n ); } #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->p, workspace->y, i == 0 ? fresh_pre : FALSE, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y, workspace->d, i == 0 ? fresh_pre : FALSE, RIGHT ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, H->NT, workspace->z ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->d, system->N, workspace->z ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->r_hat, workspace->z, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif tmp = redux[0]; alpha = rho / tmp; Vector_Sum( workspace->q, 1.0, workspace->r, -1.0 * alpha, workspace->z, system->n ); redux[0] = Dot_local( workspace->q, workspace->q, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif tmp = redux[0]; /* early convergence check */ if ( tmp < tol ) { Vector_Add( x, alpha, workspace->d, system->n ); break; } #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->q, workspace->y, FALSE, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->y, workspace->q_hat, FALSE, RIGHT ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat, H->NT, workspace->y ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->q_hat, system->N, workspace->y ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->y, workspace->q, system->n ); redux[1] = Dot_local( workspace->y, workspace->y, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif sigma = redux[0]; tmp = redux[1]; omega = sigma / tmp; Vector_Sum( workspace->g, alpha, workspace->d, omega, workspace->q_hat, system->n ); Vector_Add( x, 1.0, workspace->g, system->n ); Vector_Sum( workspace->r, 1.0, workspace->q, -1.0 * omega, workspace->y, system->n ); redux[0] = Dot_local( workspace->r, workspace->r, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Allreduce( MPI_IN_PLACE, redux, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif r_norm = SQRT( redux[0] ); if ( omega == 0.0 ) { break; } rho_old = rho; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } if ( omega == 0.0 && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i ); fprintf( stderr, " [INFO] omega = %e\n", omega ); } else if ( rho == 0.0 && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] BiCGStab numeric breakdown (%d iters)\n", i ); fprintf( stderr, " [INFO] rho = %e\n", rho ); } else if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] BiCGStab convergence failed (%d iters)\n", i ); fprintf( stderr, " [INFO] Rel. residual error: %e\n", r_norm / b_norm ); } return i; } /* Dual iteration for the Pipelined Preconditioned Conjugate Gradient Method * for QEq (2 simaltaneous solves) * * References: * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, * P. Ghysels and W. Vanroose, Parallel Computing, 2014. * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods, * Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference * for High Performance Computing, Networking, Storage and Analysis, 2016. * */ int dual_PIPECG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, rvec2 * const b, real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; rvec2 alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm; real redux[8]; MPI_Request req; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->u2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->u2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->u2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, workspace->m2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->m2, workspace->u2, fresh_pre, RIGHT ); #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2, H->NT, workspace->w2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2, system->N, workspace->w2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif for ( j = 0; j < 8; ++j ) { redux[j] = 0.0; } Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->r2, workspace->u2, system->n, &redux[2], &redux[3] ); Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] ); Dot_local_rvec2( b, b, system->n, &redux[6], &redux[7] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 8, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2, workspace->n2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2, workspace->m2, fresh_pre, RIGHT ); #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, H->NT, workspace->n2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, system->N, workspace->n2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); delta[0] = redux[0]; delta[1] = redux[1]; gamma_new[0] = redux[2]; gamma_new[1] = redux[3]; r_norm[0] = SQRT( redux[4] ); r_norm[1] = SQRT( redux[5] ); b_norm[0] = SQRT( redux[6] ); b_norm[1] = SQRT( redux[7] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters; ++i ) { if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol ) { break; } if ( i > 0 ) { beta[0] = gamma_new[0] / gamma_old[0]; beta[1] = gamma_new[1] / gamma_old[1]; alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]); alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]); } else { beta[0] = 0.0; beta[1] = 0.0; alpha[0] = gamma_new[0] / delta[0]; alpha[1] = gamma_new[1] / delta[1]; } Vector_Sum_rvec2( workspace->z2, 1.0, 1.0, workspace->n2, beta[0], beta[1], workspace->z2, system->n ); Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->m2, beta[0], beta[1], workspace->q2, system->n ); Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->u2, beta[0], beta[1], workspace->p2, system->n ); Vector_Sum_rvec2( workspace->d2, 1.0, 1.0, workspace->w2, beta[0], beta[1], workspace->d2, system->n ); Vector_Sum_rvec2( x, 1.0, 1.0, x, alpha[0], alpha[1], workspace->p2, system->n ); Vector_Sum_rvec2( workspace->u2, 1.0, 1.0, workspace->u2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n ); Vector_Sum_rvec2( workspace->w2, 1.0, 1.0, workspace->w2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n ); Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->d2, system->n ); for ( j = 0; j < 6; ++j ) { redux[j] = 0.0; } Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->r2, workspace->u2, system->n, &redux[2], &redux[3] ); Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2, workspace->n2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2, workspace->m2, fresh_pre, RIGHT ); #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, H->NT, workspace->n2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, system->N, workspace->n2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif gamma_old[0] = gamma_new[0]; gamma_old[1] = gamma_new[1]; ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); delta[0] = redux[0]; delta[1] = redux[1]; gamma_new[0] = redux[2]; gamma_new[1] = redux[3]; r_norm[0] = SQRT( redux[4] ); r_norm[1] = SQRT( redux[5] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif } /* continue to solve the system that has not converged yet */ if ( r_norm[0] / b_norm[0] > tol ) { Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); i += PIPECG( system, control, data, workspace, H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); } else if ( r_norm[1] / b_norm[1] > tol ) { Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); i += PIPECG( system, control, data, workspace, H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" ); return i; } return i; } /* Pipelined Preconditioned Conjugate Gradient Method * * References: * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, * P. Ghysels and W. Vanroose, Parallel Computing, 2014. * 2) Scalable Non-blocking Preconditioned Conjugate Gradient Methods, * Paul R. Eller and William Gropp, SC '16 Proceedings of the International Conference * for High Performance Computing, Networking, Storage and Analysis, 2016. * */ int PIPECG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, real * const b, real tol, real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; real alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm; real redux[4]; MPI_Request req; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->u ); #else Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->u ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->u, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, workspace->m, fresh_pre, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->m, workspace->u, fresh_pre, RIGHT ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->u, H->NT, workspace->w ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->u, system->N, workspace->w ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->w, workspace->u, system->n ); redux[1] = Dot_local( workspace->r, workspace->u, system->n ); redux[2] = Dot_local( workspace->u, workspace->u, system->n ); redux[3] = Dot_local( b, b, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w, workspace->n, FALSE, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n, workspace->m, FALSE, RIGHT ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->m, H->NT, workspace->n ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->m, system->N, workspace->n ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); delta = redux[0]; gamma_new = redux[1]; r_norm = SQRT( redux[2] ); b_norm = SQRT( redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { if ( i > 0 ) { beta = gamma_new / gamma_old; alpha = gamma_new / (delta - beta / alpha * gamma_new); } else { beta = 0.0; alpha = gamma_new / delta; } Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n ); Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n ); Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n ); Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n ); Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n ); Vector_Sum( workspace->u, 1.0, workspace->u, -1.0 * alpha, workspace->q, system->n ); Vector_Sum( workspace->w, 1.0, workspace->w, -1.0 * alpha, workspace->z, system->n ); Vector_Sum( workspace->r, 1.0, workspace->r, -1.0 * alpha, workspace->d, system->n ); redux[0] = Dot_local( workspace->w, workspace->u, system->n ); redux[1] = Dot_local( workspace->r, workspace->u, system->n ); redux[2] = Dot_local( workspace->u, workspace->u, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w, workspace->n, FALSE, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n, workspace->m, FALSE, RIGHT ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->m, H->NT, workspace->n ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->m, system->N, workspace->n ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif gamma_old = gamma_new; ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); delta = redux[0]; gamma_new = redux[1]; r_norm = SQRT( redux[2] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] PIPECG convergence failed!\n" ); return i; } return i; } /* Pipelined Preconditioned Conjugate Residual Method. * This function performs dual iteration for QEq (2 simultaneous solves) * * References: * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, * P. Ghysels and W. Vanroose, Parallel Computing, 2014. * */ int dual_PIPECR( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, rvec2 * const b, real tol, rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; rvec2 alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm; real redux[6]; MPI_Request req; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->u2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->u2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, b, -1.0, -1.0, workspace->u2, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r2, workspace->n2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2, workspace->u2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Dot_local_rvec2( b, b, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[2], &redux[3] ); ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2, H->NT, workspace->w2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->u2, system->N, workspace->w2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); b_norm[0] = SQRT( redux[0] ); b_norm[1] = SQRT( redux[1] ); r_norm[0] = SQRT( redux[2] ); r_norm[1] = SQRT( redux[3] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters; ++i ) { if ( r_norm[0] / b_norm[0] <= tol || r_norm[1] / b_norm[1] <= tol ) { break; } dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2, workspace->n2, fresh_pre, LEFT ); dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2, workspace->m2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Dot_local_rvec2( workspace->w2, workspace->u2, system->n, &redux[0], &redux[1] ); Dot_local_rvec2( workspace->m2, workspace->w2, system->n, &redux[2], &redux[3] ); Dot_local_rvec2( workspace->u2, workspace->u2, system->n, &redux[4], &redux[5] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 6, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(NEUTRAL_TERRITORY) Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, H->NT, workspace->n2 ); #else Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2, system->N, workspace->n2 ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); gamma_new[0] = redux[0]; gamma_new[1] = redux[1]; delta[0] = redux[2]; delta[1] = redux[3]; r_norm[0] = SQRT( redux[4] ); r_norm[1] = SQRT( redux[5] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif if ( i > 0 ) { beta[0] = gamma_new[0] / gamma_old[0]; beta[1] = gamma_new[1] / gamma_old[1]; alpha[0] = gamma_new[0] / (delta[0] - beta[0] / alpha[0] * gamma_new[0]); alpha[1] = gamma_new[1] / (delta[1] - beta[1] / alpha[1] * gamma_new[1]); } else { beta[0] = 0.0; beta[1] = 0.0; alpha[0] = gamma_new[0] / delta[0]; alpha[1] = gamma_new[1] / delta[1]; } Vector_Sum_rvec2( workspace->z2, 1.0, 1.0, workspace->n2, beta[0], beta[1], workspace->z2, system->n ); Vector_Sum_rvec2( workspace->q2, 1.0, 1.0, workspace->m2, beta[0], beta[1], workspace->q2, system->n ); Vector_Sum_rvec2( workspace->p2, 1.0, 1.0, workspace->u2, beta[0], beta[1], workspace->p2, system->n ); Vector_Sum_rvec2( workspace->d2, 1.0, 1.0, workspace->w2, beta[0], beta[1], workspace->d2, system->n ); Vector_Sum_rvec2( x, 1.0, 1.0, x, alpha[0], alpha[1], workspace->p2, system->n ); Vector_Sum_rvec2( workspace->u2, 1.0, 1.0, workspace->u2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->q2, system->n ); Vector_Sum_rvec2( workspace->w2, 1.0, 1.0, workspace->w2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->z2, system->n ); Vector_Sum_rvec2( workspace->r2, 1.0, 1.0, workspace->r2, -1.0 * alpha[0], -1.0 * alpha[1], workspace->d2, system->n ); gamma_old[0] = gamma_new[0]; gamma_old[1] = gamma_new[1]; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } /* continue to solve the system that has not converged yet */ if ( r_norm[0] / b_norm[0] > tol ) { Vector_Copy_From_rvec2( workspace->s, workspace->x, 0, system->n ); i += PIPECR( system, control, data, workspace, H, workspace->b_s, tol, workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->s, 0, system->n ); } else if ( r_norm[1] / b_norm[1] > tol ) { Vector_Copy_From_rvec2( workspace->t, workspace->x, 1, system->n ); i += PIPECR( system, control, data, workspace, H, workspace->b_t, tol, workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->x, workspace->t, 1, system->n ); } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" ); return i; } return i; } /* Pipelined Preconditioned Conjugate Residual Method * * References: * 1) Hiding global synchronization latency in the preconditioned Conjugate Gradient algorithm, * P. Ghysels and W. Vanroose, Parallel Computing, 2014. * */ int PIPECR( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix * const H, real * const b, real tol, real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { int i, j, ret; real alpha, beta, delta, gamma_old, gamma_new, r_norm, b_norm; real redux[3]; MPI_Request req; #if defined(LOG_PERFORMANCE) real time; #endif #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, x, H->NT, workspace->u ); #else Sparse_MatVec( system, control, data, mpi_data, H, x, system->N, workspace->u ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif Vector_Sum( workspace->r, 1.0, b, -1.0, workspace->u, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif apply_preconditioner( system, workspace, control, data, mpi_data, workspace->r, workspace->n, fresh_pre, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n, workspace->u, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( b, b, system->n ); redux[1] = Dot_local( workspace->u, workspace->u, system->n ); ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->u, H->NT, workspace->w ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->u, system->N, workspace->w ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); b_norm = SQRT( redux[0] ); r_norm = SQRT( redux[1] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i ) { apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w, workspace->n, fresh_pre, LEFT ); apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n, workspace->m, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif redux[0] = Dot_local( workspace->w, workspace->u, system->n ); redux[1] = Dot_local( workspace->m, workspace->w, system->n ); redux[2] = Dot_local( workspace->u, workspace->u, system->n ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif ret = MPI_Iallreduce( MPI_IN_PLACE, redux, 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); #if defined(NEUTRAL_TERRITORY) Sparse_MatVec( system, control, data, mpi_data, H, workspace->m, H->NT, workspace->n ); #else Sparse_MatVec( system, control, data, mpi_data, H, workspace->m, system->N, workspace->n ); #endif #if defined(LOG_PERFORMANCE) time = Get_Time( ); #endif ret = MPI_Wait( &req, MPI_STATUS_IGNORE ); Check_MPI_Error( ret, __FILE__, __LINE__ ); gamma_new = redux[0]; delta = redux[1]; r_norm = SQRT( redux[2] ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_allreduce ); #endif if ( i > 0 ) { beta = gamma_new / gamma_old; alpha = gamma_new / (delta - beta / alpha * gamma_new); } else { beta = 0.0; alpha = gamma_new / delta; } Vector_Sum( workspace->z, 1.0, workspace->n, beta, workspace->z, system->n ); Vector_Sum( workspace->q, 1.0, workspace->m, beta, workspace->q, system->n ); Vector_Sum( workspace->p, 1.0, workspace->u, beta, workspace->p, system->n ); Vector_Sum( workspace->d, 1.0, workspace->w, beta, workspace->d, system->n ); Vector_Sum( x, 1.0, x, alpha, workspace->p, system->n ); Vector_Sum( workspace->u, 1.0, workspace->u, -1.0 * alpha, workspace->q, system->n ); Vector_Sum( workspace->w, 1.0, workspace->w, -1.0 * alpha, workspace->z, system->n ); Vector_Sum( workspace->r, 1.0, workspace->r, -1.0 * alpha, workspace->d, system->n ); gamma_old = gamma_new; #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif } if ( i >= control->cm_solver_max_iters && system->my_rank == MASTER_NODE ) { fprintf( stderr, "[WARNING] PIPECR convergence failed!\n" ); return i; } return i; }