/*---------------------------------------------------------------------- SerialReax - Reax Force Field Simulator Copyright (2010) Purdue University Hasan Metin Aktulga, haktulga@cs.purdue.edu Joseph Fogarty, jcfogart@mail.usf.edu Sagar Pandit, pandit@usf.edu Ananth Y Grama, ayg@cs.purdue.edu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details: <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ #include "QEq.h" #include "allocate.h" #include "GMRES.h" #include "list.h" #include "print_utils.h" int compare_matrix_entry(const void *v1, const void *v2) { /* larger element has larger column index */ return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; } void Sort_Matrix_Rows( sparse_matrix *A ) { int i, si, ei; /* sort each row of A using column indices */ for ( i = 0; i < A->n; ++i ) { si = A->start[i]; ei = A->start[i + 1]; /* polymorphic sort in standard C library */ qsort( &(A->entries[si]), ei - si, sizeof(sparse_matrix_entry), compare_matrix_entry ); } } void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol ) { int i, j, k; real val; /* init droptol to 0 */ for ( i = 0; i < A->n; ++i ) droptol[i] = 0; /* calculate sqaure of the norm of each row */ for ( i = 0; i < A->n; ++i ) { for ( k = A->start[i]; k < A->start[i + 1] - 1; ++k ) { j = A->entries[k].j; val = A->entries[k].val; droptol[i] += val * val; droptol[j] += val * val; } val = A->entries[k].val; // diagonal entry droptol[i] += val * val; } /* calculate local droptol for each row */ //fprintf( stderr, "droptol: " ); for ( i = 0; i < A->n; ++i ) { //fprintf( stderr, "%f-->", droptol[i] ); droptol[i] = SQRT( droptol[i] ) * dtol; //fprintf( stderr, "%f ", droptol[i] ); } //fprintf( stderr, "\n" ); } int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) { int i, j, pj; int fillin; real val; fillin = 0; //fprintf( stderr, "n: %d\n", A->n ); for ( i = 0; i < A->n; ++i ) for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) { j = A->entries[pj].j; val = A->entries[pj].val; //fprintf( stderr, "i: %d, j: %d", i, j ); if ( fabs(val) > droptol[i] ) ++fillin; } return fillin + A->n; } /* Incomplete Cholesky factorization with thresholding */ real ICHOLT( sparse_matrix *A, real *droptol, sparse_matrix *L, sparse_matrix *U ) { sparse_matrix_entry tmp[1000]; int i, j, pj, k1, k2, tmptop, Ltop; real val; int *Utop; struct timeval start, stop; gettimeofday( &start, NULL ); Utop = (int*) malloc((A->n + 1) * sizeof(int)); // clear variables Ltop = 0; tmptop = 0; for ( i = 0; i <= A->n; ++i ) L->start[i] = U->start[i] = 0; for ( i = 0; i < A->n; ++i ) Utop[i] = 0; //fprintf( stderr, "n: %d\n", A->n ); for ( i = 0; i < A->n; ++i ) { L->start[i] = Ltop; tmptop = 0; for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) { j = A->entries[pj].j; val = A->entries[pj].val; //fprintf( stderr, "i: %d, j: %d", i, j ); if ( fabs(val) > droptol[i] ) { k1 = 0; k2 = L->start[j]; while ( k1 < tmptop && k2 < L->start[j + 1] ) { if ( tmp[k1].j < L->entries[k2].j ) ++k1; else if ( tmp[k1].j > L->entries[k2].j ) ++k2; else val -= (tmp[k1++].val * L->entries[k2++].val); } // L matrix is lower triangular, // so right before the start of next row comes jth diagonal val /= L->entries[L->start[j + 1] - 1].val; tmp[tmptop].j = j; tmp[tmptop].val = val; ++tmptop; } //fprintf( stderr, " -- done\n" ); } // compute the ith diagonal in L // sanity check if ( A->entries[pj].j != i ) { fprintf( stderr, "i=%d, badly built A matrix!\n", i ); exit(999); } val = A->entries[pj].val; for ( k1 = 0; k1 < tmptop; ++k1 ) val -= (tmp[k1].val * tmp[k1].val); tmp[tmptop].j = i; tmp[tmptop].val = SQRT(val); // apply the dropping rule once again //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); //for( k1 = 0; k1<= tmptop; ++k1 ) // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); //fprintf( stderr, "\n" ); //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); for ( k1 = 0; k1 < tmptop; ++k1 ) if ( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ) { L->entries[Ltop].j = tmp[k1].j; L->entries[Ltop].val = tmp[k1].val; U->start[tmp[k1].j + 1]++; ++Ltop; //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); } // keep the diagonal in any case L->entries[Ltop].j = tmp[k1].j; L->entries[Ltop].val = tmp[k1].val; ++Ltop; //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); } L->start[i] = Ltop; // fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); /* U = L^T (Cholesky factorization) */ for ( i = 1; i <= U->n; ++i ) { Utop[i] = U->start[i] = U->start[i] + U->start[i - 1] + 1; } for ( i = 0; i < L->n; ++i ) { for ( pj = L->start[i]; pj < L->start[i + 1]; ++pj ) { j = L->entries[pj].j; U->entries[Utop[j]].j = i; U->entries[Utop[j]].val = L->entries[pj].val; Utop[j]++; } } // fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); free(Utop); gettimeofday( &stop, NULL ); return (stop.tv_sec + stop.tv_usec / 1000000.0) - (start.tv_sec + start.tv_usec / 1000000.0); } /* Fine-grained (parallel) incomplete Cholesky factorization * * Reference: * Edmond Chow and Aftab Patel * Fine-Grained Parallel Incomplete LU Factorization * SIAM J. Sci. Comp. */ static void ICHOL_PAR( const sparse_matrix * const A, const unsigned int sweeps, sparse_matrix * const U_t, sparse_matrix * const U ) { unsigned int i, j, k, pj, x = 0, y = 0, ei_x, ei_y; real *D, *D_inv, sum; sparse_matrix *DAD; int *Utop; if ( Allocate_Matrix( &DAD, A->n, A->m ) == 0 ) { fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); exit(INSUFFICIENT_SPACE); } D = (real*) malloc(A->n * sizeof(real)); D_inv = (real*) malloc(A->n * sizeof(real)); Utop = (int*) malloc((A->n + 1) * sizeof(int)); for ( i = 0; i < A->n; ++i ) { D_inv[i] = SQRT( A->entries[A->start[i + 1] - 1].val ); D[i] = 1. / D_inv[i]; } for ( i = 0; i <= A->n; ++i ) { U->start[i] = 0; Utop[i] = 0; } /* to get convergence, A must have unit diagonal, so apply * transformation DAD, where D = D(1./sqrt(D(A))) */ memcpy( DAD->start, A->start, sizeof(int) * (A->n + 1) ); for ( i = 0; i < A->n; ++i ) { /* non-diagonals */ for ( pj = A->start[i]; pj < A->start[i + 1] - 1; ++pj ) { DAD->entries[pj].j = A->entries[pj].j; DAD->entries[pj].val = A->entries[pj].val * D[i] * D[A->entries[pj].j]; } /* diagonal */ DAD->entries[pj].j = A->entries[pj].j; DAD->entries[pj].val = 1.; } /* initial guesses for U^T, * assume: A and DAD symmetric and stored lower triangular */ memcpy( U_t->start, DAD->start, sizeof(int) * (DAD->n + 1) ); memcpy( U_t->entries, DAD->entries, sizeof(sparse_matrix_entry) * (DAD->m) ); for ( i = 0; i < sweeps; ++i ) { /* for each nonzero */ #pragma omp parallel for schedule(guided) \ default(none) shared(DAD) private(sum, ei_x, ei_y, k) firstprivate(x, y) for ( j = 0; j < A->start[A->n]; ++j ) { sum = ZERO; /* determine row bounds of current nonzero */ x = 0; ei_x = 0; for ( k = 0; k <= A->n; ++k ) { if ( U_t->start[k] > j ) { x = U_t->start[k - 1]; ei_x = U_t->start[k]; break; } } /* column bounds of current nonzero */ y = U_t->start[U_t->entries[j].j]; ei_y = U_t->start[U_t->entries[j].j + 1]; /* sparse dot product: dot( U^T(i,1:j-1), U^T(j,1:j-1) ) */ while ( U_t->entries[x].j < U_t->entries[j].j && U_t->entries[y].j < U_t->entries[j].j && x < ei_x && y < ei_y ) { if ( U_t->entries[x].j == U_t->entries[y].j ) { sum += (U_t->entries[x].val * U_t->entries[y].val); ++x; ++y; } else if ( U_t->entries[x].j < U_t->entries[y].j ) { ++x; } else { ++y; } } sum = DAD->entries[j].val - sum; /* diagonal entries */ if ( (k - 1) == U_t->entries[j].j ) { /* sanity check */ if ( sum < ZERO ) { fprintf( stderr, "Numeric breakdown in ICHOL Terminating.\n"); #if defined(DEBUG_FOCUS) fprintf( stderr, "A(%5d,%5d) = %10.3f\n", k - 1, A->entries[j].j, A->entries[j].val ); fprintf( stderr, "sum = %10.3f\n", sum); #endif exit(NUMERIC_BREAKDOWN); } U_t->entries[j].val = SQRT( sum ); } /* non-diagonal entries */ else { U_t->entries[j].val = sum / U_t->entries[ei_y - 1].val; } } } /* apply inverse transformation D^{-1}U^{T}, * since DAD \approx U^{T}U, so * D^{-1}DADD^{-1} = A \approx D^{-1}U^{T}UD^{-1} */ for ( i = 0; i < A->n; ++i ) { for ( pj = A->start[i]; pj < A->start[i + 1]; ++pj ) { U_t->entries[pj].val *= D_inv[i]; } } #if defined(DEBUG_FOCUS) fprintf( stderr, "nnz(L): %d, max: %d\n", U_t->start[U_t->n], U_t->n * 50 ); #endif /* transpose U^{T} and copy into U */ for ( i = 0; i < U_t->n; ++i ) { /* count nonzeros in each row of U (columns of U^{T}), * store in U->start */ for ( pj = U_t->start[i]; pj < U_t->start[i + 1]; ++pj ) { U->start[U_t->entries[pj].j + 1]++; } } /* set correct row pointer in U */ for ( i = 1; i <= U->n; ++i ) { Utop[i] = U->start[i] = U->start[i] + U->start[i - 1]; } /* for each row */ for ( i = 0; i < U_t->n; ++i ) { /* for each nonzero (column-wise) in U^T */ for ( pj = U_t->start[i]; pj < U_t->start[i + 1]; ++pj ) { j = U_t->entries[pj].j; U->entries[Utop[j]].j = i; U->entries[Utop[j]].val = U_t->entries[pj].val; /* Utop contains pointer within rows of U * (columns of U^T) to add next nonzero, so increment */ Utop[j]++; } } #if defined(DEBUG_FOCUS) fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); #endif Deallocate_Matrix( DAD ); free(D_inv); free(D); free(Utop); } void Init_MatVec( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list *far_nbrs ) { int i, fillin; real s_tmp, t_tmp; // char fname[100]; if (control->refactor > 0 && ((data->step - data->prev_steps) % control->refactor == 0 || workspace->L == NULL)) { //Print_Linear_System( system, control, workspace, data->step ); Sort_Matrix_Rows( workspace->H ); Sort_Matrix_Rows( workspace->H_sp ); //fprintf( stderr, "H matrix sorted\n" ); Calculate_Droptol( workspace->H, workspace->droptol, control->droptol ); //fprintf( stderr, "drop tolerances calculated\n" ); if ( workspace->L == NULL ) { fillin = Estimate_LU_Fill( workspace->H, workspace->droptol ); if ( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ) { fprintf( stderr, "not enough memory for LU matrices. terminating.\n" ); exit(INSUFFICIENT_SPACE); } /* factors have sparsity pattern as H */ // if ( Allocate_Matrix( &(workspace->L), workspace->H->n, workspace->H->m ) == 0 || // Allocate_Matrix( &(workspace->U), workspace->H->n, workspace->H->m ) == 0 ) // { // fprintf( stderr, "not enough memory for preconditioning matrices. terminating.\n" ); // exit(INSUFFICIENT_SPACE); // } #if defined(DEBUG_FOCUS) fprintf( stderr, "fillin = %d\n", fillin ); fprintf( stderr, "allocated memory: L = U = %ldMB\n", fillin * sizeof(sparse_matrix_entry) / (1024 * 1024) ); #endif } data->timing.pre_comp += ICHOLT( workspace->H, workspace->droptol, workspace->L, workspace->U ); // data->timing.pre_comp += ICHOLT( workspace->H_sp, workspace->droptol, workspace->L, workspace->U ); // TODO: add parameters for sweeps to control file // ICHOL_PAR( workspace->H, 1, workspace->L, workspace->U ); // fprintf( stderr, "condest = %f\n", condest(workspace->L, workspace->U) ); #if defined(DEBUG_FOCUS) sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->L, fname ); sprintf( fname, "%s.U%d.out", control->sim_name, data->step ); Print_Sparse_Matrix2( workspace->U, fname ); fprintf( stderr, "icholt-" ); //sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); //Print_Sparse_Matrix2( workspace->L, fname ); //Print_Sparse_Matrix( U ); #endif } /* extrapolation for s & t */ for ( i = 0; i < system->N; ++i ) { // no extrapolation //s_tmp = workspace->s[0][i]; //t_tmp = workspace->t[0][i]; // linear //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; // quadratic //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); t_tmp = workspace->t[2][i] + 3 * (workspace->t[0][i] - workspace->t[1][i]); // cubic s_tmp = 4 * (workspace->s[0][i] + workspace->s[2][i]) - (6 * workspace->s[1][i] + workspace->s[3][i] ); //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - // (6 * workspace->t[1][i] + workspace->t[3][i] ); // 4th order //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; workspace->s[4][i] = workspace->s[3][i]; workspace->s[3][i] = workspace->s[2][i]; workspace->s[2][i] = workspace->s[1][i]; workspace->s[1][i] = workspace->s[0][i]; workspace->s[0][i] = s_tmp; workspace->t[4][i] = workspace->t[3][i]; workspace->t[3][i] = workspace->t[2][i]; workspace->t[2][i] = workspace->t[1][i]; workspace->t[1][i] = workspace->t[0][i]; workspace->t[0][i] = t_tmp; } } void Calculate_Charges( reax_system *system, static_storage *workspace ) { int i; real u, s_sum, t_sum; s_sum = t_sum = 0.; for ( i = 0; i < system->N; ++i ) { s_sum += workspace->s[0][i]; t_sum += workspace->t[0][i]; } u = s_sum / t_sum; for ( i = 0; i < system->N; ++i ) system->atoms[i].q = workspace->s[0][i] - u * workspace->t[0][i]; } void QEq( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list *far_nbrs, output_controls *out_control ) { int matvecs; Init_MatVec( system, control, data, workspace, far_nbrs ); // if( data->step == 0 || data->step == 100 ) // Print_Linear_System( system, control, workspace, data->step ); //TODO: add parameters in control file for solver choice and options // matvecs = GMRES( workspace, workspace->H, // workspace->b_s, control->q_err, workspace->s[0], out_control->log, &(data->timing.pre_app) ); // matvecs += GMRES( workspace, workspace->H, // workspace->b_t, control->q_err, workspace->t[0], out_control->log, &(data->timing.pre_app) ); // matvecs = GMRES_HouseHolder( workspace, workspace->H, // workspace->b_s, control->q_err, workspace->s[0], out_control->log ); // matvecs += GMRES_HouseHolder( workspace, workspace->H, // workspace->b_t, control->q_err, workspace->t[0], out_control->log ); matvecs = PGMRES( workspace, workspace->H, workspace->b_s, control->q_err, workspace->L, workspace->U, workspace->s[0], out_control->log, &(data->timing.pre_app) ); matvecs += PGMRES( workspace, workspace->H, workspace->b_t, control->q_err, workspace->L, workspace->U, workspace->t[0], out_control->log, &(data->timing.pre_app) ); // matvecs = PGMRES_Jacobi( workspace, workspace->H, workspace->b_s, control->q_err, // workspace->L, workspace->U, workspace->s[0], out_control->log, &(data->timing.pre_app) ); // matvecs += PGMRES_Jacobi( workspace, workspace->H, workspace->b_t, control->q_err, // workspace->L, workspace->U, workspace->t[0], out_control->log, &(data->timing.pre_app) ); //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, // workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1; ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, // workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1; //matvecs = CG( workspace, workspace->H, // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; //matvecs += CG( workspace, workspace->H, // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; //matvecs = SDM( workspace, workspace->H, // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; //matvecs += SDM( workspace, workspace->H, // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; data->timing.matvecs += matvecs; #if defined(DEBUG_FOCUS) fprintf( stderr, "linsolve-" ); #endif Calculate_Charges( system, workspace ); //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", // data->step, // workspace->s[0][0], workspace->t[0][0], // workspace->s[0][1], workspace->t[0][1], // workspace->s[0][2], workspace->t[0][2] ); // if( data->step == control->nsteps ) //Print_Charges( system, control, workspace, data->step ); }