diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index 236bc228ee1b6f5385c2dfa3e4be159a1659ac72..619cee2d06ddc676dec6d87f7f747b50a9008a7b 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -538,14 +538,30 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control case SDM_S: sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); #if defined(DUAL_SOLVER) sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); #endif break; @@ -580,70 +596,178 @@ void Cuda_Allocate_Workspace_Part2( reax_system *system, control_params *control case BiCGStab_S: sCudaMalloc( (void **) &workspace->y, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->y, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->g, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->g, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->z, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->z, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r_hat, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r_hat, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q_hat, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q_hat, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); #if defined(DUAL_SOLVER) sCudaMalloc( (void **) &workspace->y2, total_rvec2, __FILE__, 
__LINE__ ); + sCudaMemsetAsync( workspace->y2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->g2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->g2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->z2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->z2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r_hat2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r_hat2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q_hat2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q_hat2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); #endif break; case PIPECG_S: sCudaMalloc( (void **) &workspace->z, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->z, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->m, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->m, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->n, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->n, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->u, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->u, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->w, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->w, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); #if defined(DUAL_SOLVER) sCudaMalloc( (void **) &workspace->z2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->z2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( 
workspace->d2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->m2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->m2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->n2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->n2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->u2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->u2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->w2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->w2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); #endif break; case PIPECR_S: sCudaMalloc( (void **) &workspace->z, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->z, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->m, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->m, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->n, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->n, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->u, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->u, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->w, total_real, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->w, FALSE, total_real, + control->streams[0], __FILE__, __LINE__ ); #if defined(DUAL_SOLVER) sCudaMalloc( (void **) &workspace->z2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->z2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->r2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->r2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->d2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->d2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->q2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->q2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->p2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->p2, FALSE, total_rvec2, + 
control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->m2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->m2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->n2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->n2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->u2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->u2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); sCudaMalloc( (void **) &workspace->w2, total_rvec2, __FILE__, __LINE__ ); + sCudaMemsetAsync( workspace->w2, FALSE, total_rvec2, + control->streams[0], __FILE__, __LINE__ ); #endif break; diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index 06bb3d3679b654db14eaebc98f22745f8bc4052b..81c00c7f28ce803f62edee3053fb6147d5743715 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -21,12 +21,16 @@ #include "cuda_charges.h" +#include "cuda_allocate.h" +#include "cuda_copy.h" #include "cuda_reduction.h" #include "cuda_spar_lin_alg.h" #include "cuda_utils.h" +#include "../allocate.h" #include "../charges.h" #include "../comm_tools.h" +#include "../lin_alg.h" #include "../tool_box.h" #if !defined(MPIX_CUDA_AWARE_SUPPORT) || !MPIX_CUDA_AWARE_SUPPORT @@ -305,9 +309,27 @@ static void Setup_Preconditioner_QEq( reax_system const * const system, break; case SAI_PC: -// setup_sparse_approx_inverse( system, data, workspace, mpi_data, -// &workspace->H, &workspace->H_spar_patt, -// control->nprocs, control->cm_solver_pre_comp_sai_thres ); + if ( workspace->H.allocated == FALSE ) + { + Allocate_Matrix( &workspace->H, + workspace->d_workspace->H.n, workspace->d_workspace->H.n_max, + workspace->d_workspace->H.m, workspace->d_workspace->H.format ); + } + else if ( workspace->H.m < workspace->d_workspace->H.m + || workspace->H.n_max < workspace->d_workspace->H.n_max ) + { + Deallocate_Matrix( &workspace->H ); + Allocate_Matrix( &workspace->H, + workspace->d_workspace->H.n, workspace->d_workspace->H.n_max, + workspace->d_workspace->H.m, workspace->d_workspace->H.format ); + } + + Cuda_Copy_Matrix_Device_to_Host( &workspace->H, &workspace->d_workspace->H, + control->streams[4] ); + + setup_sparse_approx_inverse( system, data, workspace, mpi_data, + &workspace->H, &workspace->H_spar_patt, + control->nprocs, control->cm_solver_pre_comp_sai_thres ); break; default: @@ -342,9 +364,10 @@ static void Setup_Preconditioner_ACKS2( reax_system const * const system, static void Compute_Preconditioner_QEq( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, - mpi_datatypes const * const mpi_data ) + mpi_datatypes * const mpi_data ) { #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) + int ret; real t_pc, total_pc; #endif @@ -355,16 +378,37 @@ static void Compute_Preconditioner_QEq( reax_system const * const system, else if ( control->cm_solver_pre_comp_type == SAI_PC ) { #if defined(HAVE_LAPACKE) || defined(HAVE_LAPACKE_MKL) -// t_pc = sparse_approx_inverse( system, data, workspace, mpi_data, -// &workspace->H, workspace->H_spar_patt, &workspace->H_app_inv, control->nprocs ); -// -// ret = MPI_Reduce( &t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD ); -// Check_MPI_Error( ret, __FILE__, __LINE__ ); -// -// if( system->my_rank == MASTER_NODE ) -// { -// data->timing.cm_solver_pre_comp += total_pc 
/ control->nprocs; -// } + t_pc = sparse_approx_inverse( system, data, workspace, mpi_data, + &workspace->H, &workspace->H_spar_patt, + &workspace->H_app_inv, control->nprocs ); + + ret = MPI_Reduce( &t_pc, &total_pc, 1, MPI_DOUBLE, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD ); + Check_MPI_Error( ret, __FILE__, __LINE__ ); + + if ( workspace->d_workspace->H_app_inv.allocated == FALSE ) + { + Cuda_Allocate_Matrix( &workspace->d_workspace->H_app_inv, + workspace->H_app_inv.n, workspace->H_app_inv.n_max, + workspace->H_app_inv.m, workspace->H_app_inv.format, + control->streams[4] ); + } + else if ( workspace->d_workspace->H_app_inv.m < workspace->H_app_inv.m + || workspace->d_workspace->H_app_inv.n_max < workspace->H_app_inv.n_max ) + { + Cuda_Deallocate_Matrix( &workspace->d_workspace->H_app_inv ); + Cuda_Allocate_Matrix( &workspace->d_workspace->H_app_inv, + workspace->H_app_inv.n, workspace->H_app_inv.n_max, + workspace->H_app_inv.m, workspace->H_app_inv.format, + control->streams[4] ); + } + + Cuda_Copy_Matrix_Host_to_Device( &workspace->H_app_inv, + &workspace->d_workspace->H_app_inv, control->streams[4] ); + + if( system->my_rank == MASTER_NODE ) + { + data->timing.cm_solver_pre_comp += total_pc / control->nprocs; + } #else fprintf( stderr, "[ERROR] LAPACKE support disabled. Re-compile before enabling. Terminating...\n" ); exit( INVALID_INPUT ); @@ -638,11 +682,12 @@ void QEq( reax_system const * const system, control_params const * const control output_controls const * const out_control, mpi_datatypes * const mpi_data ) { - int iters; + int iters, refactor; iters = 0; + refactor = is_refactoring_step( control, data ); -// if ( is_refactoring_step( control, data ) == TRUE ) + if ( refactor == TRUE ) { Setup_Preconditioner_QEq( system, control, data, workspace, mpi_data ); @@ -690,12 +735,14 @@ void QEq( reax_system const * const system, control_params const * const control #if defined(DUAL_SOLVER) iters = Cuda_dual_CG( system, control, data, workspace, &workspace->d_workspace->H, workspace->d_workspace->b, - control->cm_solver_q_err, workspace->d_workspace->x, mpi_data ); + control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor ); #else iters = Cuda_CG( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, + mpi_data, refactor ); iters += Cuda_CG( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, + mpi_data, FALSE ); #endif break; @@ -703,12 +750,14 @@ void QEq( reax_system const * const system, control_params const * const control #if defined(DUAL_SOLVER) iters = Cuda_dual_SDM( system, control, data, workspace, &workspace->d_workspace->H, workspace->d_workspace->b, - control->cm_solver_q_err, workspace->d_workspace->x, mpi_data ); + control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor ); #else iters = Cuda_SDM( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, + mpi_data, refactor ); iters += Cuda_SDM( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_t, 
control->cm_solver_q_err, workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, + mpi_data, FALSE ); #endif break; @@ -716,12 +765,14 @@ void QEq( reax_system const * const system, control_params const * const control #if defined(DUAL_SOLVER) iters = Cuda_dual_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H, workspace->d_workspace->b, - control->cm_solver_q_err, workspace->d_workspace->x, mpi_data ); + control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor ); #else iters = Cuda_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, + mpi_data, refactor ); iters += Cuda_BiCGStab( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, + mpi_data, FALSE ); #endif break; @@ -729,12 +780,14 @@ void QEq( reax_system const * const system, control_params const * const control #if defined(DUAL_SOLVER) iters = Cuda_dual_PIPECG( system, control, data, workspace, &workspace->d_workspace->H, workspace->d_workspace->b, - control->cm_solver_q_err, workspace->d_workspace->x, mpi_data ); + control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor ); #else iters = Cuda_PIPECG( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, + mpi_data, refactor ); iters += Cuda_PIPECG( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, + mpi_data, FALSE ); #endif break; @@ -742,12 +795,14 @@ void QEq( reax_system const * const system, control_params const * const control #if defined(DUAL_SOLVER) iters = Cuda_dual_PIPECR( system, control, data, workspace, &workspace->d_workspace->H, workspace->d_workspace->b, - control->cm_solver_q_err, workspace->d_workspace->x, mpi_data ); + control->cm_solver_q_err, workspace->d_workspace->x, mpi_data, refactor ); #else iters = Cuda_PIPECR( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->b_s, control->cm_solver_q_err, workspace->d_workspace->s, + mpi_data, refactor ); iters += Cuda_PIPECR( system, control, data, workspace, &workspace->d_workspace->H, - workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->b_t, control->cm_solver_q_err, workspace->d_workspace->t, + mpi_data, FALSE ); #endif break; diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu index 1163b0e977279c3f7433bc1ebd3f37ab21fef261..73d36a173ae7a7e9a2f57a187c8cfb9a2c782491 100644 --- a/PG-PuReMD/src/cuda/cuda_copy.cu +++ b/PG-PuReMD/src/cuda/cuda_copy.cu @@ -63,6 +63,23 @@ extern "C" void Cuda_Copy_Atoms_Host_to_Device( reax_system *system, control_par } +/* Copy sparse matrix from host to device */ +extern "C" void 
Cuda_Copy_Matrix_Host_to_Device( sparse_matrix const * const A, + sparse_matrix * const d_A, cudaStream_t s ) +{ + sCudaMemcpyAsync( d_A->start, A->start, sizeof(int) * A->n, + cudaMemcpyHostToDevice, s, __FILE__, __LINE__ ); + sCudaMemcpyAsync( d_A->end, A->end, sizeof(int) * A->n, + cudaMemcpyHostToDevice, s, __FILE__, __LINE__ ); + sCudaMemcpyAsync( d_A->j, A->j, sizeof(int) * A->m, + cudaMemcpyHostToDevice, s, __FILE__, __LINE__ ); + sCudaMemcpyAsync( d_A->val, A->val, sizeof(real) * A->m, + cudaMemcpyHostToDevice, s, __FILE__, __LINE__ ); + + cudaStreamSynchronize( s ); +} + + /* Copy atomic system info from host to device */ extern "C" void Cuda_Copy_System_Host_to_Device( reax_system *system, control_params *control ) @@ -115,6 +132,23 @@ extern "C" void Cuda_Copy_Atoms_Device_to_Host( reax_system *system, control_par } +/* Copy sparse matrix from device to host */ +extern "C" void Cuda_Copy_Matrix_Device_to_Host( sparse_matrix * const A, + sparse_matrix const * const d_A, cudaStream_t s ) +{ + sCudaMemcpyAsync( A->start, d_A->start, sizeof(int) * d_A->n, + cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + sCudaMemcpyAsync( A->end, d_A->end, sizeof(int) * d_A->n, + cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + sCudaMemcpyAsync( A->j, d_A->j, sizeof(int) * d_A->m, + cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + sCudaMemcpyAsync( A->val, d_A->val, sizeof(real) * d_A->m, + cudaMemcpyDeviceToHost, s, __FILE__, __LINE__ ); + + cudaStreamSynchronize( s ); +} + + /* Copy simulation data from device to host */ extern "C" void Cuda_Copy_Simulation_Data_Device_to_Host( control_params const * const control, simulation_data * const data, simulation_data * const d_data ) diff --git a/PG-PuReMD/src/cuda/cuda_copy.h b/PG-PuReMD/src/cuda/cuda_copy.h index a5c4eb30d1b665e91891de60a5adaaab85d38ed1..1ab0a0a526a29985d5453aa435e07656f2f036ec 100644 --- a/PG-PuReMD/src/cuda/cuda_copy.h +++ b/PG-PuReMD/src/cuda/cuda_copy.h @@ -10,6 +10,9 @@ extern "C" { void Cuda_Copy_Atoms_Host_to_Device( reax_system *, control_params * ); +void Cuda_Copy_Matrix_Host_to_Device( sparse_matrix const * const, + sparse_matrix * const, cudaStream_t ); + void Cuda_Copy_Grid_Host_to_Device( control_params *, grid *, grid * ); void Cuda_Copy_System_Host_to_Device( reax_system *, control_params * ); @@ -18,6 +21,9 @@ void Cuda_Copy_List_Device_to_Host( control_params *, reax_list *, reax_list *, void Cuda_Copy_Atoms_Device_to_Host( reax_system *, control_params * ); +void Cuda_Copy_Matrix_Device_to_Host( sparse_matrix * const, + sparse_matrix const * const, cudaStream_t ); + void Cuda_Copy_Simulation_Data_Device_to_Host( control_params const * const, simulation_data * const, simulation_data * const ); diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index bfe5ab44f6479ea98ef6642293b0dcd85fcd8ffa..92ca9043b8b296b31284dab3f784954185feb191 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -180,6 +180,19 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control, workspace->d_workspace->realloc.thbody = FALSE; workspace->d_workspace->realloc.gcell_atoms = 0; + if ( control->cm_solver_pre_comp_type == SAI_PC ) + { + workspace->H.allocated = FALSE; + workspace->H_full.allocated = FALSE; + workspace->H_spar_patt.allocated = FALSE; + workspace->H_spar_patt_full.allocated = FALSE; + workspace->H_app_inv.allocated = FALSE; + workspace->d_workspace->H_full.allocated = FALSE; + workspace->d_workspace->H_spar_patt.allocated = FALSE; + 
workspace->d_workspace->H_spar_patt_full.allocated = FALSE; + workspace->d_workspace->H_app_inv.allocated = FALSE; + } + Cuda_Reset_Workspace( system, control, workspace ); Init_Taper( control, workspace->d_workspace, mpi_data ); diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu index 822d8bfbc8157fd9ca80aa8ab7b9155594ac23b1..1ad6fdc94ae10da7bbd6c8d5b7bf9614ba3b2d4f 100644 --- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu @@ -40,6 +40,13 @@ #define FULL_MASK (0xFFFFFFFF) +enum preconditioner_type +{ + LEFT = 0, + RIGHT = 1, +}; + + /* Jacobi preconditioner computation */ CUDA_GLOBAL void k_jacobi_cm_half( int *row_ptr_start, int *row_ptr_end, int *col_ind, real *vals, @@ -727,7 +734,7 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system, static void Dual_Sparse_MatVec( const reax_system * const system, control_params const * const control, simulation_data * const data, storage * const workspace, mpi_datatypes * const mpi_data, - sparse_matrix const * const A, rvec2 * const x, + sparse_matrix const * const A, rvec2 const * const x, int n, rvec2 * const b ) { #if defined(LOG_PERFORMANCE) @@ -939,11 +946,549 @@ static void Sparse_MatVec( reax_system const * const system, } +/* Apply left-sided preconditioning while solving M^{-1}AX = M^{-1}B + * + * system: + * workspace: data struct containing matrices and vectors, stored in CSR + * control: data struct containing parameters + * data: struct containing timing simulation data (including performance data) + * y: vector to which to apply preconditioning, + * specific to internals of iterative solver being used + * x (output): preconditioned vector + * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner + * side: used in determining how to apply preconditioner if the preconditioner is + * factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU) + * + * Assumptions: + * Matrices have non-zero diagonals + * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ +static void dual_apply_preconditioner( reax_system const * const system, + storage * const workspace, control_params const * const control, + simulation_data * const data, mpi_datatypes * const mpi_data, + rvec2 const * const y, rvec2 * const x, int fresh_pre, int side ) +{ +// int i, si; + + /* no preconditioning */ + if ( control->cm_solver_pre_comp_type == NONE_PC ) + { + if ( x != y ) + { + Vector_Copy_rvec2( x, y, system->n, control->streams[4] ); + } + } + else + { + switch ( side ) + { + case LEFT: + switch ( control->cm_solver_pre_app_type ) + { + case TRI_SOLVE_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + dual_jacobi_apply( workspace->d_workspace->Hdia_inv, + y, x, system->n, control->streams[4] ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); +// break; + case SAI_PC: +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, H->NT, x ); +#else + Dual_Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, system->n, x ); +#endif + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_LEVEL_SCHED_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + dual_jacobi_apply( workspace->d_workspace->Hdia_inv, + y, x, system->n, control->streams[4] ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); +// break; + case SAI_PC: +#if defined(NEUTRAL_TERRITORY) + Dual_Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, H->NT, x ); +#else + Dual_Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, system->n, x ); +#endif + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_GC_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// for ( i = 0; i < workspace->H->n; ++i ) +// { +// workspace->y_p[i] = y[i]; +// } +// +// permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER ); +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case JACOBI_ITER_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// // construct D^{-1}_L +// if ( fresh_pre == TRUE ) +// { +// for ( i = 0; i < workspace->L->n; ++i ) +// { +// si = workspace->L->start[i + 1] - 1; +// workspace->Dinv_L[i] = 1.0 / workspace->L->val[si]; +// } +// } +// +// jacobi_iter( workspace, workspace->L, workspace->Dinv_L, +// y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + } + break; + + case RIGHT: + switch ( control->cm_solver_pre_app_type ) + { + case TRI_SOLVE_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + if ( x != y ) + { + Vector_Copy_rvec2( x, y, system->n, control->streams[4] ); + } + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve( workspace->U, y, x, workspace->U->n, UPPER ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_LEVEL_SCHED_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + if ( x != y ) + { + Vector_Copy_rvec2( x, y, system->n, control->streams[4] ); + } + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_GC_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); +// permute_vector( workspace, x, workspace->H->n, TRUE, UPPER ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case JACOBI_ITER_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// if ( fresh_pre == TRUE ) +// { +// for ( i = 0; i < workspace->U->n; ++i ) +// { +// si = workspace->U->start[i]; +// workspace->Dinv_U[i] = 1.0 / workspace->U->val[si]; +// } +// } +// +// jacobi_iter( workspace, workspace->U, workspace->Dinv_U, +// y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + } + break; + } + } +} + + +/* Apply left-sided preconditioning while solving M^{-1}Ax = M^{-1}b + * + * system: + * workspace: data struct containing matrices and vectors, stored in CSR + * control: data struct containing parameters + * data: struct containing timing simulation data (including performance data) + * y: vector to which to apply preconditioning, + * specific to internals of iterative solver being used + * x (output): preconditioned vector + * fresh_pre: parameter indicating if this is a newly computed (fresh) preconditioner + * side: used in determining how to apply preconditioner if the preconditioner is + * factorized as M = M_{1}M_{2} (e.g., incomplete LU, A \approx LU) + * + * Assumptions: + * Matrices have non-zero diagonals + * Each row of a matrix has at least one non-zero (i.e., no rows with all zeros) */ +static void apply_preconditioner( reax_system const * const system, + storage * const workspace, control_params const * const control, + simulation_data * const data, mpi_datatypes * const mpi_data, + real const * const y, real * const x, int fresh_pre, int side ) +{ +// int i, si; + + /* no preconditioning */ + if ( control->cm_solver_pre_comp_type == NONE_PC ) + { + if ( x != y ) + { + Vector_Copy( x, y, system->n, control->streams[4] ); + } + } + else + { + switch ( side ) + { + case LEFT: + switch ( control->cm_solver_pre_app_type ) + { + case TRI_SOLVE_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + jacobi_apply( workspace->d_workspace->Hdia_inv, + y, x, system->n, control->streams[4] ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve( workspace->L, y, x, workspace->L->n, LOWER ); +// break; + case SAI_PC: +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, H->NT, x ); +#else + Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, system->n, x ); +#endif + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_LEVEL_SCHED_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + jacobi_apply( workspace->d_workspace->Hdia_inv, + y, x, system->n, control->streams[4] ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->L, y, x, workspace->L->n, LOWER, fresh_pre ); +// break; + case SAI_PC: +#if defined(NEUTRAL_TERRITORY) + Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, H->NT, x ); +#else + Sparse_MatVec( system, control, data, workspace, mpi_data, + &workspace->d_workspace->H_app_inv, + y, system->n, x ); +#endif + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_GC_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// for ( i = 0; i < workspace->H->n; ++i ) +// { +// workspace->y_p[i] = y[i]; +// } +// +// permute_vector( workspace, workspace->y_p, workspace->H->n, FALSE, LOWER ); +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->L, workspace->y_p, x, workspace->L->n, LOWER, fresh_pre ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case JACOBI_ITER_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// // construct D^{-1}_L +// if ( fresh_pre == TRUE ) +// { +// for ( i = 0; i < workspace->L->n; ++i ) +// { +// si = workspace->L->start[i + 1] - 1; +// workspace->Dinv_L[i] = 1.0 / workspace->L->val[si]; +// } +// } +// +// jacobi_iter( workspace, workspace->L, workspace->Dinv_L, +// y, x, LOWER, control->cm_solver_pre_app_jacobi_iters ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + } + break; + + case RIGHT: + switch ( control->cm_solver_pre_app_type ) + { + case TRI_SOLVE_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + if ( x != y ) + { + Vector_Copy( x, y, system->n, control->streams[4] ); + } + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve( workspace->U, y, x, workspace->U->n, UPPER ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_LEVEL_SCHED_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + if ( x != y ) + { + Vector_Copy( x, y, system->n, control->streams[4] ); + } + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case TRI_SOLVE_GC_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// tri_solve_level_sched( (static_storage *) workspace, +// workspace->U, y, x, workspace->U->n, UPPER, fresh_pre ); +// permute_vector( workspace, x, workspace->H->n, TRUE, UPPER ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + case JACOBI_ITER_PA: + switch ( control->cm_solver_pre_comp_type ) + { + case JACOBI_PC: + case SAI_PC: + fprintf( stderr, "Unsupported preconditioner computation/application method combination. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; +// case ICHOLT_PC: +// case ILUT_PC: +// case ILUTP_PC: +// if ( fresh_pre == TRUE ) +// { +// for ( i = 0; i < workspace->U->n; ++i ) +// { +// si = workspace->U->start[i]; +// workspace->Dinv_U[i] = 1.0 / workspace->U->val[si]; +// } +// } +// +// jacobi_iter( workspace, workspace->U, workspace->Dinv_U, +// y, x, UPPER, control->cm_solver_pre_app_jacobi_iters ); +// break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + break; + default: + fprintf( stderr, "Unrecognized preconditioner application method. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + } + break; + } + } +} + + int Cuda_dual_SDM( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, rvec2 const * const b, real tol, - rvec2 * const x, mpi_datatypes * const mpi_data ) + rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i, matvecs; int ret; @@ -968,8 +1513,10 @@ int Cuda_dual_SDM( reax_system const * const system, Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r2, workspace->d_workspace->q2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q2, workspace->d_workspace->d2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1040,8 +1587,10 @@ int Cuda_dual_SDM( reax_system const * const system, Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r2, workspace->d_workspace->q2, FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q2, workspace->d_workspace->d2, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1056,7 +1605,7 @@ int Cuda_dual_SDM( reax_system const * const system, matvecs = Cuda_SDM( system, control, data, workspace, H, workspace->d_workspace->b_t, tol, - workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->t, 1, system->n, control->streams[4] ); @@ -1069,7 +1618,7 @@ int Cuda_dual_SDM( reax_system const * const system, matvecs = Cuda_SDM( system, control, data, workspace, H, workspace->d_workspace->b_s, tol, - workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->s, 0, system->n, control->streams[4] ); @@ -1095,7 +1644,7 @@ int Cuda_dual_SDM( reax_system const * const system, int Cuda_SDM( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, real const * const b, real tol, - real * const x, mpi_datatypes * const mpi_data ) + real * 
const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i; int ret; @@ -1119,8 +1668,10 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[4] ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r, workspace->d_workspace->q, fresh_pre, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q, workspace->d_workspace->d, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1181,8 +1732,10 @@ int Cuda_SDM( reax_system const * const system, control_params const * const con Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[4] ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r, workspace->d_workspace->q, FALSE, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q, workspace->d_workspace->d, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1206,7 +1759,7 @@ int Cuda_dual_CG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, rvec2 const * const b, real tol, - rvec2 * const x, mpi_datatypes * const mpi_data ) + rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i, matvecs; int ret; @@ -1230,8 +1783,10 @@ int Cuda_dual_CG( reax_system const * const system, Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->d2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r2, workspace->d_workspace->q2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q2, workspace->d_workspace->d2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1301,8 +1856,10 @@ int Cuda_dual_CG( reax_system const * const system, Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->p2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r2, workspace->d_workspace->q2, FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q2, workspace->d_workspace->p2, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1350,7 +1907,7 @@ int Cuda_dual_CG( reax_system const * const system, matvecs = Cuda_CG( system, control, data, workspace, H, workspace->d_workspace->b_t, tol, - workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->t, 1, 
system->n, control->streams[4] ); @@ -1363,7 +1920,7 @@ int Cuda_dual_CG( reax_system const * const system, matvecs = Cuda_CG( system, control, data, workspace, H, workspace->d_workspace->b_s, tol, - workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->s, 0, system->n, control->streams[4] ); @@ -1389,7 +1946,7 @@ int Cuda_dual_CG( reax_system const * const system, int Cuda_CG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, real const * const b, real tol, - real * const x, mpi_datatypes * const mpi_data ) + real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i; int ret; @@ -1414,8 +1971,10 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->d, system->n, control->streams[4] ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r, workspace->d_workspace->q, fresh_pre, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q, workspace->d_workspace->d, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1467,8 +2026,10 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r, - workspace->d_workspace->p, system->n, control->streams[4] ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r, workspace->d_workspace->q, FALSE, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q, workspace->d_workspace->p, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1534,7 +2095,7 @@ int Cuda_CG( reax_system const * const system, control_params const * const cont int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, rvec2 const * const b, real tol, - rvec2 * const x, mpi_datatypes * const mpi_data ) + rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i, matvecs; int ret; @@ -1642,8 +2203,12 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p2, - workspace->d_workspace->d2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->p2, workspace->d_workspace->y2, + i == 0 ? fresh_pre : FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->y2, workspace->d_workspace->d2, + i == 0 ? 
fresh_pre : FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1706,8 +2271,10 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q2, - workspace->d_workspace->q_hat2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q2, workspace->d_workspace->y2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->y2, workspace->d_workspace->q_hat2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1787,7 +2354,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * matvecs = Cuda_BiCGStab( system, control, data, workspace, H, workspace->d_workspace->b_t, tol, - workspace->d_workspace->t, mpi_data ); + workspace->d_workspace->t, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->t, 1, system->n, control->streams[4] ); @@ -1800,7 +2367,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * matvecs = Cuda_BiCGStab( system, control, data, workspace, H, workspace->d_workspace->b_s, tol, - workspace->d_workspace->s, mpi_data ); + workspace->d_workspace->s, mpi_data, FALSE ); Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->s, 0, system->n, control->streams[4] ); @@ -1841,7 +2408,7 @@ int Cuda_dual_BiCGStab( reax_system const * const system, control_params const * int Cuda_BiCGStab( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, real const * const b, real tol, - real * const x, mpi_datatypes * const mpi_data ) + real * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i; int ret; @@ -1933,8 +2500,12 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->p, - workspace->d_workspace->d, system->n, control->streams[4] ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->p, workspace->d_workspace->y, + i == 0 ? fresh_pre : FALSE, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->y, workspace->d_workspace->d, + i == 0 ? 
fresh_pre : FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -1994,8 +2565,10 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->q, - workspace->d_workspace->q_hat, system->n, control->streams[4] ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->q, workspace->d_workspace->y, fresh_pre, LEFT ); + apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->y, workspace->d_workspace->q_hat, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2086,7 +2659,7 @@ int Cuda_BiCGStab( reax_system const * const system, control_params const * cons int Cuda_dual_PIPECG( reax_system const * const system, control_params const * const control, simulation_data * const data, storage * const workspace, sparse_matrix const * const H, rvec2 const * const b, real tol, - rvec2 * const x, mpi_datatypes * const mpi_data ) + rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre ) { unsigned int i, matvecs; int ret; @@ -2111,8 +2684,10 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops ); #endif - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2, - workspace->d_workspace->u2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->r2, workspace->d_workspace->m2, fresh_pre, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->m2, workspace->d_workspace->u2, fresh_pre, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2141,8 +2716,10 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2, - workspace->d_workspace->m2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->w2, workspace->d_workspace->n2, FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->n2, workspace->d_workspace->m2, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2223,8 +2800,10 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c MPI_COMM_WORLD, &req ); Check_MPI_Error( ret, __FILE__, __LINE__ ); - dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2, - workspace->d_workspace->m2, system->n, control->streams[4] ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->w2, workspace->d_workspace->n2, FALSE, LEFT ); + dual_apply_preconditioner( system, workspace, control, data, mpi_data, + workspace->d_workspace->n2, workspace->d_workspace->m2, FALSE, RIGHT ); #if defined(LOG_PERFORMANCE) Update_Timing_Info( &time, &data->timing.cm_solver_pre_app ); @@ -2262,7 +2841,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c matvecs = Cuda_PIPECG( system, 
                control, data, workspace, H, workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->t,
                 1, system->n, control->streams[4] );
@@ -2275,7 +2854,7 @@ int Cuda_dual_PIPECG( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECG( system,
                 control, data, workspace, H, workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->s,
                 0, system->n, control->streams[4] );
@@ -2309,7 +2888,7 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -2334,8 +2913,10 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r, workspace->d_workspace->m, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->m, workspace->d_workspace->u, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2364,8 +2945,10 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
             MPI_COMM_WORLD, &req );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-            workspace->d_workspace->m, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->w, workspace->d_workspace->n, FALSE, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n, workspace->d_workspace->m, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2433,8 +3016,10 @@ int Cuda_PIPECG( reax_system const * const system, control_params const * const
             MPI_COMM_WORLD, &req );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-            workspace->d_workspace->m, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->w, workspace->d_workspace->n, FALSE, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n, workspace->d_workspace->m, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2481,7 +3066,7 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, rvec2 const * const b, real tol,
-        rvec2 * const x, mpi_datatypes * const mpi_data )
+        rvec2 * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i, matvecs;
     int ret;
@@ -2506,8 +3091,10 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r2,
-            workspace->d_workspace->u2, system->n, control->streams[4] );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r2, workspace->d_workspace->n2, fresh_pre, LEFT );
+    dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n2, workspace->d_workspace->u2, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2550,8 +3137,10 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
             break;
         }
 
-        dual_jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w2,
-                workspace->d_workspace->m2, system->n, control->streams[4] );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->w2, workspace->d_workspace->n2, FALSE, LEFT );
+        dual_apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->n2, workspace->d_workspace->m2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2640,7 +3229,7 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECR( system,
                 control, data, workspace, H, workspace->d_workspace->b_t, tol,
-                workspace->d_workspace->t, mpi_data );
+                workspace->d_workspace->t, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->t,
                 1, system->n, control->streams[4] );
@@ -2653,7 +3242,7 @@ int Cuda_dual_PIPECR( reax_system const * const system, control_params const * c
 
         matvecs = Cuda_PIPECR( system,
                 control, data, workspace, H, workspace->d_workspace->b_s, tol,
-                workspace->d_workspace->s, mpi_data );
+                workspace->d_workspace->s, mpi_data, FALSE );
 
         Vector_Copy_To_rvec2( workspace->d_workspace->x, workspace->d_workspace->s,
                 0, system->n, control->streams[4] );
@@ -2684,7 +3273,7 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const control,
         simulation_data * const data, storage * const workspace,
         sparse_matrix const * const H, real const * const b, real tol,
-        real * const x, mpi_datatypes * const mpi_data )
+        real * const x, mpi_datatypes * const mpi_data, int fresh_pre )
 {
     unsigned int i;
     int ret;
@@ -2709,8 +3298,10 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
     Update_Timing_Info( &time, &data->timing.cm_solver_vector_ops );
 #endif
 
-    jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->r,
-            workspace->d_workspace->u, system->n, control->streams[4] );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->r, workspace->d_workspace->n, fresh_pre, LEFT );
+    apply_preconditioner( system, workspace, control, data, mpi_data,
+            workspace->d_workspace->n, workspace->d_workspace->u, fresh_pre, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
     Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
@@ -2746,8 +3337,10 @@ int Cuda_PIPECR( reax_system const * const system, control_params const * const
 
     for ( i = 0; i < control->cm_solver_max_iters && r_norm / b_norm > tol; ++i )
     {
-        jacobi_apply( workspace->d_workspace->Hdia_inv, workspace->d_workspace->w,
-                workspace->d_workspace->m, system->n, control->streams[4] );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->w, workspace->d_workspace->n, FALSE, LEFT );
+        apply_preconditioner( system, workspace, control, data, mpi_data,
+                workspace->d_workspace->n, workspace->d_workspace->m, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         Update_Timing_Info( &time, &data->timing.cm_solver_pre_app );
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
index 281bbc18b63864233ecb9d5326beaaae503844ca..c82634a9e49e88a7a03f34161818c3dd02bee199 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.h
@@ -28,52 +28,52 @@
 int Cuda_dual_SDM( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_SDM( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_CG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_CG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_BiCGStab( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_BiCGStab( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_PIPECG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_PIPECG( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 int Cuda_dual_PIPECR( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, rvec2 const * const, real,
-        rvec2 * const, mpi_datatypes * const );
+        rvec2 * const, mpi_datatypes * const, int );
 
 int Cuda_PIPECR( reax_system const * const, control_params const * const,
         simulation_data * const, storage * const,
         sparse_matrix const * const, real const * const, real,
-        real * const, mpi_datatypes * const );
+        real * const, mpi_datatypes * const, int );
 
 
 #endif
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index 35b3ef00f4c80c54df81397bf3f61989b1fd83f4..e2ef102665422f7d7c77d21a6cb4cda1fb216723 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -3703,9 +3703,9 @@ int dual_PIPECG( reax_system const * const system, control_params const * const
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
-            workspace->n2, fresh_pre, LEFT );
+            workspace->n2, FALSE, LEFT );
     dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
-            workspace->m2, fresh_pre, RIGHT );
+            workspace->m2, FALSE, RIGHT );
 
 #if defined(NEUTRAL_TERRITORY)
     Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
@@ -3789,9 +3789,9 @@ int dual_PIPECG( reax_system const * const system, control_params const * const
     Check_MPI_Error( ret, __FILE__, __LINE__ );
 
     dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
-            workspace->n2, fresh_pre, LEFT );
+            workspace->n2, FALSE, LEFT );
     dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
-            workspace->m2, fresh_pre, RIGHT );
+            workspace->m2, FALSE, RIGHT );
 
 #if defined(NEUTRAL_TERRITORY)
     Dual_Sparse_MatVec( system, control, data, mpi_data, H, workspace->m2,
@@ -4112,9 +4112,9 @@ int dual_PIPECR( reax_system const * const system, control_params const * const
         }
 
         dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->w2,
-                workspace->n2, fresh_pre, LEFT );
+                workspace->n2, FALSE, LEFT );
         dual_apply_preconditioner( system, workspace, control, data, mpi_data, workspace->n2,
-                workspace->m2, fresh_pre, RIGHT );
+                workspace->m2, FALSE, RIGHT );
 
 #if defined(LOG_PERFORMANCE)
         time = Get_Time( );
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index dfa540b48b34cb38dd2abb0913686423edb7f06c..e1e175a2ca9ac9cc56973ec1f837b10f424181f5 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -360,10 +360,11 @@
 
   /* max. num. of active CUDA streams */
   #define MAX_CUDA_STREAMS (5)
 
-  /* max. num. of CUDA events used for synchronizing streams */
-  #define MAX_CUDA_STREAM_EVENTS (4)
 #endif
 
+/* max. num. of CUDA events used for synchronizing streams */
+#define MAX_CUDA_STREAM_EVENTS (4)
+
 /* ensemble type */
 enum ensemble