From 688fdb8f625621f7b2695124e9aae1e841a57deb Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Sat, 23 Sep 2017 17:16:25 -0400 Subject: [PATCH] PG-PuReMD: continue merging EE/ACKS2 code from sPuReMD. Remove unused arrays. Other clean-up. --- PG-PuReMD/Makefile.am | 9 +- PG-PuReMD/configure.ac | 6 +- PG-PuReMD/src/allocate.c | 2 - PG-PuReMD/src/cuda/cuda_allocate.cu | 127 +++++++++------ PG-PuReMD/src/cuda/cuda_charges.cu | 148 +++++++++++++++--- PG-PuReMD/src/cuda/cuda_charges.h | 2 +- PG-PuReMD/src/cuda/cuda_forces.cu | 6 +- PG-PuReMD/src/cuda/cuda_init_md.cu | 4 +- PG-PuReMD/src/cuda/cuda_lin_alg.cu | 222 ++------------------------- PG-PuReMD/src/cuda/cuda_reduction.cu | 37 +++-- PG-PuReMD/src/forces.c | 11 +- PG-PuReMD/src/lin_alg.c | 6 +- PG-PuReMD/src/parallelreax.c | 5 +- PG-PuReMD/src/reax_types.h | 2 - environ/parallel_control | 2 +- 15 files changed, 279 insertions(+), 310 deletions(-) diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am index d16fd249..6c23c15d 100644 --- a/PG-PuReMD/Makefile.am +++ b/PG-PuReMD/Makefile.am @@ -42,7 +42,7 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \ src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \ src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \ - src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu + src/cuda/cuda_init_md.cu src/cuda/cuda_lookup.cu include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \ src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \ src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \ @@ -52,7 +52,12 @@ include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \ src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \ src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \ src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \ - src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h + src/cuda/cuda_init_md.h src/cuda/cuda_lookup.h + +if DEBUG +bin_pg_puremd_SOURCES += src/cuda/cuda_validation.cu +include_HEADERS += src/cuda/cuda_validation.h +endif # dummy source to cause C linking nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac index 6e9f3fb5..5a5ac9e1 100644 --- a/PG-PuReMD/configure.ac +++ b/PG-PuReMD/configure.ac @@ -133,18 +133,20 @@ if test "x$BUILD_GPU" = "xyes"; then NVCCFLAGS= NVCCFLAGS+=" -ccbin=$CXX" - if test "BUILD_DEBUG" = "true" + if test "x${BUILD_DEBUG}" = "xtrue" then NVCCFLAGS+=" -g -G" + fi AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.]) else AM_CONDITIONAL(USE_CUDA, test "x" = "xyes") fi +AM_CONDITIONAL(DEBUG, test "x${BUILD_DEBUG}" = "xtrue") if test "BUILD_PROF" = "true" then - if test "x$BUILD_GPU" = "xyes"; then + if test "x${BUILD_GPU}" = "xyes"; then NVCCFLAGS+=" --compiler-options ${gprof_flags}" else CFLAGS+=" ${gprof_flags}" diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c index 039e65d2..c2de4893 100644 --- a/PG-PuReMD/src/allocate.c +++ b/PG-PuReMD/src/allocate.c @@ -117,7 +117,6 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->Clp, "Clp" ); sfree( workspace->vlpex, "vlpex" ); sfree( workspace->bond_mark, "bond_mark" ); - sfree( workspace->done_after, "done_after" ); 
/* QEq storage */ sfree( workspace->Hdia_inv, "Hdia_inv" ); @@ -245,7 +244,6 @@ void Allocate_Workspace( reax_system *system, control_params *control, workspace->Clp = (real*) smalloc( total_real, "Clp" ); workspace->vlpex = (real*) smalloc( total_real, "vlpex" ); workspace->bond_mark = (int*) scalloc(total_cap, sizeof(int), "bond_mark"); - workspace->done_after = (int*) scalloc(total_cap, sizeof(int), "done_after"); /* charge method storage */ switch ( control->charge_method ) diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index 5df47e6f..d683bba5 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -358,7 +358,6 @@ void dev_alloc_workspace( reax_system *system, control_params *control, */ /* bond order related storage */ - cuda_malloc( (void **) &workspace->within_bond_box, total_cap * sizeof (int), TRUE, "skin" ); cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" ); cuda_malloc( (void **) &workspace->Deltap, total_real, TRUE, "Deltap" ); cuda_malloc( (void **) &workspace->Deltap_boc, total_real, TRUE, "Deltap_boc" ); @@ -375,39 +374,58 @@ void dev_alloc_workspace( reax_system *system, control_params *control, cuda_malloc( (void **) &workspace->Clp, total_real, TRUE, "Clp" ); cuda_malloc( (void **) &workspace->vlpex, total_real, TRUE, "vlpex" ); cuda_malloc( (void **) &workspace->bond_mark, total_real, TRUE, "bond_mark" ); - cuda_malloc( (void **) &workspace->done_after, total_real, TRUE, "done_after" ); - /* charge matrix storage */ - cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" ); + if ( control->cm_solver_pre_comp_type == DIAG_PC ) + { + cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" ); + } cuda_malloc( (void **) &workspace->b_s, total_cap * sizeof(real), TRUE, "b_s" ); cuda_malloc( (void **) &workspace->b_t, total_cap * sizeof(real), TRUE, "b_t" ); cuda_malloc( (void **) &workspace->b_prc, total_cap * sizeof(real), TRUE, "b_prc" ); cuda_malloc( (void **) &workspace->b_prm, total_cap * sizeof(real), TRUE, "b_prm" ); cuda_malloc( (void **) &workspace->s, total_cap * sizeof(real), TRUE, "s" ); cuda_malloc( (void **) &workspace->t, total_cap * sizeof(real), TRUE, "t" ); - cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" ); + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" ); + } cuda_malloc( (void **) &workspace->b, total_cap * sizeof(rvec2), TRUE, "b" ); cuda_malloc( (void **) &workspace->x, total_cap * sizeof(rvec2), TRUE, "x" ); - /* GMRES storage */ - cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" ); - cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" ); - cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" ); - cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" ); - cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" ); - cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" ); - cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" ); - - /* CG storage */ - cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" ); - cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" ); - 
cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" ); - cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" ); - cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" ); - cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" ); - cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" ); - cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" ); + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" ); + cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" ); + cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" ); + cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" ); + cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" ); + cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" ); + cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" ); + break; + + case SDM_S: + break; + + case CG_S: + cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" ); + cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" ); + cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" ); + cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" ); + cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" ); + cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" ); + cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" ); + cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" ); + break; + + default: + fprintf( stderr, "Unrecognized QEq solver selection. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } /* integrator storage */ cuda_malloc( (void **) &workspace->v_const, local_rvec, TRUE, "v_const" ); @@ -458,7 +476,6 @@ void dev_dealloc_workspace( control_params *control, storage *workspace ) */ /* bond order related storage */ - cuda_free( workspace->within_bond_box, "skin" ); cuda_free( workspace->total_bond_order, "total_bo" ); cuda_free( workspace->Deltap, "Deltap" ); cuda_free( workspace->Deltap_boc, "Deltap_boc" ); @@ -475,38 +492,58 @@ void dev_dealloc_workspace( control_params *control, storage *workspace ) cuda_free( workspace->Clp, "Clp" ); cuda_free( workspace->vlpex, "vlpex" ); cuda_free( workspace->bond_mark, "bond_mark" ); - cuda_free( workspace->done_after, "done_after" ); /* charge matrix storage */ - cuda_free( workspace->Hdia_inv, "Hdia_inv" ); + if ( control->cm_solver_pre_comp_type == DIAG_PC ) + { + cuda_free( workspace->Hdia_inv, "Hdia_inv" ); + } cuda_free( workspace->b_s, "b_s" ); cuda_free( workspace->b_t, "b_t" ); cuda_free( workspace->b_prc, "b_prc" ); cuda_free( workspace->b_prm, "b_prm" ); cuda_free( workspace->s, "s" ); cuda_free( workspace->t, "t" ); - cuda_free( workspace->droptol, "droptol" ); + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + cuda_free( workspace->droptol, "droptol" ); + } cuda_free( workspace->b, "b" ); cuda_free( workspace->x, "x" ); - /* GMRES storage */ - cuda_free( workspace->y, "y" ); - cuda_free( workspace->z, "z" ); - cuda_free( workspace->g, "g" ); - cuda_free( workspace->h, "h" ); - cuda_free( workspace->hs, "hs" ); - cuda_free( workspace->hc, "hc" ); - cuda_free( workspace->v, "v" ); - - /* CG storage */ - cuda_free( workspace->r, "r" ); - cuda_free( workspace->d, "d" ); - cuda_free( workspace->q, "q" ); - cuda_free( workspace->p, "p" ); - cuda_free( workspace->r2, "r2" ); - cuda_free( workspace->d2, "d2" ); - cuda_free( workspace->q2, "q2" ); - cuda_free( workspace->p2, "p2" ); + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + cuda_free( workspace->y, "y" ); + cuda_free( workspace->z, "z" ); + cuda_free( workspace->g, "g" ); + cuda_free( workspace->h, "h" ); + cuda_free( workspace->hs, "hs" ); + cuda_free( workspace->hc, "hc" ); + cuda_free( workspace->v, "v" ); + break; + + case SDM_S: + break; + + case CG_S: + cuda_free( workspace->r, "r" ); + cuda_free( workspace->d, "d" ); + cuda_free( workspace->q, "q" ); + cuda_free( workspace->p, "p" ); + cuda_free( workspace->r2, "r2" ); + cuda_free( workspace->d2, "d2" ); + cuda_free( workspace->q2, "q2" ); + cuda_free( workspace->p2, "p2" ); + break; + + default: + fprintf( stderr, "Unrecognized QEq solver selection. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } /* integrator storage */ cuda_free( workspace->v_const, "v_const" ); diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index 8452548d..c22058e9 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -24,7 +24,9 @@ #include "cuda_lin_alg.h" #include "cuda_reduction.h" #include "cuda_utils.h" -#include "cuda_validation.h" +#if defined(DEBUG) + #include "cuda_validation.h" +#endif #include "../basic_comm.h" @@ -259,32 +261,114 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); //} -//#ifdef __CUDA_DEBUG__ -// Init_MatVec( system, data, control, workspace, mpi_data ); -//#endif + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + case SDM_S: + fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + case CG_S: + s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H, + dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data, + out_control->log, data ); + t_matvecs = 0; + break; + + + default: + fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } -#if defined(DEBUG) - fprintf( stderr, "p%d: initialized qEq\n", system->my_rank ); - //Print_Linear_System( system, control, workspace, data->step ); + Cuda_Calculate_Charges( system, workspace, mpi_data ); + +#if defined(LOG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + data->timing.s_matvecs += s_matvecs; + data->timing.t_matvecs += t_matvecs; + } #endif +} - //MATRIX CHANGES - s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H, - dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data, - out_control->log, data ); - t_matvecs = 0; - //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs); -#if defined(DEBUG) - fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); -#endif +void Cuda_EE( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + int s_matvecs, t_matvecs; + + Cuda_Init_MatVec( system, workspace ); + + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + case SDM_S: + fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + case CG_S: + s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H, + dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data ); + t_matvecs = 0; + break; + + + default: + fprintf( stderr, "Unrecognized QEq solver selection. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } Cuda_Calculate_Charges( system, workspace, mpi_data ); -#if defined(DEBUG) - fprintf( stderr, "p%d: computed charges\n", system->my_rank ); - //Print_Charges( system ); +#if defined(LOG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + data->timing.s_matvecs += s_matvecs; + data->timing.t_matvecs += t_matvecs; + } #endif +} + + +void Cuda_ACKS2( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + int s_matvecs, t_matvecs; + + Cuda_Init_MatVec( system, workspace ); + + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + case SDM_S: + fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + case CG_S: + s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H, + dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data ); + t_matvecs = 0; + break; + + + default: + fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + + Cuda_Calculate_Charges( system, workspace, mpi_data ); #if defined(LOG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) @@ -294,3 +378,29 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data } #endif } + + +void Cuda_Compute_Charges( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + switch ( control->charge_method ) + { + case QEQ_CM: + Cuda_QEq( system, control, data, workspace, out_control, mpi_data ); + break; + + case EE_CM: + Cuda_EE( system, control, data, workspace, out_control, mpi_data ); + break; + + case ACKS2_CM: + Cuda_ACKS2( system, control, data, workspace, out_control, mpi_data ); + break; + + default: + fprintf( stderr, "Invalid charge method. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } +} diff --git a/PG-PuReMD/src/cuda/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h index d1922a48..dc8c16b6 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.h +++ b/PG-PuReMD/src/cuda/cuda_charges.h @@ -37,7 +37,7 @@ void cuda_charges_st( reax_system *, storage *, real *, real ); void cuda_charges_updateq( reax_system *, real * ); -void Cuda_QEq( reax_system*, control_params*, simulation_data*, +void Cuda_Compute_Charges( reax_system*, control_params*, simulation_data*, storage*, output_controls*, mpi_datatypes* ); diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 28982bff..bc86bf41 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -15,7 +15,9 @@ #include "cuda_torsion_angles.h" #include "cuda_utils.h" #include "cuda_valence_angles.h" -#include "cuda_validation.h" +#if defined(DEBUG) + #include "cuda_validation.h" +#endif #include "../basic_comm.h" #include "../forces.h" @@ -1849,7 +1851,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, #if defined(PURE_REAX) if ( charge_flag == TRUE ) { - Cuda_QEq( system, control, data, workspace, out_control, mpi_data ); + Cuda_Compute_Charges( system, control, data, workspace, out_control, mpi_data ); } #if defined(LOG_PERFORMANCE) diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index b115f287..81b9b3e1 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -10,7 +10,9 @@ #include "cuda_reset_tools.h" #include "cuda_system_props.h" #include "cuda_utils.h" -#include "cuda_validation.h" +#if defined(DEBUG) + #include "cuda_validation.h" +#endif #if defined(PURE_REAX) #include "../box.h" diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu index bb35d181..1b51216a 100644 --- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu @@ -336,7 +336,7 @@ void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count ) } -CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n) +CUDA_GLOBAL void k_diagonal_preconditioner( storage p_workspace, rvec2 *b, int n ) { storage *workspace; int j; @@ -350,15 +350,13 @@ CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n) workspace = &( p_workspace ); - //for( j = 0; j < system->n; ++j ) { - // residual + /* compute residuals */ workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; - // apply diagonal pre-conditioner + /* apply diagonal preconditioner to residuals */ workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; - //} } @@ -370,7 +368,7 @@ void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n ) (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> - (*workspace, b, n); + ( *workspace, b, n ); cudaThreadSynchronize( ); cudaCheckError( ); @@ -396,7 +394,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, alpha[1] = alpha_1; my_dot[j][0] = my_dot[j][1] = 0.0; - //for( j = 0; j < system->n; ++j ) { // update x x[j][0] += alpha[0] * workspace->d2[j][0]; x[j][1] += alpha[1] * workspace->d2[j][1]; @@ -412,7 +409,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, // dot product: r.p my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; - //} } @@ -433,7 +429,7 @@ void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha, cudaThreadSynchronize( ); cudaCheckError( ); - //Reduction to calculate my_dot + /* reduction to calculate my_dot */ k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> ( tmp, tmp + n, n); @@ -515,9 +511,11 @@ void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; - if (i >= n) + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) { return; } @@ -542,9 +540,11 @@ void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n ) CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) + if ( i >= n ) { return; } @@ -651,46 +651,14 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa } #endif - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer ); -//#endif - -// check_zeros_device( x, system->N, "x" ); - copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" ); Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" ); -// check_zeros_device( x, system->N, "x" ); - -// compare_rvec2 (workspace->x, x, N, "x"); -// if (data->step > 0) { -// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b"); -// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x"); -// -// exit (0); -// } - - -//#ifdef __CUDA_DEBUG__ -// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N ); -//#endif //originally we were using only H->n which was system->n (init_md.c) //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap); - Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap ); -// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - -// if (data->step > 0) exit (0); - - // tryQEq - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); -//#endif - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:q2:get" ); Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); @@ -704,63 +672,23 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa } #endif -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // residual -// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0]; -// workspace->r2[j][1] = workspace->b[j][1] 
- workspace->q2[j][1]; -// // apply diagonal pre-conditioner -// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; -// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; -// } -//#endif - Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n ); -// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2"); -// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2"); - - /* norm of b */ -//#ifdef __CUDA_DEBUG__ -// my_sum[0] = my_sum[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_sum[0] += SQR( workspace->b[j][0] ); -// my_sum[1] += SQR( workspace->b[j][1] ); -// } -// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); -//#endif - my_sum[0] = 0; my_sum[1] = 0; Cuda_Norm( b, n, my_sum ); -// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); - MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm ); b_norm[0] = SQRT( norm_sqr[0] ); b_norm[1] = SQRT( norm_sqr[1] ); - //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] ); /* dot product: r.d */ -//#ifdef __CUDA_DEBUG__ -// my_dot[0] = my_dot[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0]; -// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1]; -// } -// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - my_dot[0] = 0; my_dot[1] = 0; Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n ); - -// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -770,53 +698,21 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa for ( i = 1; i < control->cm_solver_max_iters; ++i ) { - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer); -//#endif - copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "cg:d2:get" ); Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "cg:d2:put" ); - //print_device_rvec2 (dev_workspace->d2, N); - -//#ifdef __CUDA_DEBUG__ -// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N ); -//#endif - Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, system->total_cap ); - /* - fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n ); - fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n"); - print_sparse_matrix_host (&workspace->H); - fprintf (stderr, " ******* HOST Vector ***************\n"); - print_host_rvec2 (workspace->d2, system->N); - fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n"); - print_sparse_matrix (&dev_workspace->H); - fprintf (stderr, " ******* Device Vector ***************\n"); - print_device_rvec2 (dev_workspace->d2, system->N); - */ - //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - - // tryQEq - // MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); -//#endif - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "cg:q2:get" ); Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * 
system->total_cap, cudaMemcpyHostToDevice, "cg:q2:put" ); -// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -825,61 +721,23 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa #endif /* dot product: d.q */ -//#ifdef __CUDA_DEBUG__ -// my_dot[0] = my_dot[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0]; -// my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1]; -// } -// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = my_dot[1] = 0; + my_dot[0] = 0; + my_dot[1] = 0; Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n); - //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] ); alpha[0] = sig_new[0] / tmp[0]; alpha[1] = sig_new[1] / tmp[1]; my_dot[0] = 0; my_dot[1] = 0; -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // update x -// workspace->x[j][0] += alpha[0] * workspace->d2[j][0]; -// workspace->x[j][1] += alpha[1] * workspace->d2[j][1]; -// // update residual -// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; -// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; -// // apply diagonal pre-conditioner -// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; -// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; -// // dot product: r.p -// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0]; -// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1]; -// } -// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = 0; - my_dot[1] = 0; Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot ); - //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); - -// compare_rvec2 (workspace->x, dev_workspace->x, N, "x"); -// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2"); -// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2"); - sig_old[0] = sig_new[0]; sig_old[1] = sig_new[1]; MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -895,60 +753,26 @@ int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspa beta[0] = sig_new[0] / sig_old[0]; beta[1] = sig_new[1] / sig_old[1]; -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // d = p + beta * d -// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0]; -// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1]; -// } -//#endif - Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta, dev_workspace->d2, system->n ); - -// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2"); } - if ( SQRT(sig_new[0]) / b_norm[0] <= tol ) { - //for( j = 0; j < n; ++j ) - // workspace->t[j] = workspace->x[j][1]; - //fprintf (stderr, "Getting started with Cuda_CG1 \n"); - Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n ); - //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); - //compare_array (workspace->t, dev_workspace->t, system->n, "t"); - matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_t, tol, dev_workspace->t, mpi_data ); - //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs 
); - //for( j = 0; j < n; ++j ) - // workspace->x[j][1] = workspace->t[j]; - Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n ); } else if ( SQRT(sig_new[1]) / b_norm[1] <= tol ) { - //for( j = 0; j < n; ++j ) - // workspace->s[j] = workspace->x[j][0]; - Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n ); - //compare_array (workspace->s, dev_workspace->s, system->n, "s"); - //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); - - //fprintf (stderr, "Getting started with Cuda_CG2 \n"); - matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_s, tol, dev_workspace->s, mpi_data ); - //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs ); - //for( j = 0; j < system->n; ++j ) - // workspace->x[j][0] = workspace->s[j]; - Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n ); } @@ -983,25 +807,19 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace, scale = sizeof(real) / sizeof(void); - /* x is on the device */ - //MVAPICH2 memset( spad, 0, sizeof(real) * system->total_cap ); copy_host_device( spad, x, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:x:get" ); Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); - //MVAPICH2 copy_host_device( spad, x, sizeof(real) * system->total_cap, cudaMemcpyHostToDevice, "cuda_cg:x:put" ); Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap ); - // tryQEq - // MVAPICH2 copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); - //MVAPICH2 copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, cudaMemcpyHostToDevice, "cuda_cg:q:put" ); @@ -1014,8 +832,7 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace, Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q, system->n ); - //for( j = 0; j < system->n; ++j ) - // workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r, dev_workspace->Hdia_inv, system->n ); @@ -1041,7 +858,6 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace, for ( i = 1; i < control->cm_solver_max_iters && SQRT(sig_new) / b_norm > tol; ++i ) { - //MVAPICH2 copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); @@ -1050,7 +866,6 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace, Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap ); - //tryQEq copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); @@ -1078,9 +893,7 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace, //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n ); Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0, dev_workspace->r, system->n ); - /* pre-conditioning */ - //for( j = 0; j < system->n; ++j ) - // workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r, dev_workspace->Hdia_inv, system->n ); @@ -1092,7 +905,6 @@ int Cuda_CG( reax_system *system, control_params *control, storage *workspace, 
copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n, cudaMemcpyDeviceToHost, "cuda_cg:p:get" ); sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world ); - //fprintf (stderr, "Device: sig_new: %f \n", sig_new ); beta = sig_new / sig_old; Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu index 01bd3c81..49224a06 100644 --- a/PG-PuReMD/src/cuda/cuda_reduction.cu +++ b/PG-PuReMD/src/cuda/cuda_reduction.cu @@ -662,8 +662,8 @@ CUDA_GLOBAL void k_norm_rvec2( const rvec2 *input, rvec2 *per_block_results, } -CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res, - const size_t n) +CUDA_GLOBAL void k_dot_rvec2( const rvec2 *a, rvec2 *b, rvec2 *res, + const size_t n ) { #if defined(__SM_35__) extern __shared__ rvec2 my_dot2[]; @@ -746,11 +746,13 @@ CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res, //vector functions ////////////////////////////////////////////////// CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, - int k ) + int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if( i >= k ) + if ( i >= n ) { return; } @@ -759,11 +761,13 @@ CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, } -CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k ) +CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if( i >= k ) + if ( i >= n ) { return; } @@ -772,11 +776,13 @@ CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k ) } -CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k ) +CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; - if( i >= k ) + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) { return; } @@ -787,12 +793,13 @@ CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k ) CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *dest, rvec2 *a, - real beta0, real beta1, - rvec2 *b, int n ) + real beta0, real beta1, rvec2 *b, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if( i >= n ) + if ( i >= n ) { return; } diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c index d3e295cf..91fbfd17 100644 --- a/PG-PuReMD/src/forces.c +++ b/PG-PuReMD/src/forces.c @@ -800,7 +800,6 @@ int Init_Forces( reax_system *system, control_params *control, { /* put ghost atoms to an infinite distance */ workspace->bond_mark[i] = 1000; - //workspace->done_after[i] = Start_Index( i, far_nbrs ); } H = &(workspace->H); //MATRIX CHANGES @@ -969,8 +968,6 @@ int Init_Forces( reax_system *system, control_params *control, else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) { workspace->bond_mark[i] = workspace->bond_mark[j] + 1; - //if( workspace->bond_mark[i] == 1000 ) - // workspace->done_after[i] = pj; } //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n", @@ -1005,7 +1002,6 @@ int Init_Forces( reax_system *system, control_params *control, /*for( i = system->n; i < system->N; ++i ) { start_i = Start_Index(i, far_nbrs); - end_i = workspace->done_after[i]; if( workspace->bond_mark[i] >= 2 && start_i < end_i ) { atom_i = &(system->my_atoms[i]); @@ -1088,7 +1084,6 @@ int Init_Forces_No_Charges( 
reax_system *system, control_params *control, for ( i = system->n; i < system->N; ++i ) { workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance - //workspace->done_after[i] = Start_Index( i, far_nbrs ); } num_bonds = 0; @@ -1214,15 +1209,13 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control, ++btop_i; if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 ) + { workspace->bond_mark[j] = workspace->bond_mark[i] + 1; + } else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) { workspace->bond_mark[i] = workspace->bond_mark[j] + 1; - //if( workspace->bond_mark[i] == 1000 ) - // workspace->done_after[i] = pj; } - //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n", - // i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] ); } } } diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c index 15d8ad96..5de64a8a 100644 --- a/PG-PuReMD/src/lin_alg.c +++ b/PG-PuReMD/src/lin_alg.c @@ -28,7 +28,7 @@ #include "tool_box.h" #include "vector.h" -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) && defined(DEBUG) #include "cuda/cuda_validation.h" #endif @@ -98,13 +98,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2 } #endif -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) && defined(DEBUG) check_zeros_host( x, N, "x" ); #endif Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer ); -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) && defined(DEBUG) check_zeros_host( x, N, "x" ); #endif diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index 9ace1c32..da907412 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -49,9 +49,12 @@ #include "cuda/cuda_reset_tools.h" #include "cuda/cuda_system_props.h" #include "cuda/cuda_utils.h" - #include "cuda/cuda_validation.h" + #if defined(DEBUG) + #include "cuda/cuda_validation.h" + #endif #endif + evolve_function Evolve; evolve_function Cuda_Evolve; LR_lookup_table *LR; diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h index 192cecbe..80a4dafc 100644 --- a/PG-PuReMD/src/reax_types.h +++ b/PG-PuReMD/src/reax_types.h @@ -1919,8 +1919,6 @@ typedef struct rvec *dDeltap_self; /**/ int *bond_mark; - /**/ - int *done_after; /* charge matrix storage */ /* charge matrix */ diff --git a/environ/parallel_control b/environ/parallel_control index cfe9fa5a..4f6d75b2 100644 --- a/environ/parallel_control +++ b/environ/parallel_control @@ -21,7 +21,7 @@ hbond_cutoff 7.50 ! cutoff distance for hydrogen b charge_method 0 ! charge method: 0 = QEq, 1 = EEM, 2 = ACKS2 charge_freq 1 ! frequency (sim step) at which atomic charges are computed cm_q_net 0.0 ! net system charge -cm_solver_type 0 ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM +cm_solver_type 2 ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM cm_solver_max_iters 1000 ! max solver iterations cm_solver_restart 100 ! inner iterations of GMRES before restarting cm_solver_q_err 1e-6 ! relative residual norm threshold used in solver -- GitLab
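
For reference (not part of the patch): the two dual-CG kernels touched in cuda_lin_alg.cu above implement, per local atom, the residual/preconditioner step and the per-iteration update for both right-hand sides of the QEq solve. A serial C sketch of that math follows; the function names are illustrative only — on the device these loops are fused into k_diagonal_preconditioner and k_dual_cg_preconditioner, and the partial dot products are finished with k_reduction_rvec2 plus MPI_Allreduce inside Cuda_dual_CG.

/* Serial sketch (illustrative, not in the patch) of what the dual-CG
 * kernels compute.  rvec2 carries the two coupled QEq systems (s and t). */
#include <stddef.h>

typedef double real;
typedef real rvec2[2];

/* k_diagonal_preconditioner: r2 = b - q2, then Jacobi step d2 = Hdia_inv * r2 */
static void dual_residual_precond( const rvec2 *b, const rvec2 *q2,
        const real *Hdia_inv, rvec2 *r2, rvec2 *d2, size_t n )
{
    size_t j;

    for ( j = 0; j < n; ++j )
    {
        r2[j][0] = b[j][0] - q2[j][0];
        r2[j][1] = b[j][1] - q2[j][1];

        d2[j][0] = r2[j][0] * Hdia_inv[j];
        d2[j][1] = r2[j][1] * Hdia_inv[j];
    }
}

/* k_dual_cg_preconditioner: update solution and residual, re-apply the
 * Jacobi preconditioner, and accumulate the r.p dot products (my_dot);
 * the accumulation is serial here, whereas the kernel stores per-thread
 * partials that are reduced afterwards */
static void dual_cg_update( rvec2 *x, rvec2 *r2, rvec2 *p2, const rvec2 *d2,
        const rvec2 *q2, const real *Hdia_inv, const rvec2 alpha,
        rvec2 my_dot, size_t n )
{
    size_t j;

    my_dot[0] = 0.0;
    my_dot[1] = 0.0;

    for ( j = 0; j < n; ++j )
    {
        x[j][0] += alpha[0] * d2[j][0];
        x[j][1] += alpha[1] * d2[j][1];

        r2[j][0] -= alpha[0] * q2[j][0];
        r2[j][1] -= alpha[1] * q2[j][1];

        p2[j][0] = r2[j][0] * Hdia_inv[j];
        p2[j][1] = r2[j][1] * Hdia_inv[j];

        my_dot[0] += r2[j][0] * p2[j][0];
        my_dot[1] += r2[j][1] * p2[j][1];
    }
}

This also motivates the allocation changes in cuda_allocate.cu: the r2/d2/q2/p2 vectors (and Hdia_inv for the diagonal preconditioner) are needed only on the CG solver path, so they are now allocated and freed conditionally on cm_solver_type and cm_solver_pre_comp_type.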