diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am index d16fd2499993a8a324e414f265d14da31057eea0..6c23c15d9ae6af2cb62f9621acea4c6f23b914f8 100644 --- a/PG-PuReMD/Makefile.am +++ b/PG-PuReMD/Makefile.am @@ -42,7 +42,7 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \ src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \ src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \ - src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu + src/cuda/cuda_init_md.cu src/cuda/cuda_lookup.cu include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \ src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \ src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h src/cuda/cuda_list.h \ @@ -52,7 +52,12 @@ include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \ src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \ src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \ src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \ - src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h + src/cuda/cuda_init_md.h src/cuda/cuda_lookup.h + +if DEBUG +bin_pg_puremd_SOURCES += src/cuda/cuda_validation.cu +include_HEADERS += src/cuda/cuda_validation.h +endif # dummy source to cause C linking nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac index 6e9f3fb5b91f9727f5762fc23bf9d2d400a10ee4..5a5ac9e11e3d9d2849bf9e39b83f99de7bd7db0d 100644 --- a/PG-PuReMD/configure.ac +++ b/PG-PuReMD/configure.ac @@ -133,18 +133,20 @@ if test "x$BUILD_GPU" = "xyes"; then NVCCFLAGS= NVCCFLAGS+=" -ccbin=$CXX" - if test "BUILD_DEBUG" = "true" + if test "x${BUILD_DEBUG}" = "xtrue" then NVCCFLAGS+=" -g -G" + fi AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.]) else AM_CONDITIONAL(USE_CUDA, test "x" = "xyes") fi +AM_CONDITIONAL(DEBUG, test "x${BUILD_DEBUG}" = "xtrue") if test "BUILD_PROF" = "true" then - if test "x$BUILD_GPU" = "xyes"; then + if test "x${BUILD_GPU}" = "xyes"; then NVCCFLAGS+=" --compiler-options ${gprof_flags}" else CFLAGS+=" ${gprof_flags}" diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c index 54614694e37590dce079f9edb6c47a150c7baebe..c2de48939bf36905db729e0d3abc36c6460eb7d9 100644 --- a/PG-PuReMD/src/allocate.c +++ b/PG-PuReMD/src/allocate.c @@ -89,7 +89,7 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) return; } - workspace->allocated = 0; + workspace->allocated = FALSE; /* communication storage */ for ( i = 0; i < MAX_NBRS; ++i ) @@ -117,7 +117,6 @@ void DeAllocate_Workspace( control_params *control, storage *workspace ) sfree( workspace->Clp, "Clp" ); sfree( workspace->vlpex, "vlpex" ); sfree( workspace->bond_mark, "bond_mark" ); - sfree( workspace->done_after, "done_after" ); /* QEq storage */ sfree( workspace->Hdia_inv, "Hdia_inv" ); @@ -213,7 +212,7 @@ void Allocate_Workspace( reax_system *system, control_params *control, { int i, total_real, total_rvec, local_rvec; - workspace->allocated = 1; + workspace->allocated = TRUE; total_real = total_cap * sizeof(real); total_rvec = total_cap * sizeof(rvec); local_rvec = local_cap * sizeof(rvec); @@ -245,9 +244,25 @@ void Allocate_Workspace( reax_system *system, control_params *control, workspace->Clp = (real*) smalloc( total_real, "Clp" ); 
workspace->vlpex = (real*) smalloc( total_real, "vlpex" ); workspace->bond_mark = (int*) scalloc(total_cap, sizeof(int), "bond_mark"); - workspace->done_after = (int*) scalloc(total_cap, sizeof(int), "done_after"); - /* QEq storage */ + /* charge method storage */ + switch ( control->charge_method ) + { + case QEQ_CM: + system->N_cm = system->N; + break; + case EE_CM: + system->N_cm = system->N + 1; + break; + case ACKS2_CM: + system->N_cm = 2 * system->N + 2; + break; + default: + fprintf( stderr, "[ERROR] Unknown charge method type. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + workspace->Hdia_inv = (real*) scalloc( total_cap, sizeof(real), "Hdia_inv" ); workspace->b_s = (real*) scalloc( total_cap, sizeof(real), "b_s" ); workspace->b_t = (real*) scalloc( total_cap, sizeof(real), "b_t" ); @@ -255,39 +270,56 @@ void Allocate_Workspace( reax_system *system, control_params *control, workspace->b_prm = (real*) scalloc( total_cap, sizeof(real), "b_prm" ); workspace->s = (real*) scalloc( total_cap, sizeof(real), "s" ); workspace->t = (real*) scalloc( total_cap, sizeof(real), "t" ); - workspace->droptol = (real*) scalloc( total_cap, sizeof(real), "droptol" ); + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + workspace->droptol = (real*) scalloc( total_cap, sizeof(real), "droptol" ); + } workspace->b = (rvec2*) scalloc( total_cap, sizeof(rvec2), "b" ); workspace->x = (rvec2*) scalloc( total_cap, sizeof(rvec2), "x" ); - /* GMRES storage */ - workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y" ); - workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z" ); - workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g" ); - //SUHDIR - //workspace->h = (real**) scalloc( RESTART+1, sizeof(real*), "h" ); - workspace->h = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "h"); - workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs" ); - workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc" ); - //SUDHIR - //workspace->v = (real**) scalloc( RESTART+1, sizeof(real*), "v" ); - workspace->v = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "v"); - - /* - for( i = 0; i < RESTART+1; ++i ) { - workspace->h[i] = (real*) scalloc( RESTART+1, sizeof(real), "h[i]" ); - workspace->v[i] = (real*) scalloc( total_cap, sizeof(real), "v[i]" ); + switch ( control->cm_solver_type ) + { + /* GMRES storage */ + case GMRES_S: + case GMRES_H_S: + workspace->y = (real*) scalloc( RESTART + 1, sizeof(real), "y" ); + workspace->z = (real*) scalloc( RESTART + 1, sizeof(real), "z" ); + workspace->g = (real*) scalloc( RESTART + 1, sizeof(real), "g" ); + workspace->h = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "h"); + workspace->hs = (real*) scalloc( RESTART + 1, sizeof(real), "hs" ); + workspace->hc = (real*) scalloc( RESTART + 1, sizeof(real), "hc" ); + workspace->v = (real *) scalloc ( (RESTART + 1) * (RESTART + 1), sizeof (real), "v"); + break; + + /* CG storage */ + case CG_S: + workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" ); + workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" ); + workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" ); + workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" ); + workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" ); + workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" ); + workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" ); + workspace->p2 
= (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" ); + break; + + case SDM_S: + workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" ); + workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" ); + workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" ); + workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" ); + workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" ); + workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" ); + workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" ); + workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" ); + break; + + default: + fprintf( stderr, "Unknown charge method linear solver type. Terminating...\n" ); + exit( INVALID_INPUT ); + break; } - */ - - /* CG storage */ - workspace->r = (real*) scalloc( total_cap, sizeof(real), "r" ); - workspace->d = (real*) scalloc( total_cap, sizeof(real), "d" ); - workspace->q = (real*) scalloc( total_cap, sizeof(real), "q" ); - workspace->p = (real*) scalloc( total_cap, sizeof(real), "p" ); - workspace->r2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "r2" ); - workspace->d2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "d2" ); - workspace->q2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "q2" ); - workspace->p2 = (rvec2*) scalloc( total_cap, sizeof(rvec2), "p2" ); /* integrator storage */ workspace->v_const = (rvec*) smalloc( local_rvec, "v_const" ); diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c index 8f53b65d4b49a95fc68f4acf7698802e650137df..791804f6485ce0837a19a64dacc3ad395b812568 100644 --- a/PG-PuReMD/src/charges.c +++ b/PG-PuReMD/src/charges.c @@ -258,12 +258,12 @@ void ICHOLT( sparse_matrix *A, real *droptol, void Init_MatVec( reax_system *system, simulation_data *data, - control_params *control, storage *workspace, mpi_datatypes *mpi_data ) + control_params *control, storage *workspace, mpi_datatypes *mpi_data ) { int i; //, fillin; reax_atom *atom; -// if( (data->step - data->prev_steps) % control->refactor == 0 || +// if( (data->step - data->prev_steps) % control->cm_solver_pre_comp_refactor == 0 || // workspace->L == NULL ) // { //// Print_Linear_System( system, control, workspace, data->step ); @@ -308,12 +308,8 @@ void Init_MatVec( reax_system *system, simulation_data *data, { atom = &( system->my_atoms[i] ); - /* init pre-conditioner for H and init solution vectors */ + /* initialize diagonal inverse preconditioner vectors */ workspace->Hdia_inv[i] = 1. / system->reax_param.sbp[ atom->type ].eta; - workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi; - workspace->b_t[i] = -1.0; - workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi; - workspace->b[i][1] = -1.0; /* linear extrapolation for s and for t */ // newQEq: no extrapolation! 
@@ -334,6 +330,73 @@ void Init_MatVec( reax_system *system, simulation_data *data,
 //      fprintf(stderr, "i=%d s=%f t=%f\n", i, workspace->s[i], workspace->t[i]);
     }
+
+    /* initialize solution vectors for linear solves in charge method */
+    switch ( control->charge_method )
+    {
+        case QEQ_CM:
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+                workspace->b_t[i] = -1.0;
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+                workspace->b[i][1] = -1.0;
+            }
+            break;
+
+        case EE_CM:
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+            }
+
+            if ( system->my_rank == 0 )
+            {
+                workspace->b_s[system->n] = control->cm_q_net;
+                workspace->b[system->n][0] = control->cm_q_net;
+            }
+            break;
+
+        case ACKS2_CM:
+            for ( i = 0; i < system->n; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = -system->reax_param.sbp[ atom->type ].chi;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = -system->reax_param.sbp[ atom->type ].chi;
+            }
+
+            /* last row of the charge matrix: net charge constraint */
+            if ( system->my_rank == 0 )
+            {
+                workspace->b_s[system->n] = control->cm_q_net;
+                workspace->b[system->n][0] = control->cm_q_net;
+            }
+
+            for ( i = system->n + 1; i < system->N_cm; ++i )
+            {
+                atom = &( system->my_atoms[i] );
+
+                workspace->b_s[i] = 0.0;
+
+                //TODO: check if unused (redundant)
+                workspace->b[i][0] = 0.0;
+            }
+            break;
+
+        default:
+            fprintf( stderr, "Unknown charge method type. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } } @@ -359,7 +428,7 @@ void Calculate_Charges( reax_system *system, storage *workspace, } #if defined(DEBUG) - fprintf (stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1]); + fprintf( stderr, "Host : my_sum[0]: %f and %f \n", my_sum[0], my_sum[1] ); #endif MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world ); @@ -367,7 +436,7 @@ void Calculate_Charges( reax_system *system, storage *workspace, u = all_sum[0] / all_sum[1]; #if defined(DEBUG) - fprintf (stderr, "Host : u: %f \n", u); + fprintf( stderr, "Host : u: %f \n", u ); #endif for ( i = 0; i < system->n; ++i ) @@ -421,14 +490,14 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, //MATRIX CHANGES s_matvecs = dual_CG( system, workspace, &workspace->H, workspace->b, - control->q_err, workspace->x, mpi_data, out_control->log, data ); + control->cm_solver_q_err, workspace->x, mpi_data, out_control->log, data ); t_matvecs = 0; //fprintf (stderr, "Host: First CG complated with iterations: %d \n", s_matvecs); //s_matvecs = CG(system, workspace, workspace->H, workspace->b_s, //newQEq sCG - // control->q_err, workspace->s, mpi_data, out_control->log ); + // control->cm_solver_q_err, workspace->s, mpi_data, out_control->log ); //s_matvecs = PCG( system, workspace, workspace->H, workspace->b_s, - // control->q_err, workspace->L, workspace->U, workspace->s, + // control->cm_solver_q_err, workspace->L, workspace->U, workspace->s, // mpi_data, out_control->log ); #if defined(DEBUG) @@ -436,9 +505,9 @@ void QEq( reax_system *system, control_params *control, simulation_data *data, #endif //t_matvecs = CG(system, workspace, workspace->H, workspace->b_t, //newQEq sCG - // control->q_err, workspace->t, mpi_data, out_control->log ); + // control->cm_solver_q_err, workspace->t, mpi_data, out_control->log ); //t_matvecs = PCG( system, workspace, workspace->H, workspace->b_t, - // control->q_err, workspace->L, workspace->U, workspace->t, + // control->cm_solver_q_err, workspace->L, workspace->U, workspace->t, // mpi_data, out_control->log ); #if defined(DEBUG) diff --git a/PG-PuReMD/src/control.c b/PG-PuReMD/src/control.c index 268b6a1750874a63f3fbbc3a7e339582f5b231f4..5cb8ce6f1099fde86343d81d8c7b147d41bc4864 100644 --- a/PG-PuReMD/src/control.c +++ b/PG-PuReMD/src/control.c @@ -47,7 +47,7 @@ char Read_Control_File( char *control_file, control_params* control, } /* assign default values */ - strcpy( control->sim_name, "simulate" ); + strcpy( control->sim_name, "default.sim" ); control->ensemble = NVE; control->nsteps = 0; control->dt = 0.25; @@ -77,10 +77,21 @@ char Read_Control_File( char *control_file, control_params* control, control->tabulate = 0; + control->charge_method = QEQ_CM; control->charge_freq = 1; - control->q_err = 1e-6; - control->refactor = 100; - control->droptol = 1e-2;; + control->cm_q_net = 0.0; + control->cm_solver_type = GMRES_S; + control->cm_solver_max_iters = 100; + control->cm_solver_restart = 50; + control->cm_solver_q_err = 0.000001; + control->cm_domain_sparsify_enabled = FALSE; + control->cm_domain_sparsity = 1.0; + control->cm_solver_pre_comp_type = DIAG_PC; + control->cm_solver_pre_comp_sweeps = 3; + control->cm_solver_pre_comp_refactor = 100; + control->cm_solver_pre_comp_droptol = 0.01; + control->cm_solver_pre_app_type = TRI_SOLVE_PA; + control->cm_solver_pre_app_jacobi_iters = 50; control->T_init = 0.; control->T_final = 300.; @@ -247,25 +258,79 @@ char Read_Control_File( char *control_file, 
control_params* control, ival = atoi( tmp[1] ); control->tabulate = ival; } + else if ( strcmp(tmp[0], "charge_method") == 0 ) + { + ival = atoi( tmp[1] ); + control->charge_method = ival; + } else if ( strcmp(tmp[0], "charge_freq") == 0 ) { ival = atoi( tmp[1] ); control->charge_freq = ival; } - else if ( strcmp(tmp[0], "q_err") == 0 ) + else if ( strcmp(tmp[0], "cm_q_net") == 0 ) + { + val = atof( tmp[1] ); + control->cm_q_net = val; + } + else if ( strcmp(tmp[0], "cm_solver_type") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_type = ival; + } + else if ( strcmp(tmp[0], "cm_solver_max_iters") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_max_iters = ival; + } + else if ( strcmp(tmp[0], "cm_solver_restart") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_restart = ival; + } + else if ( strcmp(tmp[0], "cm_solver_q_err") == 0 ) + { + val = atof( tmp[1] ); + control->cm_solver_q_err = val; + } + else if ( strcmp(tmp[0], "cm_domain_sparsity") == 0 ) { val = atof( tmp[1] ); - control->q_err = val; + control->cm_domain_sparsity = val; + if ( val < 1.0 ) + { + control->cm_domain_sparsify_enabled = TRUE; + } } - else if ( strcmp(tmp[0], "ilu_refactor") == 0 ) + else if ( strcmp(tmp[0], "cm_solver_pre_comp_type") == 0 ) { ival = atoi( tmp[1] ); - control->refactor = ival; + control->cm_solver_pre_comp_type = ival; } - else if ( strcmp(tmp[0], "ilu_droptol") == 0 ) + else if ( strcmp(tmp[0], "cm_solver_pre_comp_refactor") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_pre_comp_refactor = ival; + } + else if ( strcmp(tmp[0], "cm_solver_pre_comp_droptol") == 0 ) { val = atof( tmp[1] ); - control->droptol = val; + control->cm_solver_pre_comp_droptol = val; + } + else if ( strcmp(tmp[0], "cm_solver_pre_comp_sweeps") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_pre_comp_sweeps = ival; + } + else if ( strcmp(tmp[0], "cm_solver_pre_app_type") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_pre_app_type = ival; + } + else if ( strcmp(tmp[0], "cm_solver_pre_app_jacobi_iters") == 0 ) + { + ival = atoi( tmp[1] ); + control->cm_solver_pre_app_jacobi_iters = ival; } else if ( strcmp(tmp[0], "temp_init") == 0 ) { diff --git a/PG-PuReMD/src/cub b/PG-PuReMD/src/cub index 01347a797c620618d09e7d2d90bce4be4c42513e..d622848f9fb62f13e5e064e1deb43b6bcbb12bad 160000 --- a/PG-PuReMD/src/cub +++ b/PG-PuReMD/src/cub @@ -1 +1 @@ -Subproject commit 01347a797c620618d09e7d2d90bce4be4c42513e +Subproject commit d622848f9fb62f13e5e064e1deb43b6bcbb12bad diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index 5df47e6f4fd55744b44b190d011f2e8a5b04f617..d683bba502aff5eae65bbd0f7e064768b62725d2 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -358,7 +358,6 @@ void dev_alloc_workspace( reax_system *system, control_params *control, */ /* bond order related storage */ - cuda_malloc( (void **) &workspace->within_bond_box, total_cap * sizeof (int), TRUE, "skin" ); cuda_malloc( (void **) &workspace->total_bond_order, total_real, TRUE, "total_bo" ); cuda_malloc( (void **) &workspace->Deltap, total_real, TRUE, "Deltap" ); cuda_malloc( (void **) &workspace->Deltap_boc, total_real, TRUE, "Deltap_boc" ); @@ -375,39 +374,58 @@ void dev_alloc_workspace( reax_system *system, control_params *control, cuda_malloc( (void **) &workspace->Clp, total_real, TRUE, "Clp" ); cuda_malloc( (void **) &workspace->vlpex, total_real, TRUE, "vlpex" ); cuda_malloc( (void **) &workspace->bond_mark, total_real, TRUE, "bond_mark" 
); - cuda_malloc( (void **) &workspace->done_after, total_real, TRUE, "done_after" ); - /* charge matrix storage */ - cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" ); + if ( control->cm_solver_pre_comp_type == DIAG_PC ) + { + cuda_malloc( (void **) &workspace->Hdia_inv, total_cap * sizeof(real), TRUE, "Hdia_inv" ); + } cuda_malloc( (void **) &workspace->b_s, total_cap * sizeof(real), TRUE, "b_s" ); cuda_malloc( (void **) &workspace->b_t, total_cap * sizeof(real), TRUE, "b_t" ); cuda_malloc( (void **) &workspace->b_prc, total_cap * sizeof(real), TRUE, "b_prc" ); cuda_malloc( (void **) &workspace->b_prm, total_cap * sizeof(real), TRUE, "b_prm" ); cuda_malloc( (void **) &workspace->s, total_cap * sizeof(real), TRUE, "s" ); cuda_malloc( (void **) &workspace->t, total_cap * sizeof(real), TRUE, "t" ); - cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" ); + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + cuda_malloc( (void **) &workspace->droptol, total_cap * sizeof(real), TRUE, "droptol" ); + } cuda_malloc( (void **) &workspace->b, total_cap * sizeof(rvec2), TRUE, "b" ); cuda_malloc( (void **) &workspace->x, total_cap * sizeof(rvec2), TRUE, "x" ); - /* GMRES storage */ - cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" ); - cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" ); - cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" ); - cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" ); - cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" ); - cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" ); - cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" ); - - /* CG storage */ - cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" ); - cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" ); - cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" ); - cuda_malloc( (void **) &workspace->p, total_cap * sizeof(real), TRUE, "p" ); - cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" ); - cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" ); - cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" ); - cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" ); + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + cuda_malloc( (void **) &workspace->y, (RESTART+1)*sizeof(real), TRUE, "y" ); + cuda_malloc( (void **) &workspace->z, (RESTART+1)*sizeof(real), TRUE, "z" ); + cuda_malloc( (void **) &workspace->g, (RESTART+1)*sizeof(real), TRUE, "g" ); + cuda_malloc( (void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "h" ); + cuda_malloc( (void **) &workspace->hs, (RESTART+1)*sizeof(real), TRUE, "hs" ); + cuda_malloc( (void **) &workspace->hc, (RESTART+1)*sizeof(real), TRUE, "hc" ); + cuda_malloc( (void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof(real), TRUE, "v" ); + break; + + case SDM_S: + break; + + case CG_S: + cuda_malloc( (void **) &workspace->r, total_cap * sizeof(real), TRUE, "r" ); + cuda_malloc( (void **) &workspace->d, total_cap * sizeof(real), TRUE, "d" ); + cuda_malloc( (void **) &workspace->q, total_cap * sizeof(real), TRUE, "q" ); + cuda_malloc( (void **) &workspace->p, 
total_cap * sizeof(real), TRUE, "p" ); + cuda_malloc( (void **) &workspace->r2, total_cap * sizeof(rvec2), TRUE, "r2" ); + cuda_malloc( (void **) &workspace->d2, total_cap * sizeof(rvec2), TRUE, "d2" ); + cuda_malloc( (void **) &workspace->q2, total_cap * sizeof(rvec2), TRUE, "q2" ); + cuda_malloc( (void **) &workspace->p2, total_cap * sizeof(rvec2), TRUE, "p2" ); + break; + + default: + fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } /* integrator storage */ cuda_malloc( (void **) &workspace->v_const, local_rvec, TRUE, "v_const" ); @@ -458,7 +476,6 @@ void dev_dealloc_workspace( control_params *control, storage *workspace ) */ /* bond order related storage */ - cuda_free( workspace->within_bond_box, "skin" ); cuda_free( workspace->total_bond_order, "total_bo" ); cuda_free( workspace->Deltap, "Deltap" ); cuda_free( workspace->Deltap_boc, "Deltap_boc" ); @@ -475,38 +492,58 @@ void dev_dealloc_workspace( control_params *control, storage *workspace ) cuda_free( workspace->Clp, "Clp" ); cuda_free( workspace->vlpex, "vlpex" ); cuda_free( workspace->bond_mark, "bond_mark" ); - cuda_free( workspace->done_after, "done_after" ); /* charge matrix storage */ - cuda_free( workspace->Hdia_inv, "Hdia_inv" ); + if ( control->cm_solver_pre_comp_type == DIAG_PC ) + { + cuda_free( workspace->Hdia_inv, "Hdia_inv" ); + } cuda_free( workspace->b_s, "b_s" ); cuda_free( workspace->b_t, "b_t" ); cuda_free( workspace->b_prc, "b_prc" ); cuda_free( workspace->b_prm, "b_prm" ); cuda_free( workspace->s, "s" ); cuda_free( workspace->t, "t" ); - cuda_free( workspace->droptol, "droptol" ); + if ( control->cm_solver_pre_comp_type == ICHOLT_PC || + control->cm_solver_pre_comp_type == ILUT_PAR_PC ) + { + cuda_free( workspace->droptol, "droptol" ); + } cuda_free( workspace->b, "b" ); cuda_free( workspace->x, "x" ); - /* GMRES storage */ - cuda_free( workspace->y, "y" ); - cuda_free( workspace->z, "z" ); - cuda_free( workspace->g, "g" ); - cuda_free( workspace->h, "h" ); - cuda_free( workspace->hs, "hs" ); - cuda_free( workspace->hc, "hc" ); - cuda_free( workspace->v, "v" ); - - /* CG storage */ - cuda_free( workspace->r, "r" ); - cuda_free( workspace->d, "d" ); - cuda_free( workspace->q, "q" ); - cuda_free( workspace->p, "p" ); - cuda_free( workspace->r2, "r2" ); - cuda_free( workspace->d2, "d2" ); - cuda_free( workspace->q2, "q2" ); - cuda_free( workspace->p2, "p2" ); + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + cuda_free( workspace->y, "y" ); + cuda_free( workspace->z, "z" ); + cuda_free( workspace->g, "g" ); + cuda_free( workspace->h, "h" ); + cuda_free( workspace->hs, "hs" ); + cuda_free( workspace->hc, "hc" ); + cuda_free( workspace->v, "v" ); + break; + + case SDM_S: + break; + + case CG_S: + cuda_free( workspace->r, "r" ); + cuda_free( workspace->d, "d" ); + cuda_free( workspace->q, "q" ); + cuda_free( workspace->p, "p" ); + cuda_free( workspace->r2, "r2" ); + cuda_free( workspace->d2, "d2" ); + cuda_free( workspace->q2, "q2" ); + cuda_free( workspace->p2, "p2" ); + break; + + default: + fprintf( stderr, "Unrecognized QEq solver selection. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } /* integrator storage */ cuda_free( workspace->v_const, "v_const" ); diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index 9ea976537c2527ae52616edab5e5b367615f81e1..c22058e9e81af68be9e7da236ca27c8e62eae10c 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -24,7 +24,9 @@ #include "cuda_lin_alg.h" #include "cuda_reduction.h" #include "cuda_utils.h" -#include "cuda_validation.h" +#if defined(DEBUG) + #include "cuda_validation.h" +#endif #include "../basic_comm.h" @@ -259,32 +261,114 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); //} -//#ifdef __CUDA_DEBUG__ -// Init_MatVec( system, data, control, workspace, mpi_data ); -//#endif + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + case SDM_S: + fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + case CG_S: + s_matvecs = Cuda_dual_CG( system, control, workspace, &dev_workspace->H, + dev_workspace->b, control->cm_solver_q_err, dev_workspace->x, mpi_data, + out_control->log, data ); + t_matvecs = 0; + break; + + + default: + fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } -#if defined(DEBUG) - fprintf( stderr, "p%d: initialized qEq\n", system->my_rank ); - //Print_Linear_System( system, control, workspace, data->step ); + Cuda_Calculate_Charges( system, workspace, mpi_data ); + +#if defined(LOG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + data->timing.s_matvecs += s_matvecs; + data->timing.t_matvecs += t_matvecs; + } #endif +} - //MATRIX CHANGES - s_matvecs = Cuda_dual_CG( system, workspace, &dev_workspace->H, - dev_workspace->b, control->q_err, dev_workspace->x, mpi_data, - out_control->log, data ); - t_matvecs = 0; - //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs); -#if defined(DEBUG) - fprintf( stderr, "p%d: first CG completed\n", system->my_rank ); -#endif +void Cuda_EE( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + int s_matvecs, t_matvecs; + + Cuda_Init_MatVec( system, workspace ); + + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + case SDM_S: + fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + case CG_S: + s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H, + dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data ); + t_matvecs = 0; + break; + + + default: + fprintf( stderr, "Unrecognized QEq solver selection. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } Cuda_Calculate_Charges( system, workspace, mpi_data ); -#if defined(DEBUG) - fprintf( stderr, "p%d: computed charges\n", system->my_rank ); - //Print_Charges( system ); +#if defined(LOG_PERFORMANCE) + if ( system->my_rank == MASTER_NODE ) + { + data->timing.s_matvecs += s_matvecs; + data->timing.t_matvecs += t_matvecs; + } #endif +} + + +void Cuda_ACKS2( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + int s_matvecs, t_matvecs; + + Cuda_Init_MatVec( system, workspace ); + + switch ( control->cm_solver_type ) + { + case GMRES_S: + case GMRES_H_S: + case SDM_S: + fprintf( stderr, "Unsupported QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + + case CG_S: + s_matvecs = Cuda_CG( system, control, workspace, &dev_workspace->H, + dev_workspace->b_s, control->cm_solver_q_err, dev_workspace->s, mpi_data ); + t_matvecs = 0; + break; + + + default: + fprintf( stderr, "Unrecognized QEq solver selection. Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } + + Cuda_Calculate_Charges( system, workspace, mpi_data ); #if defined(LOG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) @@ -294,3 +378,29 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data } #endif } + + +void Cuda_Compute_Charges( reax_system *system, control_params *control, simulation_data + *data, storage *workspace, output_controls *out_control, mpi_datatypes + *mpi_data ) +{ + switch ( control->charge_method ) + { + case QEQ_CM: + Cuda_QEq( system, control, data, workspace, out_control, mpi_data ); + break; + + case EE_CM: + Cuda_EE( system, control, data, workspace, out_control, mpi_data ); + break; + + case ACKS2_CM: + Cuda_ACKS2( system, control, data, workspace, out_control, mpi_data ); + break; + + default: + fprintf( stderr, "Invalid charge method. 
Terminating...\n" ); + exit( INVALID_INPUT ); + break; + } +} diff --git a/PG-PuReMD/src/cuda/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h index d1922a48b83a6df1b27a14efb38e63d153f89111..dc8c16b601820def6227da5710662305dee7c49b 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.h +++ b/PG-PuReMD/src/cuda/cuda_charges.h @@ -37,7 +37,7 @@ void cuda_charges_st( reax_system *, storage *, real *, real ); void cuda_charges_updateq( reax_system *, real * ); -void Cuda_QEq( reax_system*, control_params*, simulation_data*, +void Cuda_Compute_Charges( reax_system*, control_params*, simulation_data*, storage*, output_controls*, mpi_datatypes* ); diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index 4960038cb48d0c2c2a127f05ad1146d63f7ebf8a..bc86bf41cc30e7a7d43fd57c2585991db34c8263 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -15,7 +15,9 @@ #include "cuda_torsion_angles.h" #include "cuda_utils.h" #include "cuda_valence_angles.h" -#include "cuda_validation.h" +#if defined(DEBUG) + #include "cuda_validation.h" +#endif #include "../basic_comm.h" #include "../forces.h" @@ -212,7 +214,7 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, int local; int my_bonds, my_hbonds, my_cm_entries; real cutoff; - real r_ij, r2; + real r_ij; real C12, C34, C56; real BO, BO_s, BO_pi, BO_pi2; single_body_parameters *sbp_i, *sbp_j; @@ -334,8 +336,6 @@ CUDA_GLOBAL void k_estimate_storages( reax_atom *my_atoms, /* uncorrected bond orders */ if ( nbr_pj->d <= control->bond_cut ) { - r2 = SQR( r_ij ); - if ( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0 ) { C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); @@ -1074,6 +1074,7 @@ CUDA_GLOBAL void k_update_hbonds( reax_atom *my_atoms, reax_list hbonds, int n ) } +#if defined(DEBUG) CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n ) { int i; @@ -1151,6 +1152,7 @@ static void Print_HBonds( reax_system *system, int step ) cudaThreadSynchronize( ); cudaCheckError( ); } +#endif CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, @@ -1345,7 +1347,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, simulation_data *data, storage *workspace, reax_list **lists, output_controls *out_control ) { - int hbs, hnbrs_blocks, update_energy, ret; + int update_energy, ret; +// int hbs, hnbrs_blocks; int *thbody; static int compute_bonded_part1 = FALSE; real *spad = (real *) scratch; @@ -1643,8 +1646,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cuda_memset( spad, 0, 2 * sizeof(real) * system->n + sizeof(rvec) * system->n * 2, "scratch" ); - hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + - (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); +// hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + +// (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>> // Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, @@ -1694,8 +1697,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaCheckError( ); /* post process step2 */ - hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) + - (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 
0 : 1); +// hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) + +// (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1); Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>> ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) ); @@ -1848,7 +1851,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, #if defined(PURE_REAX) if ( charge_flag == TRUE ) { - Cuda_QEq( system, control, data, workspace, out_control, mpi_data ); + Cuda_Compute_Charges( system, control, data, workspace, out_control, mpi_data ); } #if defined(LOG_PERFORMANCE) @@ -1898,7 +1901,7 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: total forces computed\n", system->my_rank, data->step ); - //Print_Total_Force( system, data, workspace ); +// Print_Total_Force( system, data, workspace ); MPI_Barrier( MPI_COMM_WORLD ); #endif diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu index b115f2870de362525282cbaea220c762b706baf1..81b9b3e13f82f5eb0755fc4c3980738e99be324b 100644 --- a/PG-PuReMD/src/cuda/cuda_init_md.cu +++ b/PG-PuReMD/src/cuda/cuda_init_md.cu @@ -10,7 +10,9 @@ #include "cuda_reset_tools.h" #include "cuda_system_props.h" #include "cuda_utils.h" -#include "cuda_validation.h" +#if defined(DEBUG) + #include "cuda_validation.h" +#endif #if defined(PURE_REAX) #include "../box.h" diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu index d47a08562b9993f8cd2d760069eeb802a010bd8f..1b51216ac4f142c328c6e1469d3661e1a7bb7ebd 100644 --- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu @@ -336,7 +336,7 @@ void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count ) } -CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n) +CUDA_GLOBAL void k_diagonal_preconditioner( storage p_workspace, rvec2 *b, int n ) { storage *workspace; int j; @@ -350,15 +350,13 @@ CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n) workspace = &( p_workspace ); - //for( j = 0; j < system->n; ++j ) { - // residual + /* compute residuals */ workspace->r2[j][0] = b[j][0] - workspace->q2[j][0]; workspace->r2[j][1] = b[j][1] - workspace->q2[j][1]; - // apply diagonal pre-conditioner + /* apply diagonal preconditioner to residuals */ workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; - //} } @@ -370,7 +368,7 @@ void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n ) (( n % DEF_BLOCK_SIZE == 0) ? 
0 : 1); k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>> - (*workspace, b, n); + ( *workspace, b, n ); cudaThreadSynchronize( ); cudaCheckError( ); @@ -396,7 +394,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, alpha[1] = alpha_1; my_dot[j][0] = my_dot[j][1] = 0.0; - //for( j = 0; j < system->n; ++j ) { // update x x[j][0] += alpha[0] * workspace->d2[j][0]; x[j][1] += alpha[1] * workspace->d2[j][1]; @@ -412,7 +409,6 @@ CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, // dot product: r.p my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0]; my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1]; - //} } @@ -433,7 +429,7 @@ void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha, cudaThreadSynchronize( ); cudaCheckError( ); - //Reduction to calculate my_dot + /* reduction to calculate my_dot */ k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>> ( tmp, tmp + n, n); @@ -515,9 +511,11 @@ void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n) CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; - if (i >= n) + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) { return; } @@ -542,9 +540,11 @@ void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n ) CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= n) + if ( i >= n ) { return; } @@ -598,35 +598,35 @@ void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size ) void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size ) { - int blocks; +// int blocks; - blocks = (n / DEF_BLOCK_SIZE) + - (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1); +// blocks = (n / DEF_BLOCK_SIZE) + +// (( n % DEF_BLOCK_SIZE) == 0 ? 
0 : 1); cuda_memset( b, 0, sizeof(real) * size, "dual_matvec:result" ); - //one thread per row implementation - //k_matvec <<< blocks, DEF_BLOCK_SIZE >>> - // (*H, a, b, n); - //cudaThreadSynchronize (); - //cudaCheckError (); + /* one thread per row implementation */ +// k_matvec <<< blocks, DEF_BLOCK_SIZE >>> +// ( *H, a, b, n ); +// cudaThreadSynchronize( ); +// cudaCheckError( ); #if defined(__SM_35__) k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>> #else k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, - sizeof(real) * MATVEC_BLOCK_SIZE>>> + sizeof(real) * MATVEC_BLOCK_SIZE >>> #endif - (*H, a, b, n); + ( *H, a, b, n ); cudaThreadSynchronize( ); cudaCheckError( ); } -int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, - rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, - simulation_data *data ) +int Cuda_dual_CG( reax_system *system, control_params *control, storage *workspace, + sparse_matrix *H, rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, + FILE *fout, simulation_data *data ) { int i, n, matvecs, scale; // int j, N; @@ -651,46 +651,14 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } #endif - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer ); -//#endif - -// check_zeros_device( x, system->N, "x" ); - copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" ); Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" ); -// check_zeros_device( x, system->N, "x" ); - -// compare_rvec2 (workspace->x, x, N, "x"); -// if (data->step > 0) { -// compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b"); -// compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x"); -// -// exit (0); -// } - - -//#ifdef __CUDA_DEBUG__ -// dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N ); -//#endif //originally we were using only H->n which was system->n (init_md.c) //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap); - Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap ); -// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - -// if (data->step > 0) exit (0); - - // tryQEq - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); -//#endif - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:q2:get" ); Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); @@ -704,63 +672,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } #endif -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // residual -// workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0]; -// workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1]; -// // apply diagonal pre-conditioner -// workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; -// workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; -// } -//#endif - Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n ); -// compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2"); -// compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2"); - - /* norm of b */ -//#ifdef __CUDA_DEBUG__ -// my_sum[0] = my_sum[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_sum[0] += SQR( 
workspace->b[j][0] ); -// my_sum[1] += SQR( workspace->b[j][1] ); -// } -// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); -//#endif - my_sum[0] = 0; my_sum[1] = 0; Cuda_Norm( b, n, my_sum ); -// fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]); - MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm ); b_norm[0] = SQRT( norm_sqr[0] ); b_norm[1] = SQRT( norm_sqr[1] ); - //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] ); /* dot product: r.d */ -//#ifdef __CUDA_DEBUG__ -// my_dot[0] = my_dot[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0]; -// my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1]; -// } -// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - my_dot[0] = 0; my_dot[1] = 0; Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n ); - -// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] ); MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -768,55 +696,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } #endif - for ( i = 1; i < 1000; ++i ) + for ( i = 1; i < control->cm_solver_max_iters; ++i ) { - //MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer); -//#endif - copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "cg:d2:get" ); Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer ); copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "cg:d2:put" ); - //print_device_rvec2 (dev_workspace->d2, N); - -//#ifdef __CUDA_DEBUG__ -// dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N ); -//#endif - Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N, system->total_cap ); - /* - fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n ); - fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n"); - print_sparse_matrix_host (&workspace->H); - fprintf (stderr, " ******* HOST Vector ***************\n"); - print_host_rvec2 (workspace->d2, system->N); - fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n"); - print_sparse_matrix (&dev_workspace->H); - fprintf (stderr, " ******* Device Vector ***************\n"); - print_device_rvec2 (dev_workspace->d2, system->N); - */ - //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - - // tryQEq - // MVAPICH2 -//#ifdef __CUDA_DEBUG__ -// Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker); -//#endif - copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "cg:q2:get" ); Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker ); copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "cg:q2:put" ); -// compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2"); - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -825,61 +721,23 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, #endif /* dot product: d.q */ -//#ifdef __CUDA_DEBUG__ -// my_dot[0] = my_dot[1] = 0; -// for( j = 0; j < n; ++j ) { -// my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0]; -// my_dot[1] += workspace->d2[j][1] * 
workspace->q2[j][1]; -// } -// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = my_dot[1] = 0; + my_dot[0] = 0; + my_dot[1] = 0; Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n); - //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] ); alpha[0] = sig_new[0] / tmp[0]; alpha[1] = sig_new[1] / tmp[1]; my_dot[0] = 0; my_dot[1] = 0; -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // update x -// workspace->x[j][0] += alpha[0] * workspace->d2[j][0]; -// workspace->x[j][1] += alpha[1] * workspace->d2[j][1]; -// // update residual -// workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; -// workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; -// // apply diagonal pre-conditioner -// workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; -// workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; -// // dot product: r.p -// my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0]; -// my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1]; -// } -// fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] ); -//#endif - - my_dot[0] = 0; - my_dot[1] = 0; Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot ); - //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] ); - -// compare_rvec2 (workspace->x, dev_workspace->x, N, "x"); -// compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2"); -// compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2"); - sig_old[0] = sig_new[0]; sig_old[1] = sig_new[1]; MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm ); - //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] ); - #if defined(CG_PERFORMANCE) if ( system->my_rank == MASTER_NODE ) { @@ -895,64 +753,30 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, beta[0] = sig_new[0] / sig_old[0]; beta[1] = sig_new[1] / sig_old[1]; -//#ifdef __CUDA_DEBUG__ -// for( j = 0; j < system->n; ++j ) { -// // d = p + beta * d -// workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0]; -// workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1]; -// } -//#endif - Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta, dev_workspace->d2, system->n ); - -// compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2"); } - if ( SQRT(sig_new[0]) / b_norm[0] <= tol ) { - //for( j = 0; j < n; ++j ) - // workspace->t[j] = workspace->x[j][1]; - //fprintf (stderr, "Getting started with Cuda_CG1 \n"); - Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n ); - //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t"); - //compare_array (workspace->t, dev_workspace->t, system->n, "t"); - - matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t, - mpi_data, fout ); - - //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs ); - //for( j = 0; j < n; ++j ) - // workspace->x[j][1] = workspace->t[j]; + matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_t, tol, dev_workspace->t, + mpi_data ); Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n ); } else if ( SQRT(sig_new[1]) / b_norm[1] <= tol ) { - //for( j = 0; j < n; ++j ) - // workspace->s[j] = workspace->x[j][0]; - Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n ); - //compare_array (workspace->s, dev_workspace->s, 
system->n, "s"); - //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s"); - - //fprintf (stderr, "Getting started with Cuda_CG2 \n"); - - matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s, - mpi_data, fout ); - - //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs ); - //for( j = 0; j < system->n; ++j ) - // workspace->x[j][0] = workspace->s[j]; + matvecs = Cuda_CG( system, control, workspace, H, dev_workspace->b_s, tol, dev_workspace->s, + mpi_data ); Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n ); } - if ( i >= 1000 ) + if ( i >= control->cm_solver_max_iters ) { fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n", system->my_rank, i ); @@ -972,8 +796,8 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, } -int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real - *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout ) +int Cuda_CG( reax_system *system, control_params *control, storage *workspace, + sparse_matrix *H, real *b, real tol, real *x, mpi_datatypes* mpi_data ) { int i, scale; // int j; @@ -983,25 +807,19 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real scale = sizeof(real) / sizeof(void); - /* x is on the device */ - //MVAPICH2 memset( spad, 0, sizeof(real) * system->total_cap ); copy_host_device( spad, x, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:x:get" ); Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); - //MVAPICH2 copy_host_device( spad, x, sizeof(real) * system->total_cap, cudaMemcpyHostToDevice, "cuda_cg:x:put" ); Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap ); - // tryQEq - // MVAPICH2 copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); - //MVAPICH2 copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, cudaMemcpyHostToDevice, "cuda_cg:q:put" ); @@ -1014,8 +832,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real Cuda_Vector_Sum( dev_workspace->r , 1., b, -1., dev_workspace->q, system->n ); - //for( j = 0; j < system->n; ++j ) - // workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition + Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r, dev_workspace->Hdia_inv, system->n ); @@ -1039,9 +856,8 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real } #endif - for ( i = 1; i < 1000 && SQRT(sig_new) / b_norm > tol; ++i ) + for ( i = 1; i < control->cm_solver_max_iters && SQRT(sig_new) / b_norm > tol; ++i ) { - //MVAPICH2 copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:d:get" ); Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer ); @@ -1050,7 +866,6 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap ); - //tryQEq copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap, cudaMemcpyDeviceToHost, "cuda_cg:q:get" ); Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker ); @@ -1078,9 +893,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n ); Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 
1.0, dev_workspace->r, system->n ); - /* pre-conditioning */ - //for( j = 0; j < system->n; ++j ) - // workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r, dev_workspace->Hdia_inv, system->n ); @@ -1092,7 +905,6 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n, cudaMemcpyDeviceToHost, "cuda_cg:p:get" ); sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world ); - //fprintf (stderr, "Device: sig_new: %f \n", sig_new ); beta = sig_new / sig_old; Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta, diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.h b/PG-PuReMD/src/cuda/cuda_lin_alg.h index aa31c126642b8eb560256390c89698df99b34a73..768c0a36a1582a585f5d45161844a23640a68212 100644 --- a/PG-PuReMD/src/cuda/cuda_lin_alg.h +++ b/PG-PuReMD/src/cuda/cuda_lin_alg.h @@ -51,11 +51,11 @@ void Cuda_Dual_Matvec( sparse_matrix *, rvec2 *, rvec2 *, int , int ); void Cuda_Matvec( sparse_matrix *, real *, real *, int , int ); -int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*, +int Cuda_dual_CG( reax_system*, control_params*, storage*, sparse_matrix*, rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * ); -int Cuda_CG( reax_system*, storage*, sparse_matrix*, - real*, real, real*, mpi_datatypes*, FILE* ); +int Cuda_CG( reax_system*, control_params*, storage*, sparse_matrix*, + real*, real, real*, mpi_datatypes* ); #ifdef __cplusplus } diff --git a/PG-PuReMD/src/cuda/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu index 01bd3c8199a76144703738188713190e02b35e9f..49224a06aec4157eed74a977ec4b99cf22c5cdc7 100644 --- a/PG-PuReMD/src/cuda/cuda_reduction.cu +++ b/PG-PuReMD/src/cuda/cuda_reduction.cu @@ -662,8 +662,8 @@ CUDA_GLOBAL void k_norm_rvec2( const rvec2 *input, rvec2 *per_block_results, } -CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res, - const size_t n) +CUDA_GLOBAL void k_dot_rvec2( const rvec2 *a, rvec2 *b, rvec2 *res, + const size_t n ) { #if defined(__SM_35__) extern __shared__ rvec2 my_dot2[]; @@ -746,11 +746,13 @@ CUDA_GLOBAL void k_dot_rvec2(const rvec2 *a, rvec2 *b, rvec2 *res, //vector functions ////////////////////////////////////////////////// CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, - int k ) + int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if( i >= k ) + if ( i >= n ) { return; } @@ -759,11 +761,13 @@ CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, } -CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k ) +CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if( i >= k ) + if ( i >= n ) { return; } @@ -772,11 +776,13 @@ CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k ) } -CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k ) +CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; - if( i >= k ) + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= n ) { return; } @@ -787,12 +793,13 @@ CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k ) CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *dest, rvec2 *a, - real beta0, real beta1, - rvec2 *b, int n ) + real 
beta0, real beta1, rvec2 *b, int n ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; - if( i >= n ) + if ( i >= n ) { return; } diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c index d3e295cf392f45f400596f041f1995f93ed2e02f..91fbfd1792f479c8810b783877a837c7d3ec93c6 100644 --- a/PG-PuReMD/src/forces.c +++ b/PG-PuReMD/src/forces.c @@ -800,7 +800,6 @@ int Init_Forces( reax_system *system, control_params *control, { /* put ghost atoms to an infinite distance */ workspace->bond_mark[i] = 1000; - //workspace->done_after[i] = Start_Index( i, far_nbrs ); } H = &(workspace->H); //MATRIX CHANGES @@ -969,8 +968,6 @@ int Init_Forces( reax_system *system, control_params *control, else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) { workspace->bond_mark[i] = workspace->bond_mark[j] + 1; - //if( workspace->bond_mark[i] == 1000 ) - // workspace->done_after[i] = pj; } //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n", @@ -1005,7 +1002,6 @@ int Init_Forces( reax_system *system, control_params *control, /*for( i = system->n; i < system->N; ++i ) { start_i = Start_Index(i, far_nbrs); - end_i = workspace->done_after[i]; if( workspace->bond_mark[i] >= 2 && start_i < end_i ) { atom_i = &(system->my_atoms[i]); @@ -1088,7 +1084,6 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control, for ( i = system->n; i < system->N; ++i ) { workspace->bond_mark[i] = 1000; // put ghost atoms to an infinite distance - //workspace->done_after[i] = Start_Index( i, far_nbrs ); } num_bonds = 0; @@ -1214,15 +1209,13 @@ int Init_Forces_No_Charges( reax_system *system, control_params *control, ++btop_i; if ( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 ) + { workspace->bond_mark[j] = workspace->bond_mark[i] + 1; + } else if ( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) { workspace->bond_mark[i] = workspace->bond_mark[j] + 1; - //if( workspace->bond_mark[i] == 1000 ) - // workspace->done_after[i] = pj; } - //fprintf( stdout, "%d%d - %d(%d) %d(%d)\n", - // i , j, i, workspace->bond_mark[i], j, workspace->bond_mark[j] ); } } } diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c index 3d27ecf6ec83c3d1da8b298e37301c6944bc7c6a..d071f0dd3e5175ca55bc00d36a266344b41bd059 100644 --- a/PG-PuReMD/src/init_md.c +++ b/PG-PuReMD/src/init_md.c @@ -117,16 +117,16 @@ void Generate_Initial_Velocities( reax_system *system, real T ) rvec_Scale( system->my_atoms[i].v, 1. 
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index e9ce62e72411dd1e7739d79b3a2875476b63ed48..5de64a8a8d242c8d5968594878d7add6b64c6ac6 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -28,7 +28,7 @@
 #include "tool_box.h"
 #include "vector.h"

-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) && defined(DEBUG)
   #include "cuda/cuda_validation.h"
 #endif

@@ -44,7 +44,8 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )

     for ( i = 0; i < N; ++i )
     {
-        b[i][0] = b[i][1] = 0;
+        b[i][0] = 0;
+        b[i][1] = 0;
     }

     /* perform multiplication */
@@ -76,14 +77,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
         simulation_data *data )
 {
-    int i, j, n, N, matvecs, scale;
+    int i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
     rvec2 my_sum, norm_sqr, b_norm, my_dot;
     rvec2 sig_old, sig_new;
     MPI_Comm comm;

-    int a;
-
     n = system->n;
     N = system->N;
     comm = mpi_data->world;
@@ -99,14 +98,14 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     }
 #endif

-#ifdef HAVE_CUDA
-    check_zeros_host( x, system->N, "x" );
+#if defined(HAVE_CUDA) && defined(DEBUG)
+    check_zeros_host( x, N, "x" );
 #endif

     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );

-#ifdef HAVE_CUDA
-    check_zeros_host( x, system->N, "x" );
+#if defined(HAVE_CUDA) && defined(DEBUG)
+    check_zeros_host( x, N, "x" );
 #endif

     dual_Sparse_MatVec( H, x, workspace->q2, N );
@@ -121,7 +120,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     Update_Timing_Info( &t_start, &matvec_time );
 #endif

-    for ( j = 0; j < system->n; ++j )
+    for ( j = 0; j < n; ++j )
     {
         /* residual */
         workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
@@ -168,7 +167,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
     for ( i = 1; i < 300; ++i )
     {
         Dist(system, mpi_data, workspace->d2, mpi_data->mpi_rvec2, scale, rvec2_packer);
-        //print_host_rvec2 (workspace->d2, N);
+        //print_host_rvec2( workspace->d2, N );

         dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );

@@ -193,7 +192,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         alpha[0] = sig_new[0] / tmp[0];
         alpha[1] = sig_new[1] / tmp[1];
         my_dot[0] = my_dot[1] = 0;
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             /* update x */
             x[j][0] += alpha[0] * workspace->d2[j][0];
@@ -225,7 +224,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
-        for ( j = 0; j < system->n; ++j )
+        for ( j = 0; j < n; ++j )
         {
             /* d = p + beta * d */
             workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
@@ -240,7 +239,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         workspace->t[j] = workspace->x[j][1];
     }
     matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
-            mpi_data, fout );
+            mpi_data );
 #if defined(DEBUG)
     fprintf (stderr, " CG1: iterations --> %d \n", matvecs );
 #endif
@@ -258,13 +257,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
         workspace->s[j] = workspace->x[j][0];
     }
     matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-            mpi_data, fout );
+            mpi_data );
 #if defined(DEBUG)
     fprintf (stderr, " CG2: iterations --> %d \n", matvecs );
 #endif

-    for ( j = 0; j < system->n; ++j )
+    for ( j = 0; j < n; ++j )
     {
         workspace->x[j][0] = workspace->s[j];
     }
@@ -313,7 +312,7 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )

 int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
-        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout)
+        real tol, real *x, mpi_datatypes* mpi_data )
 {
     int i, j, scale;
     real tmp, alpha, beta, b_norm;
@@ -762,7 +761,7 @@ int sCG( reax_system *system, storage *workspace, sparse_matrix *H,

 int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,
-        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
@@ -883,8 +882,8 @@ int GMRES( reax_system *system, storage *workspace, sparse_matrix *H,

 int GMRES_HouseHolder( reax_system *system, storage *workspace,
-        sparse_matrix *H, real *b, real tol, real *x,
-        mpi_datatypes* mpi_data, FILE *fout )
+        sparse_matrix *H, real *b, real tol, real *x,
+        mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, k, itr, N;
     real cc, tmp1, tmp2, temp, bnorm;
diff --git a/PG-PuReMD/src/lin_alg.h b/PG-PuReMD/src/lin_alg.h
index 3663978e4b6cc412a1b9d8fe2cc613443383c3fc..4b7ba2f0368e81d498f93d91e33db074ebddd9ca 100644
--- a/PG-PuReMD/src/lin_alg.h
+++ b/PG-PuReMD/src/lin_alg.h
@@ -36,10 +36,10 @@ int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes*, FILE* );

 int dual_CG( reax_system*, storage*, sparse_matrix*,
-        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );

 int CG( reax_system*, storage*, sparse_matrix*,
-        real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes* );

 int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
         sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 9ace1c3276e9598f40d5567460f9c5a57db0955f..da9074127fec719403a9ea0e892961314c7a7b29 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -49,9 +49,12 @@
   #include "cuda/cuda_reset_tools.h"
   #include "cuda/cuda_system_props.h"
   #include "cuda/cuda_utils.h"
-  #include "cuda/cuda_validation.h"
+  #if defined(DEBUG)
+    #include "cuda/cuda_validation.h"
+  #endif
 #endif

+
 evolve_function Evolve;
 evolve_function Cuda_Evolve;
 LR_lookup_table *LR;
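As a reading aid for the dual_CG()/CG() hunks in lin_alg.c above, the sketch below spells out one iteration of the diagonally preconditioned conjugate gradient recurrence for a single right-hand side; dual_CG() runs the same recurrence for the two QEq right-hand sides in lockstep on rvec2 data. This is a schematic under the assumption of a diagonal preconditioner, with the MPI Dist()/Parallel_Dot() exchanges, the convergence test against tol, and the timing hooks omitted; "real", dot(), and cg_step() are illustrative helpers, not symbols from this patch.

typedef double real;

static real dot( const real * const u, const real * const v, const int n )
{
    int j;
    real sum;

    sum = 0.0;
    for ( j = 0; j < n; ++j )
    {
        sum += u[j] * v[j];
    }

    return sum;
}

/* q must already hold H * d (the sparse matrix-vector product);
 * sig holds the current r.p value and the updated value is returned */
static real cg_step( const real * const q, const real * const Hdia_inv,
        real * const x, real * const r, real * const p, real * const d,
        const real sig, const int n )
{
    int j;
    real tmp, alpha, beta, sig_new;

    tmp = dot( d, q, n );
    alpha = sig / tmp;

    for ( j = 0; j < n; ++j )
    {
        x[j] += alpha * d[j];           /* update solution */
        r[j] -= alpha * q[j];           /* update residual */
        p[j] = r[j] * Hdia_inv[j];      /* apply diagonal preconditioner */
    }

    sig_new = dot( r, p, n );
    beta = sig_new / sig;

    for ( j = 0; j < n; ++j )
    {
        d[j] = p[j] + beta * d[j];      /* new search direction */
    }

    return sig_new;
}

In dual_CG() above, tmp and sig_new correspond to the Parallel_Dot() reductions over all MPI ranks, and tol bounds the relative residual norm of the two solves.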
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index a6d16f8ef08ef7c40d2770d2f466b2089a468d1a..80a4dafc7b12dc1ca06fcb63dbd090f1b8bef638 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -277,24 +277,6 @@
 /******************* ENUMERATIONS *************************/

-/* geometry file format */
-enum geo_formats
-{
-    CUSTOM = 0,
-    PDB = 1,
-    ASCII_RESTART = 2,
-    BINARY_RESTART = 3,
-    GF_N = 4,
-};
-
-/* restart file format */
-enum restart_formats
-{
-    WRITE_ASCII = 0,
-    WRITE_BINARY = 1,
-    RF_N = 2,
-};
-
 /* ensemble type */
 enum ensembles
 {
@@ -369,6 +351,57 @@ enum errors
     RUNTIME_ERROR = -19,
 };

+/* restart file format */
+enum restart_formats
+{
+    WRITE_ASCII = 0,
+    WRITE_BINARY = 1,
+    RF_N = 2,
+};
+
+/* geometry file format */
+enum geo_formats
+{
+    CUSTOM = 0,
+    PDB = 1,
+    ASCII_RESTART = 2,
+    BINARY_RESTART = 3,
+    GF_N = 4,
+};
+
+enum charge_method
+{
+    QEQ_CM = 0,
+    EE_CM = 1,
+    ACKS2_CM = 2,
+};
+
+enum solver
+{
+    GMRES_S = 0,
+    GMRES_H_S = 1,
+    CG_S = 2,
+    SDM_S = 3,
+};
+
+enum pre_comp
+{
+    NONE_PC = 0,
+    DIAG_PC = 1,
+    ICHOLT_PC = 2,
+    ILU_PAR_PC = 3,
+    ILUT_PAR_PC = 4,
+    ILU_SUPERLU_MT_PC = 5,
+};
+
+enum pre_app
+{
+    TRI_SOLVE_PA = 0,
+    TRI_SOLVE_LEVEL_SCHED_PA = 1,
+    TRI_SOLVE_GC_PA = 2,
+    JACOBI_ITER_PA = 3,
+};
+
 /* ??? */
 enum exchanges
 {
@@ -1126,8 +1159,11 @@ typedef struct
     int N;
     /* num. atoms within simulation */
     int bigN;
+    /* dimension of sparse charge method matrix */
+    int N_cm;
     /* num. hydrogen atoms */
     int numH;
+    /* num. hydrogen atoms (GPU) */
     int *d_numH;
     /**/
     int local_cap;
@@ -1274,8 +1310,9 @@ typedef struct
     /* format of geometry input file */
     int geo_format;
     /* format of restart file */
-    int restart;
+    int restart;
+    /**/
     int restrict_bonds;

    /* flag to control if center of mass velocity is removed */
     int remove_CoM_vel;
@@ -1313,18 +1350,40 @@ typedef struct
     /* flag to control if force computations are tablulated */
     int tabulate;

+    /* method used to compute atomic charges (QEq, EEM, or ACKS2) */
+    unsigned int charge_method;
     /* frequency (in terms of simulation time steps) at which to
     * re-compute atomic charge distribution */
     int charge_freq;
+    /* iterative linear solver used for the charge method */
+    unsigned int cm_solver_type;
+    /* net system charge */
+    real cm_q_net;
+    /* max. iterations of the charge method solver */
+    unsigned int cm_solver_max_iters;
+    /* number of inner iterations of GMRES before restarting */
+    unsigned int cm_solver_restart;
     /* error tolerance of solution produced by charge distribution
     * sparse iterative linear solver */
-    real q_err;
+    real cm_solver_q_err;
+    /* scaling factor for the cut-off distance used to sparsify the charge matrix */
+    real cm_domain_sparsity;
+    /* flag to control if the charge matrix is sparsified */
+    unsigned int cm_domain_sparsify_enabled;
+    /* method used to compute the preconditioner, if applicable */
+    unsigned int cm_solver_pre_comp_type;
     /* frequency (in terms of simulation time steps) at which to recompute
     * incomplete factorizations */
-    int refactor;
+    unsigned int cm_solver_pre_comp_refactor;
     /* drop tolerance of incomplete factorization schemes (ILUT, ICHOLT, etc.)
     * used for preconditioning the iterative linear solver used in charge distribution */
-    real droptol;
+    real cm_solver_pre_comp_droptol;
+    /* number of sweeps used to compute the preconditioner (ILU_PAR) */
+    unsigned int cm_solver_pre_comp_sweeps;
+    /* method used to apply the preconditioner, if applicable */
+    unsigned int cm_solver_pre_app_type;
+    /* number of Jacobi iterations used when applying the preconditioner */
+    unsigned int cm_solver_pre_app_jacobi_iters;

     /* initial temperature of simulation, in Kelvin */
     real T_init;
@@ -1860,8 +1919,6 @@ typedef struct
     rvec *dDeltap_self;
     /**/
     int *bond_mark;
-    /**/
-    int *done_after;

     /* charge matrix storage */
     /* charge matrix */
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index af27e1cb04c9061f3bb52cb4a290cc999c93deac..14560661f57d23918774095a4df4c2d3a41fae4b 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -273,7 +273,7 @@ int Write_Header( reax_system *system, control_params *control,
             control->thb_cut );
     strncat( out_control->buffer, out_control->line, HEADER_LINE_LEN + 1 );

-    sprintf( out_control->line, SCI_LINE, "QEq_tolerance:", control->q_err );
+    sprintf( out_control->line, SCI_LINE, "QEq_tolerance:", control->cm_solver_q_err );
     strncat( out_control->buffer, out_control->line, HEADER_LINE_LEN + 1 );

     /* temperature controls */
diff --git a/environ/parallel_control b/environ/parallel_control
index 3b608f1b0ddccdaf224dd254b5266ee33a148c82..4f6d75b2e1a4f572dc3e8dc03cc63eeb4896780c 100644
--- a/environ/parallel_control
+++ b/environ/parallel_control
@@ -18,8 +18,20 @@ bond_graph_cutoff           0.3    ! bond strength cutoff for bond
 thb_cutoff                  0.001  ! cutoff value for three body interactions (Angstroms)
 hbond_cutoff                7.50   ! cutoff distance for hydrogen bond interactions (Angstroms)

-charge_freq                 1      ! frequency (sim step) at which atomic charges are computed
-q_err                       1e-6   ! norm of the relative residual in QEq solve
+charge_method                   0     ! charge method: 0 = QEq, 1 = EEM, 2 = ACKS2
+charge_freq                     1     ! frequency (sim step) at which atomic charges are computed
+cm_q_net                        0.0   ! net system charge
+cm_solver_type                  2     ! iterative linear solver for charge method: 0 = GMRES, 1 = GMRES_H, 2 = CG, 3 = SDM
+cm_solver_max_iters             1000  ! max solver iterations
+cm_solver_restart               100   ! inner iterations of GMRES before restarting
+cm_solver_q_err                 1e-6  ! relative residual norm threshold used in solver
+cm_domain_sparsity              1.0   ! scalar for scaling cut-off distance, used to sparsify charge matrix (between 0.0 and 1.0)
+cm_solver_pre_comp_type         1     ! method used to compute preconditioner, if applicable
+cm_solver_pre_comp_refactor     1000  ! number of steps before recomputing preconditioner
+cm_solver_pre_comp_droptol      0.0   ! threshold tolerance for dropping values in preconditioner computation, if applicable
+cm_solver_pre_comp_sweeps       3     ! number of sweeps used to compute preconditioner (ILU_PAR)
+cm_solver_pre_app_type          1     ! method used to apply preconditioner
+cm_solver_pre_app_jacobi_iters  50    ! number of Jacobi iterations used for applying preconditioner, if applicable

 temp_init                   0.01   ! desired initial temperature of the simulated system
 temp_final                  300.0  ! desired final temperature of the simulated system
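The new parallel_control keys above line up one-to-one with the charge-method fields and enumerations introduced in reax_types.h (charge_method maps to enum charge_method, cm_solver_type to enum solver, and so on). The fragment below only illustrates that mapping; the real control-file parser lives in control.c, which this diff does not touch, so read_cm_option() and the trimmed-down cm_params struct are hypothetical stand-ins.

#include <stdlib.h>
#include <string.h>

enum charge_method { QEQ_CM = 0, EE_CM = 1, ACKS2_CM = 2 };
enum solver { GMRES_S = 0, GMRES_H_S = 1, CG_S = 2, SDM_S = 3 };

typedef struct
{
    unsigned int charge_method;         /* charge_method key */
    unsigned int cm_solver_type;        /* cm_solver_type key */
    unsigned int cm_solver_max_iters;   /* cm_solver_max_iters key */
    double cm_solver_q_err;             /* cm_solver_q_err key */
} cm_params;

/* returns 1 if the key was recognized, 0 otherwise */
static int read_cm_option( const char * const key, const char * const val,
        cm_params * const cp )
{
    if ( strcmp( key, "charge_method" ) == 0 )
    {
        /* 0 = QEQ_CM, 1 = EE_CM, 2 = ACKS2_CM */
        cp->charge_method = (unsigned int) atoi( val );
    }
    else if ( strcmp( key, "cm_solver_type" ) == 0 )
    {
        /* 0 = GMRES_S, 1 = GMRES_H_S, 2 = CG_S, 3 = SDM_S */
        cp->cm_solver_type = (unsigned int) atoi( val );
    }
    else if ( strcmp( key, "cm_solver_max_iters" ) == 0 )
    {
        cp->cm_solver_max_iters = (unsigned int) atoi( val );
    }
    else if ( strcmp( key, "cm_solver_q_err" ) == 0 )
    {
        cp->cm_solver_q_err = atof( val );
    }
    else
    {
        return 0;   /* leave every other key to the existing parser */
    }

    return 1;
}

With the sample values above, charge_method ends up as QEQ_CM and cm_solver_type as CG_S.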
diff --git a/sPuReMD/src/lin_alg.c b/sPuReMD/src/lin_alg.c
index 28b7c9560d4ab5f11e8161f8a9ff867ab5f3dc1a..208791b8c297d0083489ca9faf837a746bdc0411 100644
--- a/sPuReMD/src/lin_alg.c
+++ b/sPuReMD/src/lin_alg.c
@@ -45,6 +45,7 @@ unsigned int *top = NULL;
 unsigned int *color = NULL;
 unsigned int *to_color = NULL;
 unsigned int *conflict = NULL;
+unsigned int *conflict_cnt = NULL;
 unsigned int *temp_ptr;
 unsigned int *recolor = NULL;
 unsigned int recolor_cnt;
@@ -530,8 +531,16 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
 {
 #define MAX_COLOR (500)
     int i, pj, v;
-    unsigned int temp;
-    int *fb_color;
+    unsigned int temp, recolor_cnt_local, *conflict_local;
+    int tid, num_thread, *fb_color;
+
+#ifdef _OPENMP
+    tid = omp_get_thread_num();
+    num_thread = omp_get_num_threads();
+#else
+    tid = 0;
+    num_thread = 1;
+#endif

     #pragma omp master
     {
@@ -558,7 +567,8 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
         }
     }

-    if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL )
+    if ( (fb_color = (int*) malloc(sizeof(int) * MAX_COLOR)) == NULL ||
+            (conflict_local = (unsigned int*) malloc(sizeof(unsigned int) * A->n)) == NULL )
     {
         fprintf( stderr, "not enough memory for graph coloring. terminating.\n" );
         exit( INSUFFICIENT_MEMORY );
@@ -594,29 +604,63 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )
         }

         /* determine if recoloring required */
-        //TODO: switch to reduction on recolor_cnt (+) via parallel scan through recolor
+        temp = recolor_cnt;
+        recolor_cnt_local = 0;
+
+        #pragma omp barrier
+
         #pragma omp master
         {
-            temp = recolor_cnt;
             recolor_cnt = 0;
+        }
+
+        #pragma omp barrier

-            for ( i = 0; i < temp; ++i )
-            {
-                v = to_color[i];
+        #pragma omp for schedule(static)
+        for ( i = 0; i < temp; ++i )
+        {
+            v = to_color[i];

-                /* search for color conflicts with adjacent vertices */
-                for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+            /* search for color conflicts with adjacent vertices */
+            for ( pj = A->start[v]; pj < A->start[v + 1]; ++pj )
+            {
+                if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
                 {
-                    if ( color[v] == color[A->j[pj]] && v > A->j[pj] )
-                    {
-                        conflict[recolor_cnt] = v;
-                        color[v] = 0;
-                        ++recolor_cnt;
-                        break;
-                    }
+                    conflict_local[recolor_cnt_local] = v;
+                    ++recolor_cnt_local;
+                    break;
                 }
             }
+        }
+
+        /* count thread-local conflicts and compute offsets for copying into shared buffer */
+        conflict_cnt[tid + 1] = recolor_cnt_local;
+
+        #pragma omp barrier
+
+        #pragma omp master
+        {
+            conflict_cnt[0] = 0;
+            for ( i = 1; i < num_thread + 1; ++i )
+            {
+                conflict_cnt[i] += conflict_cnt[i - 1];
+            }
+            recolor_cnt = conflict_cnt[num_thread];
+        }
+
+        #pragma omp barrier

+        /* copy thread-local conflicts into shared buffer */
+        for ( i = 0; i < recolor_cnt_local; ++i )
+        {
+            conflict[conflict_cnt[tid] + i] = conflict_local[i];
+            color[conflict_local[i]] = 0;
+        }
+
+        #pragma omp barrier
+
+        #pragma omp master
+        {
             temp_ptr = to_color;
             to_color = conflict;
             conflict = temp_ptr;
@@ -625,16 +669,9 @@ void graph_coloring( const sparse_matrix * const A, const TRIANGULARITY tri )

         #pragma omp barrier
     }

+    free( conflict_local );
     free( fb_color );

-//#if defined(DEBUG)
-//    #pragma omp master
-//    {
-//        for ( i = 0; i < A->n; ++i )
-//            printf("Vertex: %5d, Color: %5d\n", i, color[i] );
-//    }
-//#endif
-
     #pragma omp barrier
 }

@@ -873,12 +910,24 @@ void permute_matrix( sparse_matrix * const LU, const TRIANGULARITY tri )
  */
 sparse_matrix * setup_graph_coloring( sparse_matrix * const H )
 {
+    int num_thread;
+
     if ( color == NULL )
     {
+#ifdef _OPENMP
+        #pragma omp parallel
+        {
+            num_thread = omp_get_num_threads();
+        }
+#else
+        num_thread = 1;
+#endif
+
         /* internal storage for graph coloring (global to facilitate simultaneous access to OpenMP threads) */
         if ( (color = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (to_color =(unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (conflict = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
+                (conflict_cnt = (unsigned int*) malloc(sizeof(unsigned int) * (num_thread + 1))) == NULL ||
                 (recolor = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
                 (color_top = (unsigned int*) malloc(sizeof(unsigned int) * (H->n + 1))) == NULL ||
                 (permuted_row_col = (unsigned int*) malloc(sizeof(unsigned int) * H->n)) == NULL ||
diff --git a/sPuReMD/src/print_utils.c b/sPuReMD/src/print_utils.c
index 9ff0f571be830e29591aaeee2695b8f441077fda..2f14e38ff616e6bcbf40f0de61dab90712cda7a8 100644
--- a/sPuReMD/src/print_utils.c
+++ b/sPuReMD/src/print_utils.c
@@ -584,7 +584,7 @@ void Output_Results( reax_system *system, control_params *control,
     e_pol = 0.0;

 #ifdef _OPENMP
-    #pragma omp parallel for default(none) private(q, type_i,) shared(system) \
+    #pragma omp parallel for default(none) private(q, type_i) shared(system) \
         reduction(+: e_pol) schedule(static)
 #endif
     for ( i = 0; i < system->N; i++ )
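The graph_coloring() rework above replaces the master-only conflict scan with a three-phase pattern: each thread gathers conflicts into private storage, per-thread counts are prefix-summed into offsets (conflict_cnt), and each thread then copies its block into the shared conflict array. Below is a self-contained sketch of that pattern, assuming the caller sizes cnt with (num. threads + 1) entries; gather_flagged() and its arguments are illustrative, not sPuReMD symbols.

#include <stdlib.h>
#ifdef _OPENMP
  #include <omp.h>
#endif

/* Gather the indices i with flag[i] != 0 into out[].
 * cnt must have room for (num. threads + 1) entries; on return,
 * *total holds the number of gathered indices. */
void gather_flagged( const int * const flag, const int n,
        unsigned int * const out, unsigned int * const cnt,
        unsigned int * const total )
{
    #pragma omp parallel
    {
        int i, tid, num_thread;
        unsigned int local_cnt;
        unsigned int *local;

#ifdef _OPENMP
        tid = omp_get_thread_num();
        num_thread = omp_get_num_threads();
#else
        tid = 0;
        num_thread = 1;
#endif
        local_cnt = 0;
        local = (unsigned int *) malloc( sizeof(unsigned int) * n );

        /* phase 1: every thread gathers its share into private storage */
        #pragma omp for schedule(static)
        for ( i = 0; i < n; ++i )
        {
            if ( flag[i] != 0 )
            {
                local[local_cnt] = (unsigned int) i;
                ++local_cnt;
            }
        }

        /* phase 2: publish per-thread counts, then prefix-sum them into offsets */
        cnt[tid + 1] = local_cnt;

        #pragma omp barrier

        #pragma omp master
        {
            cnt[0] = 0;
            for ( i = 1; i < num_thread + 1; ++i )
            {
                cnt[i] += cnt[i - 1];
            }
            *total = cnt[num_thread];
        }

        #pragma omp barrier

        /* phase 3: every thread copies into its disjoint slice of out[] */
        for ( i = 0; i < (int) local_cnt; ++i )
        {
            out[cnt[tid] + i] = local[i];
        }

        free( local );
    }
}

The barriers mirror the ones added in graph_coloring(): counts must be published before the master computes offsets, and the offsets must be final before any thread writes into the shared buffer.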